From ff9721111441e69b4a276a34c757476b625213c6 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Wed, 10 Jul 2019 00:57:59 +0200 Subject: jit: thumb block transfer working also pc and sp relative loads and some refactoring --- src/ARMJIT_RegisterCache.h | 136 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 src/ARMJIT_RegisterCache.h (limited to 'src/ARMJIT_RegisterCache.h') diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h new file mode 100644 index 0000000..04c1eda --- /dev/null +++ b/src/ARMJIT_RegisterCache.h @@ -0,0 +1,136 @@ +#ifndef ARMJIT_REGCACHE_H +#define ARMJIT_REGCACHE_H + +#include "ARMJIT.h" + +// TODO: replace this in the future +#include "dolphin/BitSet.h" + +#include + +namespace ARMJIT +{ + +template +class RegisterCache +{ +public: + RegisterCache() + {} + + RegisterCache(T* compiler, FetchedInstr instrs[], int instrsCount) + : Compiler(compiler), Instrs(instrs), InstrsCount(instrsCount) + { + for (int i = 0; i < 16; i++) + Mapping[i] = (Reg)-1; + } + + void UnloadRegister(int reg) + { + assert(Mapping[reg] != -1); + + if (DirtyRegs & (1 << reg)) + Compiler->SaveReg(reg, Mapping[reg]); + + DirtyRegs &= ~(1 << reg); + LoadedRegs &= ~(1 << reg); + NativeRegsUsed &= ~(1 << (int)Mapping[reg]); + Mapping[reg] = (Reg)-1; + } + + void LoadRegister(int reg) + { + assert(Mapping[reg] == -1); + for (int i = 0; i < NativeRegsAvailable; i++) + { + Reg nativeReg = NativeRegAllocOrder[i]; + if (!(NativeRegsUsed & (1 << nativeReg))) + { + Mapping[reg] = nativeReg; + NativeRegsUsed |= 1 << (int)nativeReg; + LoadedRegs |= 1 << reg; + + Compiler->LoadReg(reg, nativeReg); + + return; + } + } + + assert("Welp!"); + } + + void Flush() + { + BitSet16 loadedSet(LoadedRegs); + for (int reg : loadedSet) + UnloadRegister(reg); + } + + void Prepare(int i) + { + u16 futureNeeded = 0; + int ranking[16]; + for (int j = 0; j < 16; j++) + ranking[j] = 0; + for (int j = i; j < InstrsCount; j++) + { + BitSet16 regsNeeded((Instrs[j].Info.SrcRegs & ~(1 << 15)) | Instrs[j].Info.DstRegs); + futureNeeded |= regsNeeded.m_val; + for (int reg : regsNeeded) + ranking[reg]++; + } + + // we'll unload all registers which are never used again + BitSet16 neverNeededAgain(LoadedRegs & ~futureNeeded); + for (int reg : neverNeededAgain) + UnloadRegister(reg); + + FetchedInstr Instr = Instrs[i]; + u16 necessaryRegs = (Instr.Info.SrcRegs & ~(1 << 15)) | Instr.Info.DstRegs; + BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); + if (needToBeLoaded != BitSet16(0)) + { + int neededCount = needToBeLoaded.Count(); + BitSet16 loadedSet(LoadedRegs); + while (loadedSet.Count() + neededCount > NativeRegsAvailable) + { + int leastReg = -1; + int rank = 1000; + for (int reg : loadedSet) + { + if (!((1 << reg) & necessaryRegs) && ranking[reg] < rank) + { + leastReg = reg; + rank = ranking[reg]; + } + } + + assert(leastReg != -1); + UnloadRegister(leastReg); + + loadedSet.m_val = LoadedRegs; + } + + for (int reg : needToBeLoaded) + LoadRegister(reg); + } + DirtyRegs |= Instr.Info.DstRegs & ~(1 << 15); + } + + static const Reg NativeRegAllocOrder[]; + static const int NativeRegsAvailable; + + Reg Mapping[16]; + u32 NativeRegsUsed = 0; + u16 LoadedRegs = 0; + u16 DirtyRegs = 0; + + T* Compiler; + + FetchedInstr* Instrs; + int InstrsCount; +}; + +} + +#endif \ No newline at end of file -- cgit v1.2.3 From 5338c28f408382263077b24bce5d5ab62bdf7024 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 8 Sep 2019 14:48:20 +0200 Subject: load register only if needed - do thumb bl long merge in the first step - preparations for better branch jitting --- src/ARMJIT.cpp | 16 ++++++++++++++++ src/ARMJIT.h | 1 + src/ARMJIT_RegisterCache.h | 12 ++++++++---- src/ARMJIT_x64/ARMJIT_Branch.cpp | 12 +++++++----- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 34 ++++++++++++---------------------- src/ARMJIT_x64/ARMJIT_Compiler.h | 2 +- src/ARM_InstrInfo.h | 3 +++ 7 files changed, 48 insertions(+), 32 deletions(-) (limited to 'src/ARMJIT_RegisterCache.h') diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 5d92e47..85cadf3 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -159,6 +159,7 @@ CompiledBlock CompileBlock(ARM* cpu) u32 r15 = cpu->R[15]; cpu->FillPipeline(); u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; + u32 nextInstrAddr[2] = {blockAddr, r15}; do { r15 += thumb ? 2 : 4; @@ -166,6 +167,10 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].SetFlags = 0; instrs[i].Instr = nextInstr[0]; instrs[i].NextInstr[0] = nextInstr[0] = nextInstr[1]; + + instrs[i].Addr = nextInstrAddr[0]; + nextInstrAddr[0] = nextInstrAddr[1]; + nextInstrAddr[1] = r15; if (cpu->Num == 0) { @@ -193,8 +198,19 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].NextInstr[1] = nextInstr[1]; instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr); + if (thumb && instrs[i].Info.Kind == ARMInstrInfo::tk_BL_LONG_2 && i > 0 + && instrs[i - 1].Info.Kind == ARMInstrInfo::tk_BL_LONG_1) + { + instrs[i - 1].Info.Kind = ARMInstrInfo::tk_BL_LONG; + instrs[i - 1].Instr = (instrs[i - 1].Instr & 0xFFFF) | (instrs[i].Instr << 16); + instrs[i - 1].Info.DstRegs = 0xC000; + instrs[i - 1].Info.SrcRegs = 0; + instrs[i - 1].Info.EndBlock = true; + i--; + } i++; + bool canCompile = compiler->CanCompile(thumb, instrs[i - 1].Info.Kind); if (instrs[i - 1].Info.ReadFlags != 0 || !canCompile) floodFillSetFlags(instrs, i - 2, canCompile ? instrs[i - 1].Info.ReadFlags : 0xF); diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 6197695..7e448ef 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -31,6 +31,7 @@ struct FetchedInstr u8 SetFlags; u32 Instr; u32 NextInstr[2]; + u32 Addr; u8 CodeCycles; diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index 04c1eda..fe2f203 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -38,7 +38,7 @@ public: Mapping[reg] = (Reg)-1; } - void LoadRegister(int reg) + void LoadRegister(int reg, bool loadValue) { assert(Mapping[reg] == -1); for (int i = 0; i < NativeRegsAvailable; i++) @@ -50,7 +50,8 @@ public: NativeRegsUsed |= 1 << (int)nativeReg; LoadedRegs |= 1 << reg; - Compiler->LoadReg(reg, nativeReg); + if (loadValue) + Compiler->LoadReg(reg, nativeReg); return; } @@ -66,7 +67,7 @@ public: UnloadRegister(reg); } - void Prepare(int i) + void Prepare(bool thumb, int i) { u16 futureNeeded = 0; int ranking[16]; @@ -111,8 +112,11 @@ public: loadedSet.m_val = LoadedRegs; } + BitSet16 needValueLoaded(needToBeLoaded); + if (thumb || Instr.Cond() >= 0xE) + needValueLoaded = BitSet16(Instr.Info.SrcRegs); for (int reg : needToBeLoaded) - LoadRegister(reg); + LoadRegister(reg, needValueLoaded[reg]); } DirtyRegs |= Instr.Info.DstRegs & ~(1 << 15); } diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index c0a8f1f..cc7a3c4 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -271,15 +271,17 @@ void Compiler::T_Comp_BL_LONG_2() Comp_JumpTo(RSCRATCH); } -void Compiler::T_Comp_BL_Merged(FetchedInstr part1) +void Compiler::T_Comp_BL_Merged() { - assert(part1.Info.Kind == ARMInstrInfo::tk_BL_LONG_1); Comp_AddCycles_C(); - u32 target = (R15 - 2) + ((s32)((part1.Instr & 0x7FF) << 21) >> 9); - target += (CurInstr.Instr & 0x7FF) << 1; + R15 += 2; - if (Num == 1 || CurInstr.Instr & (1 << 12)) + u32 upperPart = CurInstr.Instr >> 16; + u32 target = (R15 - 2) + ((s32)((CurInstr.Instr & 0x7FF) << 21) >> 9); + target += (upperPart & 0x7FF) << 1; + + if (Num == 1 || upperPart & (1 << 12)) target |= 1; MOV(32, MapReg(14), Imm32((R15 - 2) | 1)); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index d585f39..d8ce1aa 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -338,7 +338,8 @@ const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { // Branch F(T_Comp_BCOND), F(T_Comp_BranchXchangeReg), F(T_Comp_BranchXchangeReg), F(T_Comp_B), F(T_Comp_BL_LONG_1), F(T_Comp_BL_LONG_2), // Unk, SVC - NULL, NULL + NULL, NULL, + F(T_Comp_BL_Merged) }; #undef F @@ -361,21 +362,18 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs ConstantCycles = 0; Thumb = cpu->CPSR & 0x20; Num = cpu->Num; - R15 = cpu->R[15]; CodeRegion = cpu->CodeRegion; CurCPU = cpu; CompiledBlock res = (CompiledBlock)GetWritableCodePtr(); if (!(Num == 0 - ? IsMapped<0>(R15 - (Thumb ? 2 : 4)) - : IsMapped<1>(R15 - (Thumb ? 2 : 4)))) + ? IsMapped<0>(instrs[0].Addr - (Thumb ? 2 : 4)) + : IsMapped<1>(instrs[0].Addr - (Thumb ? 2 : 4)))) { printf("Trying to compile a block in unmapped memory\n"); } - bool mergedThumbBL = false; - ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); MOV(64, R(RCPU), ImmPtr(cpu)); @@ -387,8 +385,8 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs for (int i = 0; i < instrsCount; i++) { - R15 += Thumb ? 2 : 4; CurInstr = instrs[i]; + R15 = CurInstr.Addr + (Thumb ? 4 : 8); CompileFunc comp = Thumb ? T_Comp[CurInstr.Info.Kind] @@ -406,29 +404,21 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs } if (comp != NULL) - RegCache.Prepare(i); + RegCache.Prepare(Thumb, i); else RegCache.Flush(); if (Thumb) { - if (i < instrsCount - 1 && CurInstr.Info.Kind == ARMInstrInfo::tk_BL_LONG_1 - && instrs[i + 1].Info.Kind == ARMInstrInfo::tk_BL_LONG_2) - mergedThumbBL = true; - else + u32 icode = (CurInstr.Instr >> 6) & 0x3FF; + if (comp == NULL) { - u32 icode = (CurInstr.Instr >> 6) & 0x3FF; - if (comp == NULL) - { - MOV(64, R(ABI_PARAM1), R(RCPU)); + MOV(64, R(ABI_PARAM1), R(RCPU)); - ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]); - } - else if (mergedThumbBL) - T_Comp_BL_Merged(instrs[i - 1]); - else - (this->*comp)(); + ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]); } + else + (this->*comp)(); } else { diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index a62f043..fcb2380 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -90,7 +90,7 @@ public: void T_Comp_BranchXchangeReg(); void T_Comp_BL_LONG_1(); void T_Comp_BL_LONG_2(); - void T_Comp_BL_Merged(FetchedInstr prefix); + void T_Comp_BL_Merged(); void Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size); s32 Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index 5336837..d01c600 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -212,6 +212,9 @@ enum tk_UNK, tk_SVC, + // not a real instruction + tk_BL_LONG, + tk_Count }; -- cgit v1.2.3 From a687be9879e5cab4ea5d8646c8cf47c214b18856 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Thu, 3 Oct 2019 01:10:59 +0200 Subject: new block cache and much more... - more reliable code invalidation detection - blocks aren't stopped at any branch, but are being followed if possible to get larger blocks - idle loop recognition - optimised literal loads, load/store cycle counting and loads/stores from constant addresses --- src/ARM.cpp | 44 ++- src/ARM.h | 16 +- src/ARMInterpreter.h | 9 + src/ARMJIT.cpp | 755 ++++++++++++++++++++++++++++++------ src/ARMJIT.h | 141 ++----- src/ARMJIT_Internal.h | 198 ++++++++++ src/ARMJIT_RegisterCache.h | 36 +- src/ARMJIT_x64/ARMJIT_ALU.cpp | 16 +- src/ARMJIT_x64/ARMJIT_Branch.cpp | 43 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 184 +++++++-- src/ARMJIT_x64/ARMJIT_Compiler.h | 51 ++- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 629 ++++++++++++++---------------- src/ARM_InstrInfo.cpp | 47 ++- src/ARM_InstrInfo.h | 11 +- src/CP15.cpp | 12 +- src/Config.cpp | 2 + src/Config.h | 1 + src/NDS.cpp | 22 +- src/libui_sdl/DlgEmuSettings.cpp | 22 +- 19 files changed, 1550 insertions(+), 689 deletions(-) create mode 100644 src/ARMJIT_Internal.h (limited to 'src/ARMJIT_RegisterCache.h') diff --git a/src/ARM.cpp b/src/ARM.cpp index e404943..423c940 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -580,21 +580,26 @@ void ARMv5::ExecuteJIT() return; } - ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock<0>(instrAddr); - Cycles += (block ? block : ARMJIT::CompileBlock(this))(); + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<0>(instrAddr); + if (block) + Cycles += block(); + else + ARMJIT::CompileBlock(this); + + NDS::ARM9Timestamp += Cycles; + Cycles = 0; + if (IRQ) TriggerIRQ(); if (Halted) { - if (Halted == 1 && NDS::ARM9Timestamp < NDS::ARM9Target) + bool idleLoop = Halted & 0x20; + Halted &= ~0x20; + if ((Halted == 1 || idleLoop) && NDS::ARM9Timestamp < NDS::ARM9Target) { NDS::ARM9Timestamp = NDS::ARM9Target; } break; } - if (IRQ) TriggerIRQ(); - - NDS::ARM9Timestamp += Cycles; - Cycles = 0; } if (Halted == 2) @@ -710,23 +715,28 @@ void ARMv4::ExecuteJIT() printf("ARMv4 PC in non executable region %08X\n", R[15]); return; } - ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock<1>(instrAddr); - Cycles += (block ? block : ARMJIT::CompileBlock(this))(); + + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<1>(instrAddr); + if (block) + Cycles += block(); + else + ARMJIT::CompileBlock(this); + + NDS::ARM7Timestamp += Cycles; + Cycles = 0; // TODO optimize this shit!!! + if (IRQ) TriggerIRQ(); if (Halted) { - if (Halted == 1 && NDS::ARM7Timestamp < NDS::ARM7Target) + bool idleLoop = Halted & 0x20; + Halted &= ~0x20; + if ((Halted == 1 || idleLoop) && NDS::ARM7Timestamp < NDS::ARM7Target) { NDS::ARM7Timestamp = NDS::ARM7Target; } break; } - - if (IRQ) TriggerIRQ(); - - NDS::ARM7Timestamp += Cycles; - Cycles = 0; } if (Halted == 2) @@ -736,6 +746,8 @@ void ARMv4::ExecuteJIT() void ARMv5::FillPipeline() { + SetupCodeMem(R[15]); + if (CPSR & 0x20) { if ((R[15] - 2) & 0x2) @@ -758,6 +770,8 @@ void ARMv5::FillPipeline() void ARMv4::FillPipeline() { + SetupCodeMem(R[15]); + if (CPSR & 0x20) { NextInstr[0] = CodeRead16(R[15] - 2); diff --git a/src/ARM.h b/src/ARM.h index 4d387bc..8a01068 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -299,7 +299,7 @@ public: { *val = NDS::ARM7Read8(addr); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataRead16(u32 addr, u32* val) @@ -308,7 +308,7 @@ public: *val = NDS::ARM7Read16(addr); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataRead32(u32 addr, u32* val) @@ -317,7 +317,7 @@ public: *val = NDS::ARM7Read32(addr); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][2]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } void DataRead32S(u32 addr, u32* val) @@ -325,14 +325,14 @@ public: addr &= ~3; *val = NDS::ARM7Read32(addr); - DataCycles += NDS::ARM7MemTimings[DataRegion][3]; + DataCycles += NDS::ARM7MemTimings[addr >> 15][3]; } void DataWrite8(u32 addr, u8 val) { NDS::ARM7Write8(addr, val); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataWrite16(u32 addr, u16 val) @@ -341,7 +341,7 @@ public: NDS::ARM7Write16(addr, val); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataWrite32(u32 addr, u32 val) @@ -350,7 +350,7 @@ public: NDS::ARM7Write32(addr, val); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][2]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } void DataWrite32S(u32 addr, u32 val) @@ -358,7 +358,7 @@ public: addr &= ~3; NDS::ARM7Write32(addr, val); - DataCycles += NDS::ARM7MemTimings[DataRegion][3]; + DataCycles += NDS::ARM7MemTimings[addr >> 15][3]; } diff --git a/src/ARMInterpreter.h b/src/ARMInterpreter.h index 7244238..2bf8167 100644 --- a/src/ARMInterpreter.h +++ b/src/ARMInterpreter.h @@ -28,6 +28,15 @@ namespace ARMInterpreter extern void (*ARMInstrTable[4096])(ARM* cpu); extern void (*THUMBInstrTable[1024])(ARM* cpu); +void A_MSR_IMM(ARM* cpu); +void A_MSR_REG(ARM* cpu); +void A_MRS(ARM* cpu); +void A_MCR(ARM* cpu); +void A_MRC(ARM* cpu); +void A_SVC(ARM* cpu); + +void T_SVC(ARM* cpu); + void A_BLX_IMM(ARM* cpu); // I'm a special one look at me } diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 85cadf3..686bdd6 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -1,122 +1,137 @@ #include "ARMJIT.h" #include +#include #include "Config.h" +#include "ARMJIT_Internal.h" #include "ARMJIT_x64/ARMJIT_Compiler.h" +#include "ARMInterpreter_ALU.h" +#include "ARMInterpreter_LoadStore.h" +#include "ARMInterpreter_Branch.h" +#include "ARMInterpreter.h" + +#include "GPU3D.h" +#include "SPU.h" +#include "Wifi.h" + namespace ARMJIT { +#define JIT_DEBUGPRINT(msg, ...) + Compiler* compiler; -BlockCache cache; -#define DUP2(x) x, x +const u32 ExeMemRegionSizes[] = { + 0x8000, // Unmapped Region (dummy) + 0x8000, // ITCM + 4*1024*1024, // Main RAM + 0x8000, // SWRAM + 0xA4000, // LCDC + 0x8000, // ARM9 BIOS + 0x4000, // ARM7 BIOS + 0x10000, // ARM7 WRAM + 0x40000 // ARM7 WVRAM +}; -static ptrdiff_t JIT_MEM[2][32] = { - //arm9 - { - /* 0X*/ DUP2(offsetof(BlockCache, ARM9_ITCM)), - /* 1X*/ DUP2(offsetof(BlockCache, ARM9_ITCM)), // mirror - /* 2X*/ DUP2(offsetof(BlockCache, MainRAM)), - /* 3X*/ DUP2(offsetof(BlockCache, SWRAM)), - /* 4X*/ DUP2(-1), - /* 5X*/ DUP2(-1), - /* 6X*/ -1, - offsetof(BlockCache, ARM9_LCDC), // Plain ARM9-CPU Access (LCDC mode) (max 656KB) - /* 7X*/ DUP2(-1), - /* 8X*/ DUP2(-1), - /* 9X*/ DUP2(-1), - /* AX*/ DUP2(-1), - /* BX*/ DUP2(-1), - /* CX*/ DUP2(-1), - /* DX*/ DUP2(-1), - /* EX*/ DUP2(-1), - /* FX*/ DUP2(offsetof(BlockCache, ARM9_BIOS)) - }, - //arm7 - { - /* 0X*/ DUP2(offsetof(BlockCache, ARM7_BIOS)), - /* 1X*/ DUP2(-1), - /* 2X*/ DUP2(offsetof(BlockCache, MainRAM)), - /* 3X*/ offsetof(BlockCache, SWRAM), - offsetof(BlockCache, ARM7_WRAM), - /* 4X*/ DUP2(-1), - /* 5X*/ DUP2(-1), - /* 6X*/ DUP2(offsetof(BlockCache, ARM7_WVRAM)), /* contrary to Gbatek, melonDS and itself, - DeSmuME doesn't mirror the 64 MB region at 0x6800000 */ - /* 7X*/ DUP2(-1), - /* 8X*/ DUP2(-1), - /* 9X*/ DUP2(-1), - /* AX*/ DUP2(-1), - /* BX*/ DUP2(-1), - /* CX*/ DUP2(-1), - /* DX*/ DUP2(-1), - /* EX*/ DUP2(-1), - /* FX*/ DUP2(-1) - } +const u32 ExeMemRegionOffsets[] = { + 0, + 0x8000, + 0x10000, + 0x410000, + 0x418000, + 0x4BC000, + 0x4C4000, + 0x4C8000, + 0x4D8000, + 0x518000, }; -static u32 JIT_MASK[2][32] = { +#define DUP2(x) x, x + +const static ExeMemKind JIT_MEM[2][32] = { //arm9 { - /* 0X*/ DUP2(0x00007FFF), - /* 1X*/ DUP2(0x00007FFF), - /* 2X*/ DUP2(0x003FFFFF), - /* 3X*/ DUP2(0x00007FFF), - /* 4X*/ DUP2(0x00000000), - /* 5X*/ DUP2(0x00000000), - /* 6X*/ 0x00000000, - 0x000FFFFF, - /* 7X*/ DUP2(0x00000000), - /* 8X*/ DUP2(0x00000000), - /* 9X*/ DUP2(0x00000000), - /* AX*/ DUP2(0x00000000), - /* BX*/ DUP2(0x00000000), - /* CX*/ DUP2(0x00000000), - /* DX*/ DUP2(0x00000000), - /* EX*/ DUP2(0x00000000), - /* FX*/ DUP2(0x00007FFF) + /* 0X*/ DUP2(exeMem_ITCM), + /* 1X*/ DUP2(exeMem_ITCM), // mirror + /* 2X*/ DUP2(exeMem_MainRAM), + /* 3X*/ DUP2(exeMem_SWRAM), + /* 4X*/ DUP2(exeMem_Unmapped), + /* 5X*/ DUP2(exeMem_Unmapped), + /* 6X*/ exeMem_Unmapped, + exeMem_LCDC, // Plain ARM9-CPU Access (LCDC mode) (max 656KB) + /* 7X*/ DUP2(exeMem_Unmapped), + /* 8X*/ DUP2(exeMem_Unmapped), + /* 9X*/ DUP2(exeMem_Unmapped), + /* AX*/ DUP2(exeMem_Unmapped), + /* BX*/ DUP2(exeMem_Unmapped), + /* CX*/ DUP2(exeMem_Unmapped), + /* DX*/ DUP2(exeMem_Unmapped), + /* EX*/ DUP2(exeMem_Unmapped), + /* FX*/ DUP2(exeMem_ARM9_BIOS) }, //arm7 { - /* 0X*/ DUP2(0x00003FFF), - /* 1X*/ DUP2(0x00000000), - /* 2X*/ DUP2(0x003FFFFF), - /* 3X*/ 0x00007FFF, - 0x0000FFFF, - /* 4X*/ 0x00000000, - 0x0000FFFF, - /* 5X*/ DUP2(0x00000000), - /* 6X*/ DUP2(0x0003FFFF), - /* 7X*/ DUP2(0x00000000), - /* 8X*/ DUP2(0x00000000), - /* 9X*/ DUP2(0x00000000), - /* AX*/ DUP2(0x00000000), - /* BX*/ DUP2(0x00000000), - /* CX*/ DUP2(0x00000000), - /* DX*/ DUP2(0x00000000), - /* EX*/ DUP2(0x00000000), - /* FX*/ DUP2(0x00000000) + /* 0X*/ DUP2(exeMem_ARM7_BIOS), + /* 1X*/ DUP2(exeMem_Unmapped), + /* 2X*/ DUP2(exeMem_MainRAM), + /* 3X*/ exeMem_SWRAM, + exeMem_ARM7_WRAM, + /* 4X*/ DUP2(exeMem_Unmapped), + /* 5X*/ DUP2(exeMem_Unmapped), + /* 6X*/ DUP2(exeMem_ARM7_WVRAM), /* contrary to Gbatek, melonDS and itself, + DeSmuME doesn't mirror the 64 MB region at 0x6800000 */ + /* 7X*/ DUP2(exeMem_Unmapped), + /* 8X*/ DUP2(exeMem_Unmapped), + /* 9X*/ DUP2(exeMem_Unmapped), + /* AX*/ DUP2(exeMem_Unmapped), + /* BX*/ DUP2(exeMem_Unmapped), + /* CX*/ DUP2(exeMem_Unmapped), + /* DX*/ DUP2(exeMem_Unmapped), + /* EX*/ DUP2(exeMem_Unmapped), + /* FX*/ DUP2(exeMem_Unmapped) } }; #undef DUP2 +/* + translates address to pseudo physical address + - more compact, eliminates mirroring, everything comes in a row + - we only need one translation table +*/ +u32 AddrTranslate9[0x2000]; +u32 AddrTranslate7[0x4000]; -void Init() +JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2]; +AddressRange CodeRanges[ExeMemSpaceSize / 256]; + +TinyVector JitBlocks; +JitBlock* RestoreCandidates[0x1000] = {NULL}; + +u32 HashRestoreCandidate(u32 pseudoPhysicalAddr) { - memset(&cache, 0, sizeof(BlockCache)); + return (u32)(((u64)pseudoPhysicalAddr * 11400714819323198485llu) >> 53); +} +void Init() +{ for (int i = 0; i < 0x2000; i++) - cache.AddrMapping9[i] = JIT_MEM[0][i >> 8] == -1 ? NULL : - (CompiledBlock*)((u8*)&cache + JIT_MEM[0][i >> 8]) - + (((i << 15) & JIT_MASK[0][i >> 8]) >> 1); + { + ExeMemKind kind = JIT_MEM[0][i >> 8]; + u32 size = ExeMemRegionSizes[kind]; + + AddrTranslate9[i] = ExeMemRegionOffsets[kind] + ((i << 15) & (size - 1)); + } for (int i = 0; i < 0x4000; i++) - cache.AddrMapping7[i] = JIT_MEM[1][i >> 9] == -1 ? NULL : - (CompiledBlock*)((u8*)&cache + JIT_MEM[1][i >> 9]) - + (((i << 14) & JIT_MASK[1][i >> 9]) >> 1); + { + ExeMemKind kind = JIT_MEM[1][i >> 9]; + u32 size = ExeMemRegionSizes[kind]; + + AddrTranslate7[i] = ExeMemRegionOffsets[kind] + ((i << 14) & (size - 1)); + } compiler = new Compiler(); } @@ -126,7 +141,7 @@ void DeInit() delete compiler; } -void floodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) +void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) { for (int j = start; j >= 0; j--) { @@ -144,7 +159,154 @@ void floodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) } } -CompiledBlock CompileBlock(ARM* cpu) +bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetAddr) +{ + if (thumb) + { + u32 r15 = instr.Addr + 4; + cond = 0xE; + + if (instr.Info.Kind == ARMInstrInfo::tk_BL_LONG && !(instr.Instr & (1 << 12))) + { + targetAddr = r15 + ((s32)((instr.Instr & 0x7FF) << 21) >> 9); + targetAddr += ((instr.Instr >> 16) & 0x7FF) << 1; + return true; + } + else if (instr.Info.Kind == ARMInstrInfo::tk_B) + { + s32 offset = (s32)((instr.Instr & 0x7FF) << 21) >> 20; + targetAddr = r15 + offset; + return true; + } + else if (instr.Info.Kind == ARMInstrInfo::tk_BCOND) + { + cond = (instr.Instr >> 8) & 0xF; + s32 offset = (s32)(instr.Instr << 24) >> 23; + targetAddr = r15 + offset; + return true; + } + } + else + { + cond = instr.Cond(); + if (instr.Info.Kind == ARMInstrInfo::ak_BL + || instr.Info.Kind == ARMInstrInfo::ak_B) + { + s32 offset = (s32)(instr.Instr << 8) >> 6; + u32 r15 = instr.Addr + 8; + targetAddr = r15 + offset; + return true; + } + } + return false; +} + +bool IsIdleLoop(FetchedInstr* instrs, int instrsCount) +{ + // see https://github.com/dolphin-emu/dolphin/blob/master/Source/Core/Core/PowerPC/PPCAnalyst.cpp#L678 + // it basically checks if one iteration of a loop depends on another + // the rules are quite simple + + u16 regsWrittenTo = 0; + u16 regsDisallowedToWrite = 0; + for (int i = 0; i < instrsCount; i++) + { + //printf("instr %d %x regs(%x %x) %x %x\n", i, instrs[i].Instr, instrs[i].Info.DstRegs, instrs[i].Info.SrcRegs, regsWrittenTo, regsDisallowedToWrite); + if (instrs[i].Info.SpecialKind == ARMInstrInfo::special_WriteMem) + return false; + if (i < instrsCount - 1 && instrs[i].Info.Branches()) + return false; + + u16 srcRegs = instrs[i].Info.SrcRegs & ~(1 << 15); + u16 dstRegs = instrs[i].Info.DstRegs & ~(1 << 15); + + regsDisallowedToWrite |= srcRegs & ~regsWrittenTo; + + if (dstRegs & regsDisallowedToWrite) + return false; + regsWrittenTo |= dstRegs; + } + return true; +} + +typedef void (*InterpreterFunc)(ARM* cpu); + +#define F(x) &ARMInterpreter::A_##x +#define F_ALU(name, s) \ + F(name##_REG_LSL_IMM##s), F(name##_REG_LSR_IMM##s), F(name##_REG_ASR_IMM##s), F(name##_REG_ROR_IMM##s), \ + F(name##_REG_LSL_REG##s), F(name##_REG_LSR_REG##s), F(name##_REG_ASR_REG##s), F(name##_REG_ROR_REG##s), F(name##_IMM##s) +#define F_MEM_WB(name) \ + F(name##_REG_LSL), F(name##_REG_LSR), F(name##_REG_ASR), F(name##_REG_ROR), F(name##_IMM), \ + F(name##_POST_REG_LSL), F(name##_POST_REG_LSR), F(name##_POST_REG_ASR), F(name##_POST_REG_ROR), F(name##_POST_IMM) +#define F_MEM_HD(name) \ + F(name##_REG), F(name##_IMM), F(name##_POST_REG), F(name##_POST_IMM) +InterpreterFunc InterpretARM[ARMInstrInfo::ak_Count] = +{ + F_ALU(AND,), F_ALU(AND,_S), + F_ALU(EOR,), F_ALU(EOR,_S), + F_ALU(SUB,), F_ALU(SUB,_S), + F_ALU(RSB,), F_ALU(RSB,_S), + F_ALU(ADD,), F_ALU(ADD,_S), + F_ALU(ADC,), F_ALU(ADC,_S), + F_ALU(SBC,), F_ALU(SBC,_S), + F_ALU(RSC,), F_ALU(RSC,_S), + F_ALU(ORR,), F_ALU(ORR,_S), + F_ALU(MOV,), F_ALU(MOV,_S), + F_ALU(BIC,), F_ALU(BIC,_S), + F_ALU(MVN,), F_ALU(MVN,_S), + F_ALU(TST,), + F_ALU(TEQ,), + F_ALU(CMP,), + F_ALU(CMN,), + + F(MUL), F(MLA), F(UMULL), F(UMLAL), F(SMULL), F(SMLAL), F(SMLAxy), F(SMLAWy), F(SMULWy), F(SMLALxy), F(SMULxy), + F(CLZ), F(QADD), F(QDADD), F(QSUB), F(QDSUB), + + F_MEM_WB(STR), + F_MEM_WB(STRB), + F_MEM_WB(LDR), + F_MEM_WB(LDRB), + + F_MEM_HD(STRH), + F_MEM_HD(LDRD), + F_MEM_HD(STRD), + F_MEM_HD(LDRH), + F_MEM_HD(LDRSB), + F_MEM_HD(LDRSH), + + F(SWP), F(SWPB), + F(LDM), F(STM), + + F(B), F(BL), F(BLX_IMM), F(BX), F(BLX_REG), + F(UNK), F(MSR_IMM), F(MSR_REG), F(MRS), F(MCR), F(MRC), F(SVC) +}; +#undef F_ALU +#undef F_MEM_WB +#undef F_MEM_HD +#undef F + +#define F(x) ARMInterpreter::T_##x +InterpreterFunc InterpretTHUMB[ARMInstrInfo::tk_Count] = +{ + F(LSL_IMM), F(LSR_IMM), F(ASR_IMM), + F(ADD_REG_), F(SUB_REG_), F(ADD_IMM_), F(SUB_IMM_), + F(MOV_IMM), F(CMP_IMM), F(ADD_IMM), F(SUB_IMM), + F(AND_REG), F(EOR_REG), F(LSL_REG), F(LSR_REG), F(ASR_REG), + F(ADC_REG), F(SBC_REG), F(ROR_REG), F(TST_REG), F(NEG_REG), + F(CMP_REG), F(CMN_REG), F(ORR_REG), F(MUL_REG), F(BIC_REG), F(MVN_REG), + F(ADD_HIREG), F(CMP_HIREG), F(MOV_HIREG), + F(ADD_PCREL), F(ADD_SPREL), F(ADD_SP), + F(LDR_PCREL), F(STR_REG), F(STRB_REG), F(LDR_REG), F(LDRB_REG), F(STRH_REG), + F(LDRSB_REG), F(LDRH_REG), F(LDRSH_REG), F(STR_IMM), F(LDR_IMM), F(STRB_IMM), + F(LDRB_IMM), F(STRH_IMM), F(LDRH_IMM), F(STR_SPREL), F(LDR_SPREL), + F(PUSH), F(POP), F(LDMIA), F(STMIA), + F(BCOND), F(BX), F(BLX_REG), F(B), F(BL_LONG_1), F(BL_LONG_2), + F(UNK), F(SVC), + NULL // BL_LONG psudo opcode +}; +#undef F + +void CompileBlock(ARM* cpu) { bool thumb = cpu->CPSR & 0x20; @@ -153,17 +315,41 @@ CompiledBlock CompileBlock(ARM* cpu) if (Config::JIT_MaxBlockSize > 32) Config::JIT_MaxBlockSize = 32; + u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4); + if (!(cpu->Num == 0 + ? IsMapped<0>(blockAddr) + : IsMapped<1>(blockAddr))) + { + printf("Trying to compile a block in unmapped memory: %x\n", blockAddr); + } + + u32 pseudoPhysicalAddr = cpu->Num == 0 + ? TranslateAddr<0>(blockAddr) + : TranslateAddr<1>(blockAddr); + FetchedInstr instrs[Config::JIT_MaxBlockSize]; int i = 0; - u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4); u32 r15 = cpu->R[15]; + + u32 addresseRanges[32] = {}; + u32 numAddressRanges = 0; + cpu->FillPipeline(); u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; u32 nextInstrAddr[2] = {blockAddr, r15}; + + JIT_DEBUGPRINT("start block %x (%x) %p %p (region invalidates %dx)\n", + blockAddr, pseudoPhysicalAddr, FastBlockAccess[pseudoPhysicalAddr / 2], + cpu->Num == 0 ? LookUpBlock<0>(blockAddr) : LookUpBlock<1>(blockAddr), + CodeRanges[pseudoPhysicalAddr / 256].TimesInvalidated); + + u32 lastSegmentStart = blockAddr; + do { r15 += thumb ? 2 : 4; + instrs[i].BranchFlags = 0; instrs[i].SetFlags = 0; instrs[i].Instr = nextInstr[0]; instrs[i].NextInstr[0] = nextInstr[0] = nextInstr[1]; @@ -171,6 +357,25 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].Addr = nextInstrAddr[0]; nextInstrAddr[0] = nextInstrAddr[1]; nextInstrAddr[1] = r15; + JIT_DEBUGPRINT("instr %08x %x\n", instrs[i].Instr & (thumb ? 0xFFFF : ~0), instrs[i].Addr); + + u32 translatedAddr = (cpu->Num == 0 + ? TranslateAddr<0>(instrs[i].Addr) + : TranslateAddr<1>(instrs[i].Addr)) & ~0xFF; + if (i == 0 || translatedAddr != addresseRanges[numAddressRanges - 1]) + { + bool returning = false; + for (int j = 0; j < numAddressRanges; j++) + { + if (addresseRanges[j] == translatedAddr) + { + returning = true; + break; + } + } + if (!returning) + addresseRanges[numAddressRanges++] = translatedAddr; + } if (cpu->Num == 0) { @@ -198,6 +403,34 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].NextInstr[1] = nextInstr[1]; instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr); + cpu->R[15] = r15; + cpu->CurInstr = instrs[i].Instr; + cpu->CodeCycles = instrs[i].CodeCycles; + + if (thumb) + { + InterpretTHUMB[instrs[i].Info.Kind](cpu); + } + else + { + if (cpu->Num == 0 && instrs[i].Info.Kind == ARMInstrInfo::ak_BLX_IMM) + { + ARMInterpreter::A_BLX_IMM(cpu); + } + else + { + u32 icode = ((instrs[i].Instr >> 4) & 0xF) | ((instrs[i].Instr >> 16) & 0xFF0); + assert(InterpretARM[instrs[i].Info.Kind] == ARMInterpreter::ARMInstrTable[icode] || instrs[i].Info.Kind == ARMInstrInfo::ak_MOV_REG_LSL_IMM); + if (cpu->CheckCondition(instrs[i].Cond())) + InterpretARM[instrs[i].Info.Kind](cpu); + else + cpu->AddCycles_C(); + } + } + + instrs[i].DataCycles = cpu->DataCycles; + instrs[i].DataRegion = cpu->DataRegion; + if (thumb && instrs[i].Info.Kind == ARMInstrInfo::tk_BL_LONG_2 && i > 0 && instrs[i - 1].Info.Kind == ARMInstrInfo::tk_BL_LONG_1) { @@ -208,40 +441,340 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i - 1].Info.EndBlock = true; i--; } - i++; + if (instrs[i].Info.Branches() && Config::JIT_BrancheOptimisations) + { + bool hasBranched = cpu->R[15] != r15; + + u32 cond, target; + bool staticBranch = DecodeBranch(thumb, instrs[i], cond, target); + JIT_DEBUGPRINT("branch cond %x target %x (%d)\n", cond, target, hasBranched); + + if (staticBranch) + { + bool isBackJump = false; + if (hasBranched) + { + for (int j = 0; j < i; j++) + { + if (instrs[i].Addr == target) + { + isBackJump = true; + break; + } + } + } + + if (cond < 0xE && target < instrs[i].Addr && target >= lastSegmentStart) + { + // we might have an idle loop + u32 offset = (target - blockAddr) / (thumb ? 2 : 4); + if (IsIdleLoop(instrs + offset, i - offset + 1)) + { + instrs[i].BranchFlags |= branch_IdleBranch; + JIT_DEBUGPRINT("found %s idle loop %d in block %x\n", thumb ? "thumb" : "arm", cpu->Num, blockAddr); + } + } + else if (hasBranched && (!thumb || cond == 0xE) && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) + { + u32 targetPseudoPhysical = cpu->Num == 0 + ? TranslateAddr<0>(target) + : TranslateAddr<1>(target); + + r15 = target + (thumb ? 2 : 4); + assert(r15 == cpu->R[15]); + + JIT_DEBUGPRINT("block lengthened by static branch (target %x)\n", target); + + nextInstr[0] = cpu->NextInstr[0]; + nextInstr[1] = cpu->NextInstr[1]; + + nextInstrAddr[0] = target; + nextInstrAddr[1] = r15; + + lastSegmentStart = target; + + instrs[i].Info.EndBlock = false; + + if (cond < 0xE) + instrs[i].BranchFlags |= branch_FollowCondTaken; + } + } + + if (!hasBranched && cond < 0xE && i + 1 < Config::JIT_MaxBlockSize) + { + instrs[i].Info.EndBlock = false; + instrs[i].BranchFlags |= branch_FollowCondNotTaken; + } + } + + i++; bool canCompile = compiler->CanCompile(thumb, instrs[i - 1].Info.Kind); - if (instrs[i - 1].Info.ReadFlags != 0 || !canCompile) - floodFillSetFlags(instrs, i - 2, canCompile ? instrs[i - 1].Info.ReadFlags : 0xF); - } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize); + bool secondaryFlagReadCond = !canCompile || (instrs[i - 1].BranchFlags & (branch_FollowCondTaken | branch_FollowCondNotTaken)); + if (instrs[i - 1].Info.ReadFlags != 0 || secondaryFlagReadCond) + FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? instrs[i - 1].Info.ReadFlags : 0xF); + } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted); - floodFillSetFlags(instrs, i - 1, 0xF); + u32 restoreSlot = HashRestoreCandidate(pseudoPhysicalAddr); + JitBlock* prevBlock = RestoreCandidates[restoreSlot]; + bool mayRestore = true; + if (prevBlock && prevBlock->PseudoPhysicalAddr == pseudoPhysicalAddr) + { + RestoreCandidates[restoreSlot] = NULL; + if (prevBlock->NumInstrs == i) + { + for (int j = 0; j < i; j++) + { + if (prevBlock->Instrs()[j] != instrs[j].Instr) + { + mayRestore = false; + break; + } + } + } + else + mayRestore = false; - CompiledBlock block = compiler->CompileBlock(cpu, instrs, i); + if (prevBlock->NumAddresses == numAddressRanges) + { + for (int j = 0; j < numAddressRanges; j++) + { + if (prevBlock->AddressRanges()[j] != addresseRanges[j]) + { + mayRestore = false; + break; + } + } + } + else + mayRestore = false; + } + else + { + mayRestore = false; + prevBlock = NULL; + } - if (cpu->Num == 0) - InsertBlock<0>(blockAddr, block); + JitBlock* block; + if (!mayRestore) + { + if (prevBlock) + delete prevBlock; + + block = new JitBlock(i, numAddressRanges); + for (int j = 0; j < i; j++) + block->Instrs()[j] = instrs[j].Instr; + for (int j = 0; j < numAddressRanges; j++) + block->AddressRanges()[j] = addresseRanges[j]; + + block->StartAddr = blockAddr; + block->PseudoPhysicalAddr = pseudoPhysicalAddr; + + FloodFillSetFlags(instrs, i - 1, 0xF); + + block->EntryPoint = compiler->CompileBlock(cpu, thumb, instrs, i); + } else - InsertBlock<1>(blockAddr, block); + { + JIT_DEBUGPRINT("restored! %p\n", prevBlock); + block = prevBlock; + } + + for (int j = 0; j < numAddressRanges; j++) + { + assert(addresseRanges[j] == block->AddressRanges()[j]); + CodeRanges[addresseRanges[j] / 256].Blocks.Add(block); + } + + FastBlockAccess[block->PseudoPhysicalAddr / 2] = block->EntryPoint; - return block; + JitBlocks.Add(block); } -void InvalidateBlockCache() +void InvalidateByAddr(u32 pseudoPhysical) { - printf("Resetting JIT block cache...\n"); + JIT_DEBUGPRINT("invalidating by addr %x\n", pseudoPhysical); + AddressRange* range = &CodeRanges[pseudoPhysical / 256]; + int startLength = range->Blocks.Length; + for (int i = 0; i < range->Blocks.Length; i++) + { + assert(range->Blocks.Length == startLength); + JitBlock* block = range->Blocks[i]; + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + if ((addr / 256) != (pseudoPhysical / 256)) + { + AddressRange* otherRange = &CodeRanges[addr / 256]; + assert(otherRange != range); + assert(otherRange->Blocks.RemoveByValue(block)); + } + } + + assert(JitBlocks.RemoveByValue(block)); + + FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; - memset(cache.MainRAM, 0, sizeof(cache.MainRAM)); - memset(cache.SWRAM, 0, sizeof(cache.SWRAM)); - memset(cache.ARM9_BIOS, 0, sizeof(cache.ARM9_BIOS)); - memset(cache.ARM9_ITCM, 0, sizeof(cache.ARM9_ITCM)); - memset(cache.ARM9_LCDC, 0, sizeof(cache.ARM9_LCDC)); - memset(cache.ARM7_BIOS, 0, sizeof(cache.ARM7_BIOS)); - memset(cache.ARM7_WRAM, 0, sizeof(cache.ARM7_WRAM)); - memset(cache.ARM7_WVRAM, 0, sizeof(cache.ARM7_WVRAM)); + u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); + if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) + delete RestoreCandidates[slot]; + + RestoreCandidates[slot] = block; + } + if ((range->TimesInvalidated + 1) > range->TimesInvalidated) + range->TimesInvalidated++; + + range->Blocks.Clear(); +} + +void InvalidateByAddr7(u32 addr) +{ + u32 pseudoPhysical = TranslateAddr<1>(addr); + if (__builtin_expect(CodeRanges[pseudoPhysical / 256].Blocks.Length > 0, false)) + InvalidateByAddr(pseudoPhysical); +} + +void InvalidateITCM(u32 addr) +{ + u32 pseudoPhysical = addr + ExeMemRegionOffsets[exeMem_ITCM]; + if (CodeRanges[pseudoPhysical / 256].Blocks.Length > 0) + InvalidateByAddr(pseudoPhysical); +} + +void InvalidateAll() +{ + JIT_DEBUGPRINT("invalidating all %x\n", JitBlocks.Length); + for (int i = 0; i < JitBlocks.Length; i++) + { + JitBlock* block = JitBlocks[i]; + + FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; + + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + AddressRange* range = &CodeRanges[addr / 256]; + range->Blocks.Clear(); + if (range->TimesInvalidated + 1 > range->TimesInvalidated) + range->TimesInvalidated++; + } + + u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); + if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) + delete RestoreCandidates[slot]; + + RestoreCandidates[slot] = block; + } + + JitBlocks.Clear(); +} + +void ResetBlockCache() +{ + printf("Resetting JIT block cache...\n"); + + memset(FastBlockAccess, 0, sizeof(FastBlockAccess)); + for (int i = 0; i < sizeof(RestoreCandidates)/sizeof(RestoreCandidates[0]); i++) + { + if (RestoreCandidates[i]) + { + delete RestoreCandidates[i]; + RestoreCandidates[i] = NULL; + } + } + for (int i = 0; i < JitBlocks.Length; i++) + { + JitBlock* block = JitBlocks[i]; + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + CodeRanges[addr / 256].Blocks.Clear(); + CodeRanges[addr / 256].TimesInvalidated = 0; + } + delete block; + } + JitBlocks.Clear(); compiler->Reset(); } +void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) +{ + if (cpu->Num == 0) + { + if ((addr & 0xFF000000) == 0x04000000) + { + /* + unfortunately we can't map GPU2D this way + since it's hidden inside an object + + though GPU3D registers are accessed much more intensive + */ + if (addr >= 0x04000320 && addr < 0x040006A4) + { + switch (size | store) + { + case 8: return (void*)GPU3D::Read8; + case 9: return (void*)GPU3D::Write8; + case 16: return (void*)GPU3D::Read16; + case 17: return (void*)GPU3D::Write16; + case 32: return (void*)GPU3D::Read32; + case 33: return (void*)GPU3D::Write32; + } + } + + switch (size | store) + { + case 8: return (void*)NDS::ARM9IORead8; + case 9: return (void*)NDS::ARM9IOWrite8; + case 16: return (void*)NDS::ARM9IORead16; + case 17: return (void*)NDS::ARM9IOWrite16; + case 32: return (void*)NDS::ARM9IORead32; + case 33: return (void*)NDS::ARM9IOWrite32; + } + } + } + else + { + switch (addr & 0xFF800000) + { + case 0x04000000: + if (addr >= 0x04000400 && addr < 0x04000520) + { + switch (size | store) + { + case 8: return (void*)SPU::Read8; + case 9: return (void*)SPU::Write8; + case 16: return (void*)SPU::Read16; + case 17: return (void*)SPU::Write16; + case 32: return (void*)SPU::Read32; + case 33: return (void*)SPU::Write32; + } + } + + switch (size | store) + { + case 8: return (void*)NDS::ARM7IORead8; + case 9: return (void*)NDS::ARM7IOWrite8; + case 16: return (void*)NDS::ARM7IORead16; + case 17: return (void*)NDS::ARM7IOWrite16; + case 32: return (void*)NDS::ARM7IORead32; + case 33: return (void*)NDS::ARM7IOWrite32; + } + break; + case 0x04800000: + if (addr < 0x04810000 && size == 16) + { + if (store) + return (void*)Wifi::Write; + else + return (void*)Wifi::Read; + } + break; + } + } + return NULL; +} + } \ No newline at end of file diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 7e448ef..1db4d66 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -9,142 +9,67 @@ namespace ARMJIT { -typedef u32 (*CompiledBlock)(); - -struct FetchedInstr +enum ExeMemKind { - u32 A_Reg(int pos) const - { - return (Instr >> pos) & 0xF; - } - - u32 T_Reg(int pos) const - { - return (Instr >> pos) & 0x7; - } - - u32 Cond() const - { - return Instr >> 28; - } - - u8 SetFlags; - u32 Instr; - u32 NextInstr[2]; - u32 Addr; - - u8 CodeCycles; - - ARMInstrInfo::Info Info; + exeMem_Unmapped = 0, + exeMem_ITCM, + exeMem_MainRAM, + exeMem_SWRAM, + exeMem_LCDC, + exeMem_ARM9_BIOS, + exeMem_ARM7_BIOS, + exeMem_ARM7_WRAM, + exeMem_ARM7_WVRAM, + exeMem_Count }; -/* - Copied from DeSmuME - Some names where changed to match the nomenclature of melonDS +extern const u32 ExeMemRegionOffsets[]; +extern const u32 ExeMemRegionSizes[]; - Since it's nowhere explained and atleast I needed some time to get behind it, - here's a summary on how it works: - more or less all memory locations from which code can be executed are - represented by an array of function pointers, which point to null or - a function which executes a block instructions starting from there. +typedef u32 (*JitBlockEntry)(); - The most significant 4 bits of each address is ignored. This 28 bit space is - divided into 0x2000 32 KB for ARM9 and 0x4000 16 KB for ARM7, each of which - a pointer to the relevant place inside the afore mentioned arrays. 32 and 16 KB - are the sizes of the smallest contigous memory region mapped to the respective CPU. - Because ARM addresses are always aligned to 4 bytes and Thumb to a 2 byte boundary, - we only need every second half word to be adressable. +extern u32 AddrTranslate9[0x2000]; +extern u32 AddrTranslate7[0x4000]; - In case a memory write hits mapped memory, the function block at this - address is set to null, so it's recompiled the next time it's executed. - - This method has disadvantages, namely that only writing to the - first instruction of a block marks it as invalid and that memory remapping - (SWRAM and VRAM) isn't taken into account. -*/ - -struct BlockCache -{ - CompiledBlock* AddrMapping9[0x2000] = {0}; - CompiledBlock* AddrMapping7[0x4000] = {0}; - - CompiledBlock MainRAM[4*1024*1024/2]; - CompiledBlock SWRAM[0x8000/2]; // Shared working RAM - CompiledBlock ARM9_ITCM[0x8000/2]; - CompiledBlock ARM9_LCDC[0xA4000/2]; - CompiledBlock ARM9_BIOS[0x8000/2]; - CompiledBlock ARM7_BIOS[0x4000/2]; - CompiledBlock ARM7_WRAM[0x10000/2]; // dedicated ARM7 WRAM - CompiledBlock ARM7_WVRAM[0x40000/2]; // VRAM allocated as Working RAM -}; - -extern BlockCache cache; +const u32 ExeMemSpaceSize = 0x518000; // I hate you C++, sometimes I really hate you... +extern JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2]; template inline bool IsMapped(u32 addr) { if (num == 0) - return cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15]; + return AddrTranslate9[(addr & 0xFFFFFFF) >> 15] >= ExeMemRegionSizes[exeMem_Unmapped]; else - return cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14]; + return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] >= ExeMemRegionSizes[exeMem_Unmapped]; } template -inline CompiledBlock LookUpBlock(u32 addr) +inline u32 TranslateAddr(u32 addr) { if (num == 0) - return cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1]; + return AddrTranslate9[(addr & 0xFFFFFFF) >> 15] + (addr & 0x7FFF); else - return cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1]; + return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] + (addr & 0x3FFF); } template -inline void Invalidate16(u32 addr) +inline JitBlockEntry LookUpBlock(u32 addr) { - if (IsMapped(addr)) - { - if (num == 0) - cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1] = NULL; - else - cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = NULL; - } -} - -template -inline void Invalidate32(u32 addr) -{ - if (IsMapped(addr)) - { - if (num == 0) - { - CompiledBlock* page = cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15]; - page[(addr & 0x7FFF) >> 1] = NULL; - page[((addr + 2) & 0x7FFF) >> 1] = NULL; - } - else - { - CompiledBlock* page = cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14]; - page[(addr & 0x3FFF) >> 1] = NULL; - page[((addr + 2) & 0x3FFF) >> 1] = NULL; - } - } -} - -template -inline void InsertBlock(u32 addr, CompiledBlock func) -{ - if (num == 0) - cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1] = func; - else - cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = func; + return FastBlockAccess[TranslateAddr(addr) / 2]; } void Init(); void DeInit(); -CompiledBlock CompileBlock(ARM* cpu); +void InvalidateByAddr(u32 pseudoPhysical); +void InvalidateAll(); + +void InvalidateITCM(u32 addr); +void InvalidateByAddr7(u32 addr); + +void CompileBlock(ARM* cpu); -void InvalidateBlockCache(); +void ResetBlockCache(); } diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h new file mode 100644 index 0000000..4acb488 --- /dev/null +++ b/src/ARMJIT_Internal.h @@ -0,0 +1,198 @@ +#ifndef ARMJIT_INTERNAL_H +#define ARMJIT_INTERNAL_H + +#include "types.h" +#include + +#include "ARMJIT.h" + +// here lands everything which doesn't fit into ARMJIT.h +// where it would be included by pretty much everything +namespace ARMJIT +{ + +enum +{ + branch_IdleBranch = 1 << 0, + branch_FollowCondTaken = 1 << 1, + branch_FollowCondNotTaken = 1 << 2 +}; + +struct FetchedInstr +{ + u32 A_Reg(int pos) const + { + return (Instr >> pos) & 0xF; + } + + u32 T_Reg(int pos) const + { + return (Instr >> pos) & 0x7; + } + + u32 Cond() const + { + return Instr >> 28; + } + + u8 BranchFlags; + u8 SetFlags; + u32 Instr; + u32 NextInstr[2]; + u32 Addr; + + u8 CodeCycles; + u8 DataCycles; + u8 DataRegion; + + ARMInstrInfo::Info Info; +}; + +/* + TinyVector + - because reinventing the wheel is the best! + + - meant to be used very often, with not so many elements + max 1 << 16 elements + - doesn't allocate while no elements are inserted + - not stl confirmant of course + - probably only works with POD types + - remove operations don't preserve order, but O(1)! +*/ +template +struct __attribute__((packed)) TinyVector +{ + T* Data = NULL; + u16 Capacity = 0; + u32 Length = 0; // make it 32 bit so we don't need movzx + + ~TinyVector() + { + delete[] Data; + } + + void MakeCapacity(u32 capacity) + { + assert(capacity <= UINT16_MAX); + assert(capacity > Capacity); + T* newMem = new T[capacity]; + if (Data != NULL) + memcpy(newMem, Data, sizeof(Data) * Length); + + T* oldData = Data; + Data = newMem; + if (oldData != NULL) + delete[] oldData; + + Capacity = capacity; + } + + void Clear() + { + Length = 0; + } + + void Add(T element) + { + assert(Length + 1 <= UINT16_MAX); + if (Length + 1 > Capacity) + MakeCapacity(((Capacity + 4) * 3) / 2); + + Data[Length++] = element; + } + + void Remove(int index) + { + assert(index >= 0 && index < Length); + + Length--; + Data[index] = Data[Length]; + /*for (int i = index; i < Length; i++) + Data[i] = Data[i + 1];*/ + } + + int Find(T needle) + { + for (int i = 0; i < Length; i++) + { + if (Data[i] == needle) + return i; + } + return -1; + } + + bool RemoveByValue(T needle) + { + for (int i = 0; i < Length; i++) + { + if (Data[i] == needle) + { + Remove(i); + return true; + } + } + return false; + } + + T& operator[](int index) + { + assert(index >= 0 && index < Length); + return Data[index]; + } +}; + +class JitBlock +{ +public: + JitBlock(u32 numInstrs, u32 numAddresses) + { + NumInstrs = numInstrs; + NumAddresses = numAddresses; + Data = new u32[numInstrs + numAddresses]; + } + + ~JitBlock() + { + delete[] Data; + } + + u32 StartAddr; + u32 PseudoPhysicalAddr; + + u32 NumInstrs; + u32 NumAddresses; + + JitBlockEntry EntryPoint; + + u32* Instrs() + { return Data; } + u32* AddressRanges() + { return Data + NumInstrs; } + +private: + /* + 0.. Blocks; + u16 TimesInvalidated; +}; + +extern AddressRange CodeRanges[ExeMemSpaceSize / 256]; + +typedef void (*InterpreterFunc)(ARM* cpu); +extern InterpreterFunc InterpretARM[]; +extern InterpreterFunc InterpretTHUMB[]; + +void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size); + +} + +#endif \ No newline at end of file diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index fe2f203..ed6a2b7 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -60,15 +60,46 @@ public: assert("Welp!"); } + void PutLiteral(int reg, u32 val) + { + LiteralsLoaded |= (1 << reg); + LiteralValues[reg] = val; + } + + void UnloadLiteral(int reg) + { + LiteralsLoaded &= ~(1 << reg); + } + + bool IsLiteral(int reg) + { + return LiteralsLoaded & (1 << reg); + } + + void PrepareExit() + { + BitSet16 dirtyRegs(DirtyRegs); + for (int reg : dirtyRegs) + Compiler->SaveReg(reg, Mapping[reg]); + } + void Flush() { BitSet16 loadedSet(LoadedRegs); for (int reg : loadedSet) UnloadRegister(reg); + LiteralsLoaded = 0; } void Prepare(bool thumb, int i) { + if (LoadedRegs & (1 << 15)) + UnloadRegister(15); + + BitSet16 invalidedLiterals(LiteralsLoaded & Instrs[i].Info.DstRegs); + for (int reg : invalidedLiterals) + UnloadLiteral(reg); + u16 futureNeeded = 0; int ranking[16]; for (int j = 0; j < 16; j++) @@ -86,7 +117,7 @@ public: for (int reg : neverNeededAgain) UnloadRegister(reg); - FetchedInstr Instr = Instrs[i]; + FetchedInstr Instr = Instrs[i]; u16 necessaryRegs = (Instr.Info.SrcRegs & ~(1 << 15)) | Instr.Info.DstRegs; BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); if (needToBeLoaded != BitSet16(0)) @@ -125,6 +156,9 @@ public: static const int NativeRegsAvailable; Reg Mapping[16]; + u32 LiteralValues[16]; + + u16 LiteralsLoaded = 0; u32 NativeRegsUsed = 0; u16 LoadedRegs = 0; u16 DirtyRegs = 0; diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index f868ddf..14c223b 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -213,7 +213,13 @@ void Compiler::A_Comp_MovOp() MOV(32, rd, op2); if (((CurInstr.Instr >> 21) & 0xF) == 0xF) + { NOT(32, rd); + if (op2.IsImm() && CurInstr.Cond() == 0xE) + RegCache.PutLiteral(CurInstr.A_Reg(12), ~op2.Imm32()); + } + else if (op2.IsImm() && CurInstr.Cond() == 0xE) + RegCache.PutLiteral(CurInstr.A_Reg(12), op2.Imm32()); if (S) { @@ -564,7 +570,13 @@ void Compiler::T_Comp_AddSub_() Comp_AddCycles_C(); - if (op & 1) + // special case for thumb mov being alias to add rd, rn, #0 + if (CurInstr.SetFlags == 0 && rn.IsImm() && rn.Imm32() == 0) + { + if (rd != rs) + MOV(32, rd, rs); + } + else if (op & 1) Comp_ArithTriOp(&Compiler::SUB, rd, rs, rn, false, opSetsFlags|opInvertCarry|opRetriveCV); else Comp_ArithTriOp(&Compiler::ADD, rd, rs, rn, false, opSetsFlags|opSymmetric|opRetriveCV); @@ -614,7 +626,7 @@ void Compiler::T_Comp_ALU() u32 op = (CurInstr.Instr >> 6) & 0xF; if ((op >= 0x2 && op < 0x4) || op == 0x7) - Comp_AddCycles_CI(1); + Comp_AddCycles_CI(1); // shift by reg else Comp_AddCycles_C(); diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index cc7a3c4..0dedb3f 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -16,9 +16,6 @@ int squeezePointer(T* ptr) void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) { // we can simplify constant branches by a lot - // it's not completely safe to assume stuff like, which instructions to preload - // we'll see how it works out - IrregularCycles = true; u32 newPC; @@ -39,18 +36,12 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) { ARMv5* cpu9 = (ARMv5*)CurCPU; - u32 oldregion = R15 >> 24; - u32 newregion = addr >> 24; - u32 regionCodeCycles = cpu9->MemTimings[addr >> 12][0]; u32 compileTimeCodeCycles = cpu9->RegionCodeCycles; cpu9->RegionCodeCycles = regionCodeCycles; - MOV(32, MDisp(RCPU, offsetof(ARMv5, RegionCodeCycles)), Imm32(regionCodeCycles)); - - bool setupRegion = newregion != oldregion; - if (setupRegion) - cpu9->SetupCodeMem(addr); + if (Exit) + MOV(32, MDisp(RCPU, offsetof(ARMv5, RegionCodeCycles)), Imm32(regionCodeCycles)); if (addr & 0x1) { @@ -83,12 +74,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) cycles += cpu9->CodeCycles; } - MOV(64, MDisp(RCPU, offsetof(ARM, CodeMem.Mem)), Imm32(squeezePointer(cpu9->CodeMem.Mem))); - MOV(32, MDisp(RCPU, offsetof(ARM, CodeMem.Mask)), Imm32(cpu9->CodeMem.Mask)); - cpu9->RegionCodeCycles = compileTimeCodeCycles; - if (setupRegion) - cpu9->SetupCodeMem(R15); } else { @@ -100,8 +86,11 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) cpu7->CodeRegion = codeRegion; cpu7->CodeCycles = codeCycles; - MOV(32, MDisp(RCPU, offsetof(ARM, CodeRegion)), Imm32(codeRegion)); - MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(codeCycles)); + if (Exit) + { + MOV(32, MDisp(RCPU, offsetof(ARM, CodeRegion)), Imm32(codeRegion)); + MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(codeCycles)); + } if (addr & 0x1) { @@ -133,7 +122,8 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) cpu7->CodeCycles = addr >> 15; } - MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(newPC)); + if (Exit) + MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(newPC)); if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles) ConstantCycles += cycles; else @@ -219,10 +209,23 @@ void Compiler::T_Comp_BCOND() s32 offset = (s32)(CurInstr.Instr << 24) >> 23; Comp_JumpTo(R15 + offset + 1, true); + Comp_SpecialBranchBehaviour(); + FixupBranch skipFailed = J(); SetJumpTarget(skipExecute); + + if (CurInstr.BranchFlags & branch_FollowCondTaken) + { + RegCache.PrepareExit(); + SaveCPSR(false); + + MOV(32, R(RAX), Imm32(ConstantCycles)); + ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); + RET(); + } + Comp_AddCycles_C(true); - SetJumpTarget(skipFailed); + SetJumpTarget(skipFailed); } void Compiler::T_Comp_B() diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index d8ce1aa..25c55a3 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -72,12 +72,15 @@ Compiler::Compiler() for (int i = 0; i < 3; i++) { for (int j = 0; j < 2; j++) - { MemoryFuncs9[i][j] = Gen_MemoryRoutine9(j, 8 << i); - MemoryFuncs7[i][j][0] = Gen_MemoryRoutine7(j, false, 8 << i); - MemoryFuncs7[i][j][1] = Gen_MemoryRoutine7(j, true, 8 << i); - } } + MemoryFuncs7[0][0] = (void*)NDS::ARM7Read8; + MemoryFuncs7[0][1] = (void*)NDS::ARM7Write8; + MemoryFuncs7[1][0] = (void*)NDS::ARM7Read16; + MemoryFuncs7[1][1] = (void*)NDS::ARM7Write16; + MemoryFuncs7[2][0] = (void*)NDS::ARM7Read32; + MemoryFuncs7[2][1] = (void*)NDS::ARM7Write32; + for (int i = 0; i < 2; i++) for (int j = 0; j < 2; j++) { @@ -179,12 +182,13 @@ void Compiler::LoadCPSR() MOV(32, R(RCPSR), MDisp(RCPU, offsetof(ARM, CPSR))); } -void Compiler::SaveCPSR() +void Compiler::SaveCPSR(bool flagClean) { if (CPSRDirty) { MOV(32, MDisp(RCPU, offsetof(ARM, CPSR)), R(RCPSR)); - CPSRDirty = false; + if (flagClean) + CPSRDirty = false; } } @@ -204,6 +208,9 @@ void Compiler::SaveReg(int reg, X64Reg nativeReg) // invalidates RSCRATCH and RSCRATCH3 Gen::FixupBranch Compiler::CheckCondition(u32 cond) { + // hack, ldm/stm can get really big TODO: make this better + bool ldmStm = !Thumb && + (CurInstr.Info.Kind == ARMInstrInfo::ak_LDM || CurInstr.Info.Kind == ARMInstrInfo::ak_STM); if (cond >= 0x8) { static_assert(RSCRATCH3 == ECX, "RSCRATCH has to be equal to ECX!"); @@ -213,14 +220,14 @@ Gen::FixupBranch Compiler::CheckCondition(u32 cond) SHL(32, R(RSCRATCH), R(RSCRATCH3)); TEST(32, R(RSCRATCH), Imm32(ARM::ConditionTable[cond])); - return J_CC(CC_Z); + return J_CC(CC_Z, ldmStm); } else { // could have used a LUT, but then where would be the fun? TEST(32, R(RCPSR), Imm32(1 << (28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1))))); - return J_CC(cond & 1 ? CC_NZ : CC_Z); + return J_CC(cond & 1 ? CC_NZ : CC_Z, ldmStm); } } @@ -354,25 +361,34 @@ void Compiler::Reset() SetCodePtr(ResetStart); } -CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) +void Compiler::Comp_SpecialBranchBehaviour() +{ + if (CurInstr.BranchFlags & branch_IdleBranch) + OR(32, MDisp(RCPU, offsetof(ARM, Halted)), Imm8(0x20)); + + if (CurInstr.BranchFlags & branch_FollowCondNotTaken) + { + RegCache.PrepareExit(); + SaveCPSR(false); + + MOV(32, R(RAX), Imm32(ConstantCycles)); + ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); + RET(); + } +} + +JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) { if (CodeMemSize - (GetWritableCodePtr() - ResetStart) < 1024 * 32) // guess... - InvalidateBlockCache(); + ResetBlockCache(); ConstantCycles = 0; - Thumb = cpu->CPSR & 0x20; + Thumb = thumb; Num = cpu->Num; - CodeRegion = cpu->CodeRegion; + CodeRegion = instrs[0].Addr >> 24; CurCPU = cpu; - CompiledBlock res = (CompiledBlock)GetWritableCodePtr(); - - if (!(Num == 0 - ? IsMapped<0>(instrs[0].Addr - (Thumb ? 2 : 4)) - : IsMapped<1>(instrs[0].Addr - (Thumb ? 2 : 4)))) - { - printf("Trying to compile a block in unmapped memory\n"); - } + JitBlockEntry res = (JitBlockEntry)GetWritableCodePtr(); ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); @@ -380,7 +396,6 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs LoadCPSR(); - // TODO: this is ugly as a whole, do better RegCache = RegisterCache(this, instrs, instrsCount); for (int i = 0; i < instrsCount; i++) @@ -388,21 +403,25 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs CurInstr = instrs[i]; R15 = CurInstr.Addr + (Thumb ? 4 : 8); + Exit = i == instrsCount - 1 || (CurInstr.BranchFlags & branch_FollowCondNotTaken); + CompileFunc comp = Thumb ? T_Comp[CurInstr.Info.Kind] : A_Comp[CurInstr.Info.Kind]; bool isConditional = Thumb ? CurInstr.Info.Kind == ARMInstrInfo::tk_BCOND : CurInstr.Cond() < 0xE; - if (comp == NULL || (i == instrsCount - 1 && (!CurInstr.Info.Branches() || isConditional))) + if (comp == NULL || (CurInstr.BranchFlags & branch_FollowCondTaken) || (i == instrsCount - 1 && (!CurInstr.Info.Branches() || isConditional))) { MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(R15)); - MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurInstr.CodeCycles)); - MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurInstr.Instr)); - if (comp == NULL) + { + MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurInstr.CodeCycles)); + MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurInstr.Instr)); + SaveCPSR(); + } } - + if (comp != NULL) RegCache.Prepare(Thumb, i); else @@ -410,12 +429,11 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs if (Thumb) { - u32 icode = (CurInstr.Instr >> 6) & 0x3FF; if (comp == NULL) { MOV(64, R(ABI_PARAM1), R(RCPU)); - ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]); + ABI_CallFunction(InterpretTHUMB[CurInstr.Info.Kind]); } else (this->*comp)(); @@ -434,7 +452,9 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs } } else if (cond == 0xF) + { Comp_AddCycles_C(); + } else { IrregularCycles = false; @@ -443,25 +463,36 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs if (cond < 0xE) skipExecute = CheckCondition(cond); - u32 icode = ((CurInstr.Instr >> 4) & 0xF) | ((CurInstr.Instr >> 16) & 0xFF0); if (comp == NULL) { MOV(64, R(ABI_PARAM1), R(RCPU)); - ABI_CallFunction(ARMInterpreter::ARMInstrTable[icode]); + ABI_CallFunction(InterpretARM[CurInstr.Info.Kind]); } else (this->*comp)(); + Comp_SpecialBranchBehaviour(); + if (CurInstr.Cond() < 0xE) { - if (IrregularCycles) + if (IrregularCycles || (CurInstr.BranchFlags & branch_FollowCondTaken)) { FixupBranch skipFailed = J(); SetJumpTarget(skipExecute); Comp_AddCycles_C(true); + if (CurInstr.BranchFlags & branch_FollowCondTaken) + { + RegCache.PrepareExit(); + SaveCPSR(false); + + MOV(32, R(RAX), Imm32(ConstantCycles)); + ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); + RET(); + } + SetJumpTarget(skipFailed); } else @@ -483,6 +514,12 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); RET(); + /*FILE* codeout = fopen("codeout", "a"); + fprintf(codeout, "beginning block argargarg__ %x!!!", instrs[0].Addr); + fwrite((u8*)res, GetWritableCodePtr() - (u8*)res, 1, codeout); + + fclose(codeout);*/ + return res; } @@ -528,4 +565,89 @@ void Compiler::Comp_AddCycles_CI(Gen::X64Reg i, int add) } } +void Compiler::Comp_AddCycles_CDI() +{ + if (Num == 0) + Comp_AddCycles_CD(); + else + { + IrregularCycles = true; + + s32 cycles; + + s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; + s32 numD = CurInstr.DataCycles; + + if (CurInstr.DataRegion == 0x02) // mainRAM + { + if (CodeRegion == 0x02) + cycles = numC + numD; + else + { + numC++; + cycles = std::max(numC + numD - 3, std::max(numC, numD)); + } + } + else if (CodeRegion == 0x02) + { + numD++; + cycles = std::max(numC + numD - 3, std::max(numC, numD)); + } + else + { + cycles = numC + numD + 1; + } + + printf("%x: %d %d cycles cdi (%d)\n", CurInstr.Instr, Num, CurInstr.DataCycles, cycles); + + if (!Thumb && CurInstr.Cond() < 0xE) + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + else + ConstantCycles += cycles; + } +} + +void Compiler::Comp_AddCycles_CD() +{ + u32 cycles = 0; + if (Num == 0) + { + s32 numC = (R15 & 0x2) ? 0 : CurInstr.CodeCycles; + s32 numD = CurInstr.DataCycles; + + //if (DataRegion != CodeRegion) + cycles = std::max(numC + numD - 6, std::max(numC, numD)); + + IrregularCycles = cycles != numC; + } + else + { + s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; + s32 numD = CurInstr.DataCycles; + + if (CurInstr.DataRegion == 0x02) + { + if (CodeRegion == 0x02) + cycles += numC + numD; + else + cycles += std::max(numC + numD - 3, std::max(numC, numD)); + } + else if (CodeRegion == 0x02) + { + cycles += std::max(numC + numD - 3, std::max(numC, numD)); + } + else + { + cycles += numC + numD; + } + + IrregularCycles = true; + } + + if (!Thumb && CurInstr.Cond() < 0xE) + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + else + ConstantCycles += cycles; +} + } \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index fcb2380..792ff66 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -4,6 +4,7 @@ #include "../dolphin/x64Emitter.h" #include "../ARMJIT.h" +#include "../ARMJIT_Internal.h" #include "../ARMJIT_RegisterCache.h" namespace ARMJIT @@ -16,6 +17,32 @@ const Gen::X64Reg RSCRATCH = Gen::EAX; const Gen::X64Reg RSCRATCH2 = Gen::EDX; const Gen::X64Reg RSCRATCH3 = Gen::ECX; +struct ComplexOperand +{ + ComplexOperand() + {} + + ComplexOperand(u32 imm) + : IsImm(true), Imm(imm) + {} + ComplexOperand(int reg, int op, int amount) + : IsImm(false) + { + Reg.Reg = reg; + Reg.Op = op; + Reg.Amount = amount; + } + + bool IsImm; + union + { + struct + { + int Reg, Op, Amount; + } Reg; + u32 Imm; + }; +}; class Compiler : public Gen::XEmitter { @@ -24,7 +51,7 @@ public: void Reset(); - CompiledBlock CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount); + JitBlockEntry CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); void LoadReg(int reg, Gen::X64Reg nativeReg); void SaveReg(int reg, Gen::X64Reg nativeReg); @@ -39,6 +66,8 @@ public: void Comp_AddCycles_C(bool forceNonConstant = false); void Comp_AddCycles_CI(u32 i); void Comp_AddCycles_CI(Gen::X64Reg i, int add); + void Comp_AddCycles_CDI(); + void Comp_AddCycles_CD(); enum { @@ -92,8 +121,17 @@ public: void T_Comp_BL_LONG_2(); void T_Comp_BL_Merged(); - void Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size); + enum + { + memop_Writeback = 1 << 0, + memop_Post = 1 << 1, + memop_SignExtend = 1 << 2, + memop_Store = 1 << 3, + memop_SubtractOffset = 1 << 4 + }; + void Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags); s32 Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); + void Comp_MemLoadLiteral(int size, int rd, u32 addr); void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); @@ -105,8 +143,9 @@ public: void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed); + void Comp_SpecialBranchBehaviour(); + void* Gen_MemoryRoutine9(bool store, int size); - void* Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size); void* Gen_MemoryRoutineSeq9(bool store, bool preinc); void* Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM); @@ -117,10 +156,9 @@ public: Gen::OpArg Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed); Gen::OpArg A_Comp_GetALUOp2(bool S, bool& carryUsed); - Gen::OpArg A_Comp_GetMemWBOffset(); void LoadCPSR(); - void SaveCPSR(); + void SaveCPSR(bool flagClean = true); bool FlagsNZRequired() { return CurInstr.SetFlags & 0xC; } @@ -139,10 +177,11 @@ public: u8* ResetStart; u32 CodeMemSize; + bool Exit; bool IrregularCycles; void* MemoryFuncs9[3][2]; - void* MemoryFuncs7[3][2][2]; + void* MemoryFuncs7[3][2]; void* MemoryFuncsSeq9[2][2]; void* MemoryFuncsSeq7[2][2][2]; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index bf8280d..13ca415 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -27,51 +27,7 @@ int squeezePointer(T* ptr) /* address - ABI_PARAM1 (a.k.a. ECX = RSCRATCH3 on Windows) store value - ABI_PARAM2 (a.k.a. RDX = RSCRATCH2 on Windows) - code cycles - ABI_PARAM3 */ - -#define CALC_CYCLES_9(numC, numD, scratch) \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, -6)); \ - CMP(32, R(numC), R(numD)); \ - CMOVcc(32, numD, R(numC), CC_G); \ - CMP(32, R(numD), R(scratch)); \ - CMOVcc(32, scratch, R(numD), CC_G); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); -#define CALC_CYCLES_7_DATA_MAIN_RAM(numC, numD, scratch) \ - if (codeMainRAM) \ - { \ - LEA(32, scratch, MRegSum(numD, numC)); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } \ - else \ - { \ - if (!store) \ - ADD(32, R(numC), Imm8(1)); \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, -3)); \ - CMP(32, R(numD), R(numC)); \ - CMOVcc(32, numC, R(numD), CC_G); \ - CMP(32, R(numC), R(scratch)); \ - CMOVcc(32, scratch, R(numC), CC_G); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } -#define CALC_CYCLES_7_DATA_NON_MAIN_RAM(numC, numD, scratch) \ - if (codeMainRAM) \ - { \ - if (!store) \ - ADD(32, R(numD), Imm8(1)); \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, -3)); \ - CMP(32, R(numD), R(numC)); \ - CMOVcc(32, numC, R(numD), CC_G); \ - CMP(32, R(numC), R(scratch)); \ - CMOVcc(32, scratch, R(numC), CC_G); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } \ - else \ - { \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, store ? 0 : 1)); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } - void* Compiler::Gen_MemoryRoutine9(bool store, int size) { u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 1 : 0)); @@ -86,12 +42,6 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) CMP(32, R(ABI_PARAM1), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); FixupBranch insideITCM = J_CC(CC_B); - // cycle counting! - MOV(32, R(ABI_PARAM4), R(ABI_PARAM1)); - SHR(32, R(ABI_PARAM4), Imm8(12)); - MOVZX(32, 8, ABI_PARAM4, MComplex(RCPU, ABI_PARAM4, SCALE_4, offsetof(ARMv5, MemTimings) + (size == 32 ? 2 : 1))); - CALC_CYCLES_9(ABI_PARAM3, ABI_PARAM4, RSCRATCH) - if (store) { if (size > 8) @@ -127,7 +77,6 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) } SetJumpTarget(insideDTCM); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM3)); AND(32, R(RSCRATCH), Imm32(0x3FFF & addressMask)); if (store) MOV(size, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)), R(ABI_PARAM2)); @@ -146,16 +95,22 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) RET(); SetJumpTarget(insideITCM); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM3)); MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); // free up ECX AND(32, R(ABI_PARAM3), Imm32(0x7FFF & addressMask)); if (store) { MOV(size, MComplex(RCPU, ABI_PARAM3, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM2)); - XOR(32, R(RSCRATCH), R(RSCRATCH)); - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.ARM9_ITCM)), R(RSCRATCH)); - if (size == 32) - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), R(RSCRATCH)); + + // if CodeRanges[pseudoPhysical/256].Blocks.Length > 0 we're writing into code! + static_assert(sizeof(AddressRange) == 16); + LEA(32, ABI_PARAM1, MDisp(ABI_PARAM3, ExeMemRegionOffsets[exeMem_ITCM])); + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + SHR(32, R(RSCRATCH), Imm8(8)); + SHL(32, R(RSCRATCH), Imm8(4)); + CMP(32, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); + FixupBranch noCode = J_CC(CC_Z); + JMP((u8*)InvalidateByAddr, true); + SetJumpTarget(noCode); } else { @@ -176,83 +131,6 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) return res; } -void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size) -{ - u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 1 : 0)); - AlignCode4(); - void* res = GetWritableCodePtr(); - - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(15)); - MOVZX(32, 8, ABI_PARAM4, MScaled(RSCRATCH, SCALE_4, (size == 32 ? 2 : 0) + squeezePointer(NDS::ARM7MemTimings))); - - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - AND(32, R(RSCRATCH), Imm32(0xFF000000)); - CMP(32, R(RSCRATCH), Imm32(0x02000000)); - FixupBranch outsideMainRAM = J_CC(CC_NE); - CALC_CYCLES_7_DATA_MAIN_RAM(ABI_PARAM3, ABI_PARAM4, RSCRATCH) - MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); - AND(32, R(ABI_PARAM3), Imm32((MAIN_RAM_SIZE - 1) & addressMask)); - if (store) - { - MOV(size, MDisp(ABI_PARAM3, squeezePointer(NDS::MainRAM)), R(ABI_PARAM2)); - XOR(32, R(RSCRATCH), R(RSCRATCH)); - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.MainRAM)), R(RSCRATCH)); - if (size == 32) - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.MainRAM) + 8), R(RSCRATCH)); - } - else - { - MOVZX(32, size, RSCRATCH, MDisp(ABI_PARAM3, squeezePointer(NDS::MainRAM))); - if (size == 32) - { - if (ABI_PARAM1 != ECX) - MOV(32, R(ECX), R(ABI_PARAM1)); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); - } - } - RET(); - - SetJumpTarget(outsideMainRAM); - CALC_CYCLES_7_DATA_NON_MAIN_RAM(ABI_PARAM3, ABI_PARAM4, RSCRATCH) - if (store) - { - if (size > 8) - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - switch (size) - { - case 32: JMP((u8*)NDS::ARM7Write32, true); break; - case 16: JMP((u8*)NDS::ARM7Write16, true); break; - case 8: JMP((u8*)NDS::ARM7Write8, true); break; - } - } - else - { - if (size == 32) - { - ABI_PushRegistersAndAdjustStack({ABI_PARAM1}, 8); - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - ABI_CallFunction(NDS::ARM7Read32); - ABI_PopRegistersAndAdjustStack({ECX}, 8); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); - RET(); - } - else if (size == 16) - { - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - JMP((u8*)NDS::ARM7Read16, true); - } - else - JMP((u8*)NDS::ARM7Read8, true); - } - - return res; -} - #define MEMORY_SEQ_WHILE_COND \ if (!store) \ MOV(32, currentElement, R(EAX));\ @@ -266,24 +144,13 @@ void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size) ABI_PARAM1 address ABI_PARAM2 address where registers are stored ABI_PARAM3 how many values to read/write - ABI_PARAM4 code cycles Dolphin x64CodeEmitter is my favourite assembler */ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) { - const u8* zero = GetCodePtr(); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM4)); - RET(); - void* res = (void*)GetWritableCodePtr(); - TEST(32, R(ABI_PARAM3), R(ABI_PARAM3)); - J_CC(CC_Z, zero); - - PUSH(ABI_PARAM3); - PUSH(ABI_PARAM4); // we need you later - const u8* repeat = GetCodePtr(); if (preinc) @@ -311,12 +178,7 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(12)); - MOVZX(32, 8, ABI_PARAM2, MComplex(RCPU, RSCRATCH, SCALE_4, 2 + offsetof(ARMv5, MemTimings))); - MOVZX(32, 8, RSCRATCH, MComplex(RCPU, RSCRATCH, SCALE_4, 3 + offsetof(ARMv5, MemTimings))); - - FixupBranch finishIt1 = J(); + RET(); SetJumpTarget(insideDTCM); AND(32, R(RSCRATCH), Imm32(0x3FFF & ~3)); @@ -329,9 +191,7 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM))); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), Imm32(1)); // sequential access time - MOV(32, R(ABI_PARAM2), Imm32(1)); // non sequential - FixupBranch finishIt2 = J(); + RET(); SetJumpTarget(insideITCM); MOV(32, R(RSCRATCH), R(ABI_PARAM1)); @@ -340,31 +200,23 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) { MOV(32, R(ABI_PARAM4), currentElement); MOV(32, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM4)); - XOR(32, R(ABI_PARAM4), R(ABI_PARAM4)); - MOV(64, MScaled(RSCRATCH, SCALE_4, squeezePointer(cache.ARM9_ITCM)), R(ABI_PARAM4)); - MOV(64, MScaled(RSCRATCH, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), R(ABI_PARAM4)); + + ADD(32, R(RSCRATCH), Imm32(ExeMemRegionOffsets[exeMem_ITCM])); + MOV(32, R(ABI_PARAM4), R(RSCRATCH)); + SHR(32, R(RSCRATCH), Imm8(8)); + SHL(32, R(RSCRATCH), Imm8(4)); + CMP(32, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); + FixupBranch noCode = J_CC(CC_Z); + ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); + MOV(32, R(ABI_PARAM1), R(ABI_PARAM4)); + CALL((u8*)InvalidateByAddr); + ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); + SetJumpTarget(noCode); } else MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM))); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), Imm32(1)); - MOV(32, R(ABI_PARAM2), Imm32(1)); - - SetJumpTarget(finishIt1); - SetJumpTarget(finishIt2); - - POP(ABI_PARAM4); - POP(ABI_PARAM3); - - CMP(32, R(ABI_PARAM3), Imm8(1)); - FixupBranch skipSequential = J_CC(CC_E); - SUB(32, R(ABI_PARAM3), Imm8(1)); - IMUL(32, RSCRATCH, R(ABI_PARAM3)); - ADD(32, R(ABI_PARAM2), R(RSCRATCH)); - SetJumpTarget(skipSequential); - - CALC_CYCLES_9(ABI_PARAM4, ABI_PARAM2, RSCRATCH) RET(); return res; @@ -372,18 +224,8 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM) { - const u8* zero = GetCodePtr(); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM4)); - RET(); - void* res = (void*)GetWritableCodePtr(); - TEST(32, R(ABI_PARAM3), R(ABI_PARAM3)); - J_CC(CC_Z, zero); - - PUSH(ABI_PARAM3); - PUSH(ABI_PARAM4); // we need you later - const u8* repeat = GetCodePtr(); if (preinc) @@ -403,59 +245,227 @@ void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM) ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(15)); - MOVZX(32, 8, ABI_PARAM2, MScaled(RSCRATCH, SCALE_4, 2 + squeezePointer(NDS::ARM7MemTimings))); - MOVZX(32, 8, RSCRATCH, MScaled(RSCRATCH, SCALE_4, 3 + squeezePointer(NDS::ARM7MemTimings))); + RET(); - POP(ABI_PARAM4); - POP(ABI_PARAM3); + return res; +} - // TODO: optimise this - CMP(32, R(ABI_PARAM3), Imm8(1)); - FixupBranch skipSequential = J_CC(CC_E); - SUB(32, R(ABI_PARAM3), Imm8(1)); - IMUL(32, RSCRATCH, R(ABI_PARAM3)); - ADD(32, R(ABI_PARAM2), R(RSCRATCH)); - SetJumpTarget(skipSequential); +#undef MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - AND(32, R(RSCRATCH), Imm32(0xFF000000)); - CMP(32, R(RSCRATCH), Imm32(0x02000000)); - FixupBranch outsideMainRAM = J_CC(CC_NE); - CALC_CYCLES_7_DATA_MAIN_RAM(ABI_PARAM4, ABI_PARAM2, RSCRATCH) - RET(); +void Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) +{ + u32 val; + // make sure arm7 bios is accessible + u32 tmpR15 = CurCPU->R[15]; + CurCPU->R[15] = R15; + if (size == 32) + { + CurCPU->DataRead32(addr & ~0x3, &val); + val = ROR(val, (addr & 0x3) << 3); + } + else if (size == 16) + CurCPU->DataRead16(addr & ~0x1, &val); + else + CurCPU->DataRead8(addr, &val); + CurCPU->R[15] = tmpR15; - SetJumpTarget(outsideMainRAM); - CALC_CYCLES_7_DATA_NON_MAIN_RAM(ABI_PARAM4, ABI_PARAM2, RSCRATCH) - RET(); + MOV(32, MapReg(rd), Imm32(val)); - return res; + if (Thumb || CurInstr.Cond() == 0xE) + RegCache.PutLiteral(rd, val); + + Comp_AddCycles_CDI(); } -#undef CALC_CYCLES_9 -#undef MEMORY_SEQ_WHILE_COND +void fault(u32 a, u32 b) +{ + printf("actually not static! %x %x\n", a, b); +} -void Compiler::Comp_MemAccess(OpArg rd, bool signExtend, bool store, int size) +void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags) { - IrregularCycles = true; + if (flags & memop_Store) + { + Comp_AddCycles_CD(); + } + else + { + Comp_AddCycles_CDI(); + } - if (store) - MOV(32, R(ABI_PARAM2), rd); - u32 cycles = Num - ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] - : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); - MOV(32, R(ABI_PARAM3), Imm32(cycles)); - CALL(Num == 0 - ? MemoryFuncs9[size >> 4][store] - : MemoryFuncs7[size >> 4][store][CodeRegion == 0x02]); + u32 addressMask = ~0; + if (size == 32) + addressMask = ~3; + if (size == 16) + addressMask = ~1; - if (!store) + if (rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_Post|memop_Store|memop_Writeback))) { - if (signExtend) - MOVSX(32, size, rd.GetSimpleReg(), R(RSCRATCH)); + Comp_MemLoadLiteral(size, rd, + R15 + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1)); + } + else + { + OpArg rdMapped = MapReg(rd); + OpArg rnMapped = MapReg(rn); + + bool inlinePreparation = Num == 1; + u32 constLocalROR32 = 4; + + void* memoryFunc = Num == 0 + ? MemoryFuncs9[size >> 4][!!(flags & memop_Store)] + : MemoryFuncs7[size >> 4][!!((flags & memop_Store))]; + + if ((rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn)) + { + u32 addr = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + + /*MOV(32, R(ABI_PARAM1), Imm32(CurInstr.Instr)); + MOV(32, R(ABI_PARAM1), Imm32(R15)); + MOV_sum(32, RSCRATCH, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1))); + CMP(32, R(RSCRATCH), Imm32(addr)); + FixupBranch eq = J_CC(CC_E); + CALL((void*)fault); + SetJumpTarget(eq);*/ + + NDS::MemRegion region; + region.Mem = NULL; + if (Num == 0) + { + ARMv5* cpu5 = (ARMv5*)CurCPU; + + // stupid dtcm... + if (addr >= cpu5->DTCMBase && addr < (cpu5->DTCMBase + cpu5->DTCMSize)) + { + region.Mem = cpu5->DTCM; + region.Mask = 0x3FFF; + } + else + { + NDS::ARM9GetMemRegion(addr, flags & memop_Store, ®ion); + } + } + else + NDS::ARM7GetMemRegion(addr, flags & memop_Store, ®ion); + + if (region.Mem != NULL) + { + void* ptr = ®ion.Mem[addr & addressMask & region.Mask]; + + if (flags & memop_Store) + { + MOV(size, M(ptr), MapReg(rd)); + } + else + { + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), M(ptr)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), M(ptr)); + + if (size == 32 && addr & ~0x3) + { + ROR_(32, rdMapped, Imm8((addr & 0x3) << 3)); + } + } + + return; + } + + void* specialFunc = GetFuncForAddr(CurCPU, addr, flags & memop_Store, size); + if (specialFunc) + { + memoryFunc = specialFunc; + inlinePreparation = true; + constLocalROR32 = addr & 0x3; + } + } + + X64Reg finalAddr = ABI_PARAM1; + if (flags & memop_Post) + { + MOV(32, R(ABI_PARAM1), rnMapped); + + finalAddr = rnMapped.GetSimpleReg(); + } + + if (op2.IsImm) + { + MOV_sum(32, finalAddr, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1))); + } else - MOVZX(32, size, rd.GetSimpleReg(), R(RSCRATCH)); + { + OpArg rm = MapReg(op2.Reg.Reg); + + if (!(flags & memop_SubtractOffset) && rm.IsSimpleReg() && rnMapped.IsSimpleReg() + && op2.Reg.Op == 0 && op2.Reg.Amount > 0 && op2.Reg.Amount <= 3) + { + LEA(32, finalAddr, + MComplex(rnMapped.GetSimpleReg(), rm.GetSimpleReg(), 1 << op2.Reg.Amount, 0)); + } + else + { + bool throwAway; + OpArg offset = + Comp_RegShiftImm(op2.Reg.Op, op2.Reg.Amount, rm, false, throwAway); + + if (flags & memop_SubtractOffset) + { + MOV(32, R(finalAddr), rnMapped); + if (!offset.IsZero()) + SUB(32, R(finalAddr), offset); + } + else + MOV_sum(32, finalAddr, rnMapped, offset); + } + } + + if ((flags & memop_Writeback) && !(flags & memop_Post)) + MOV(32, rnMapped, R(finalAddr)); + + if (flags & memop_Store) + MOV(32, R(ABI_PARAM2), rdMapped); + + if (!(flags & memop_Store) && inlinePreparation && constLocalROR32 == 4 && size == 32) + MOV(32, rdMapped, R(ABI_PARAM1)); + + if (inlinePreparation && size > 8) + AND(32, R(ABI_PARAM1), Imm8(addressMask)); + + CALL(memoryFunc); + + if (!(flags & memop_Store)) + { + if (inlinePreparation && size == 32) + { + if (constLocalROR32 == 4) + { + static_assert(RSCRATCH3 == ECX); + MOV(32, R(ECX), rdMapped); + AND(32, R(ECX), Imm8(3)); + SHL(32, R(ECX), Imm8(3)); + ROR_(32, R(RSCRATCH), R(ECX)); + } + else if (constLocalROR32 != 0) + ROR_(32, R(RSCRATCH), Imm8(constLocalROR32 << 3)); + } + + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + } + + if (!(flags & memop_Store) && rd == 15) + { + if (size < 32) + printf("!!! LDR <32 bit PC %08X %x\n", R15, CurInstr.Instr); + { + if (Num == 1) + AND(32, rdMapped, Imm8(0xFE)); // immediate is sign extended + Comp_JumpTo(rdMapped.GetSimpleReg()); + } + } } } @@ -475,16 +485,13 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc s32 offset = (regsCount * 4) * (decrement ? -1 : 1); - u32 cycles = Num - ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] - : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); - // we need to make sure that the stack stays aligned to 16 bytes u32 stackAlloc = ((regsCount + 1) & ~1) * 8; - MOV(32, R(ABI_PARAM4), Imm32(cycles)); if (!store) { + Comp_AddCycles_CDI(); + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); SUB(64, R(RSP), stackAlloc <= INT8_MAX ? Imm8(stackAlloc) : Imm32(stackAlloc)); MOV(64, R(ABI_PARAM2), R(RSP)); @@ -548,6 +555,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } else { + Comp_AddCycles_CD(); + if (regsCount & 1) PUSH(RSCRATCH); @@ -594,81 +603,45 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc return offset; } -OpArg Compiler::A_Comp_GetMemWBOffset() -{ - if (!(CurInstr.Instr & (1 << 25))) - { - u32 imm = CurInstr.Instr & 0xFFF; - return Imm32(imm); - } - else - { - int op = (CurInstr.Instr >> 5) & 0x3; - int amount = (CurInstr.Instr >> 7) & 0x1F; - OpArg rm = MapReg(CurInstr.A_Reg(0)); - bool carryUsed; - - return Comp_RegShiftImm(op, amount, rm, false, carryUsed); - } -} void Compiler::A_Comp_MemWB() { - OpArg rn = MapReg(CurInstr.A_Reg(16)); - OpArg rd = MapReg(CurInstr.A_Reg(12)); bool load = CurInstr.Instr & (1 << 20); bool byte = CurInstr.Instr & (1 << 22); int size = byte ? 8 : 32; + + int flags = 0; + if (!load) + flags |= memop_Store; + if (!(CurInstr.Instr & (1 << 24))) + flags |= memop_Post; + if (CurInstr.Instr & (1 << 21)) + flags |= memop_Writeback; + if (!(CurInstr.Instr & (1 << 23))) + flags |= memop_SubtractOffset; - if (CurInstr.Instr & (1 << 24)) + ComplexOperand offset; + if (!(CurInstr.Instr & (1 << 25))) { - OpArg offset = A_Comp_GetMemWBOffset(); - if (CurInstr.Instr & (1 << 23)) - MOV_sum(32, ABI_PARAM1, rn, offset); - else - { - MOV(32, R(ABI_PARAM1), rn); - SUB(32, R(ABI_PARAM1), offset); - } - - if (CurInstr.Instr & (1 << 21)) - MOV(32, rn, R(ABI_PARAM1)); + offset = ComplexOperand(CurInstr.Instr & 0xFFF); } else - MOV(32, R(ABI_PARAM1), rn); - - if (!(CurInstr.Instr & (1 << 24))) { - OpArg offset = A_Comp_GetMemWBOffset(); + int op = (CurInstr.Instr >> 5) & 0x3; + int amount = (CurInstr.Instr >> 7) & 0x1F; + int rm = CurInstr.A_Reg(0); - if (CurInstr.Instr & (1 << 23)) - ADD(32, rn, offset); - else - SUB(32, rn, offset); + offset = ComplexOperand(rm, op, amount); } - Comp_MemAccess(rd, false, !load, byte ? 8 : 32); - if (load && CurInstr.A_Reg(12) == 15) - { - if (byte) - printf("!!! LDRB PC %08X\n", R15); - else - { - if (Num == 1) - AND(32, rd, Imm8(0xFE)); // immediate is sign extended - Comp_JumpTo(rd.GetSimpleReg()); - } - } + Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, size, flags); } void Compiler::A_Comp_MemHalf() { - OpArg rn = MapReg(CurInstr.A_Reg(16)); - OpArg rd = MapReg(CurInstr.A_Reg(12)); - - OpArg offset = CurInstr.Instr & (1 << 22) - ? Imm32(CurInstr.Instr & 0xF | ((CurInstr.Instr >> 4) & 0xF0)) - : MapReg(CurInstr.A_Reg(0)); + ComplexOperand offset = CurInstr.Instr & (1 << 22) + ? ComplexOperand(CurInstr.Instr & 0xF | ((CurInstr.Instr >> 4) & 0xF0)) + : ComplexOperand(CurInstr.A_Reg(0), 0, 0); int op = (CurInstr.Instr >> 5) & 0x3; bool load = CurInstr.Instr & (1 << 20); @@ -689,49 +662,29 @@ void Compiler::A_Comp_MemHalf() if (size == 32 && Num == 1) return; // NOP - if (CurInstr.Instr & (1 << 24)) - { - if (CurInstr.Instr & (1 << 23)) - MOV_sum(32, ABI_PARAM1, rn, offset); - else - { - MOV(32, R(ABI_PARAM1), rn); - SUB(32, R(ABI_PARAM1), offset); - } - - if (CurInstr.Instr & (1 << 21)) - MOV(32, rn, R(ABI_PARAM1)); - } - else - MOV(32, R(ABI_PARAM1), rn); - + int flags = 0; + if (signExtend) + flags |= memop_SignExtend; + if (!load) + flags |= memop_Store; if (!(CurInstr.Instr & (1 << 24))) - { - if (CurInstr.Instr & (1 << 23)) - ADD(32, rn, offset); - else - SUB(32, rn, offset); - } + flags |= memop_Post; + if (!(CurInstr.Instr & (1 << 23))) + flags |= memop_SubtractOffset; + if (CurInstr.Instr & (1 << 21)) + flags |= memop_Writeback; - Comp_MemAccess(rd, signExtend, !load, size); - - if (load && CurInstr.A_Reg(12) == 15) - printf("!!! MemHalf op PC %08X\n", R15);; + Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, size, flags); } void Compiler::T_Comp_MemReg() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - OpArg ro = MapReg(CurInstr.T_Reg(6)); - int op = (CurInstr.Instr >> 10) & 0x3; bool load = op & 0x2; bool byte = op & 0x1; - MOV_sum(32, ABI_PARAM1, rb, ro); - - Comp_MemAccess(rd, false, !load, byte ? 8 : 32); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(CurInstr.T_Reg(6), 0, 0), + byte ? 8 : 32, load ? 0 : memop_Store); } void Compiler::A_Comp_LDM_STM() @@ -758,67 +711,55 @@ void Compiler::A_Comp_LDM_STM() void Compiler::T_Comp_MemImm() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - int op = (CurInstr.Instr >> 11) & 0x3; bool load = op & 0x1; bool byte = op & 0x2; u32 offset = ((CurInstr.Instr >> 6) & 0x1F) * (byte ? 1 : 4); - LEA(32, ABI_PARAM1, MDisp(rb.GetSimpleReg(), offset)); - - Comp_MemAccess(rd, false, !load, byte ? 8 : 32); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(offset), + byte ? 8 : 32, load ? 0 : memop_Store); } void Compiler::T_Comp_MemRegHalf() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - OpArg ro = MapReg(CurInstr.T_Reg(6)); - int op = (CurInstr.Instr >> 10) & 0x3; bool load = op != 0; int size = op != 1 ? 16 : 8; bool signExtend = op & 1; - MOV_sum(32, ABI_PARAM1, rb, ro); + int flags = 0; + if (signExtend) + flags |= memop_SignExtend; + if (!load) + flags |= memop_Store; - Comp_MemAccess(rd, signExtend, !load, size); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(CurInstr.T_Reg(6), 0, 0), + size, flags); } void Compiler::T_Comp_MemImmHalf() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - u32 offset = (CurInstr.Instr >> 5) & 0x3E; bool load = CurInstr.Instr & (1 << 11); - LEA(32, ABI_PARAM1, MDisp(rb.GetSimpleReg(), offset)); - - Comp_MemAccess(rd, false, !load, 16); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(offset), 16, + load ? 0 : memop_Store); } void Compiler::T_Comp_LoadPCRel() { - OpArg rd = MapReg(CurInstr.T_Reg(8)); u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2); - // hopefully this doesn't break - u32 val; CurCPU->DataRead32(addr, &val); - MOV(32, rd, Imm32(val)); + Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr); } void Compiler::T_Comp_MemSPRel() { u32 offset = (CurInstr.Instr & 0xFF) * 4; - OpArg rd = MapReg(CurInstr.T_Reg(8)); bool load = CurInstr.Instr & (1 << 11); - LEA(32, ABI_PARAM1, MDisp(MapReg(13).GetSimpleReg(), offset)); - - Comp_MemAccess(rd, false, !load, 32); + Comp_MemAccess(CurInstr.T_Reg(8), 13, ComplexOperand(offset), 32, + load ? 0 : memop_Store); } void Compiler::T_Comp_PUSH_POP() diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 9239e29..0fbde26 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -36,7 +36,7 @@ enum { A_StaticShiftSetC = 1 << 18, A_SetC = 1 << 19, - A_WriteMemory = 1 << 20, + A_WriteMem = 1 << 20 }; #define A_BIOP A_Read16 @@ -109,7 +109,7 @@ const u32 A_UMULL = A_MulFlags | A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak( const u32 A_UMLAL = A_MulFlags | A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_UMLAL); const u32 A_SMULL = A_MulFlags | A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_SMULL); const u32 A_SMLAL = A_MulFlags | A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLAL); -const u32 A_SMLAxy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLALxy); +const u32 A_SMLAxy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLAxy); const u32 A_SMLAWy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLAWy); const u32 A_SMULWy = A_Write16 | A_Read0 | A_Read8 | ak(ak_SMULWy); const u32 A_SMLALxy = A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLALxy); @@ -123,7 +123,7 @@ const u32 A_QDADD = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDADD); const u32 A_QDSUB = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDSUB); #define A_LDR A_Write12 -#define A_STR A_Read12 | A_WriteMemory +#define A_STR A_Read12 | A_WriteMem #define A_IMPLEMENT_WB_LDRSTR(x,k) \ const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \ @@ -144,7 +144,7 @@ A_IMPLEMENT_WB_LDRSTR(LDR,LDR) A_IMPLEMENT_WB_LDRSTR(LDRB,LDR) #define A_LDRD A_Write12Double -#define A_STRD A_Read12Double | A_WriteMemory +#define A_STRD A_Read12Double | A_WriteMem #define A_IMPLEMENT_HD_LDRSTR(x,k) \ const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \ @@ -159,11 +159,11 @@ A_IMPLEMENT_HD_LDRSTR(LDRH,LDR) A_IMPLEMENT_HD_LDRSTR(LDRSB,LDR) A_IMPLEMENT_HD_LDRSTR(LDRSH,LDR) -const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | A_WriteMemory | ak(ak_SWP); -const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | A_WriteMemory | ak(ak_SWPB); +const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | A_WriteMem | ak(ak_SWP); +const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | A_WriteMem | ak(ak_SWPB); const u32 A_LDM = A_Read16 | A_MemWriteback | ak(ak_LDM); -const u32 A_STM = A_Read16 | A_MemWriteback | A_WriteMemory | ak(ak_STM); +const u32 A_STM = A_Read16 | A_MemWriteback | A_WriteMem | ak(ak_STM); const u32 A_B = A_BranchAlways | ak(ak_B); const u32 A_BL = A_BranchAlways | A_Link | ak(ak_BL); @@ -181,7 +181,7 @@ const u32 A_SVC = A_BranchAlways | A_Link | ak(ak_SVC); // THUMB -#define tk(x) ((x) << 21) +#define tk(x) ((x) << 22) enum { T_Read0 = 1 << 0, @@ -210,6 +210,8 @@ enum { T_SetMaybeC = 1 << 18, T_ReadC = 1 << 19, T_SetC = 1 << 20, + + T_WriteMem = 1 << 21, }; const u32 T_LSL_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_LSL_IMM); @@ -253,30 +255,30 @@ const u32 T_ADD_SP = T_WriteR13 | T_ReadR13 | tk(tk_ADD_SP); const u32 T_LDR_PCREL = T_Write8 | tk(tk_LDR_PCREL); -const u32 T_STR_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STR_REG); -const u32 T_STRB_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STRB_REG); +const u32 T_STR_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STR_REG); +const u32 T_STRB_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STRB_REG); const u32 T_LDR_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDR_REG); const u32 T_LDRB_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRB_REG); -const u32 T_STRH_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STRH_REG); +const u32 T_STRH_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STRH_REG); const u32 T_LDRSB_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSB_REG); const u32 T_LDRH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRH_REG); const u32 T_LDRSH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSH_REG); -const u32 T_STR_IMM = T_Read0 | T_Read3 | tk(tk_STR_IMM); +const u32 T_STR_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STR_IMM); const u32 T_LDR_IMM = T_Write0 | T_Read3 | tk(tk_LDR_IMM); -const u32 T_STRB_IMM = T_Read0 | T_Read3 | tk(tk_STRB_IMM); +const u32 T_STRB_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STRB_IMM); const u32 T_LDRB_IMM = T_Write0 | T_Read3 | tk(tk_LDRB_IMM); -const u32 T_STRH_IMM = T_Read0 | T_Read3 | tk(tk_STRH_IMM); +const u32 T_STRH_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STRH_IMM); const u32 T_LDRH_IMM = T_Write0 | T_Read3 | tk(tk_LDRH_IMM); -const u32 T_STR_SPREL = T_Read8 | T_ReadR13 | tk(tk_STR_SPREL); +const u32 T_STR_SPREL = T_Read8 | T_ReadR13 | T_WriteMem | tk(tk_STR_SPREL); const u32 T_LDR_SPREL = T_Write8 | T_ReadR13 | tk(tk_LDR_SPREL); -const u32 T_PUSH = T_ReadR13 | T_WriteR13 | tk(tk_PUSH); +const u32 T_PUSH = T_ReadR13 | T_WriteR13 | T_WriteMem | tk(tk_PUSH); const u32 T_POP = T_PopPC | T_ReadR13 | T_WriteR13 | tk(tk_POP); const u32 T_LDMIA = T_Read8 | T_Write8 | tk(tk_LDMIA); -const u32 T_STMIA = T_Read8 | T_Write8 | tk(tk_STMIA); +const u32 T_STMIA = T_Read8 | T_Write8 | T_WriteMem | tk(tk_STMIA); const u32 T_BCOND = T_BranchAlways | tk(tk_BCOND); const u32 T_BX = T_BranchAlways | T_ReadHi3 | tk(tk_BX); @@ -307,7 +309,7 @@ Info Decode(bool thumb, u32 num, u32 instr) if (thumb) { u32 data = THUMBInstrTable[(instr >> 6) & 0x3FF]; - res.Kind = (data >> 21) & 0x3F; + res.Kind = (data >> 22) & 0x3F; if (data & T_Read0) res.SrcRegs |= 1 << (instr & 0x7); @@ -356,6 +358,9 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & T_SetC) res.WriteFlags |= flag_C; + if (data & T_WriteMem) + res.SpecialKind = special_WriteMem; + res.EndBlock |= res.Branches(); if (res.Kind == tk_BCOND) @@ -382,6 +387,9 @@ Info Decode(bool thumb, u32 num, u32 instr) u32 id = (cn<<8)|(cm<<4)|cpinfo; if (id == 0x704 || id == 0x782 || id == 0x750 || id == 0x751 || id == 0x752) res.EndBlock |= true; + + if (id == 0x704 || id == 0x782) + res.SpecialKind = special_WaitForInterrupt; } if (res.Kind == ak_MCR || res.Kind == ak_MRC) { @@ -449,6 +457,9 @@ Info Decode(bool thumb, u32 num, u32 instr) if ((data & A_SetC) || (data & A_StaticShiftSetC) && ((instr >> 7) & 0x1F)) res.WriteFlags |= flag_C; + if (data & A_WriteMem) + res.SpecialKind = special_WriteMem; + if ((instr >> 28) < 0xE) { // make non conditional flag sets conditional diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index d01c600..d02f168 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -226,18 +226,27 @@ enum flag_V = 1 << 0, }; +enum +{ + special_NotSpecialAtAll = 0, + special_WriteMem, + special_WaitForInterrupt +}; + struct Info { u16 DstRegs, SrcRegs; u16 Kind; + u8 SpecialKind; + u8 ReadFlags; // lower 4 bits - set always // upper 4 bits - might set flag u8 WriteFlags; bool EndBlock; - bool Branches() + bool Branches() const { return DstRegs & (1 << 15); } diff --git a/src/CP15.cpp b/src/CP15.cpp index e6e91c3..10c3b1b 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -561,9 +561,11 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x750: + ARMJIT::InvalidateAll(); ICacheInvalidateAll(); return; case 0x751: + ARMJIT::InvalidateByAddr(ARMJIT::TranslateAddr<0>(val)); ICacheInvalidateByAddr(val); return; case 0x752: @@ -813,7 +815,7 @@ void ARMv5::DataWrite8(u32 addr, u8 val) DataCycles = 1; *(u8*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } @@ -837,7 +839,7 @@ void ARMv5::DataWrite16(u32 addr, u16 val) DataCycles = 1; *(u16*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } @@ -861,8 +863,7 @@ void ARMv5::DataWrite32(u32 addr, u32 val) DataCycles = 1; *(u32*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; - ARMJIT::cache.ARM9_ITCM[((addr + 2) & 0x7FFF) >> 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } @@ -886,8 +887,7 @@ void ARMv5::DataWrite32S(u32 addr, u32 val) DataCycles += 1; *(u32*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; - ARMJIT::cache.ARM9_ITCM[((addr & 0x7FFF) >> 1) + 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } diff --git a/src/Config.cpp b/src/Config.cpp index 3cff0ed..63d61a3 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -37,6 +37,7 @@ int GL_Antialias; #ifdef JIT_ENABLED bool JIT_Enable = false; int JIT_MaxBlockSize = 12; +bool JIT_BrancheOptimisations = true; #endif ConfigEntry ConfigFile[] = @@ -50,6 +51,7 @@ ConfigEntry ConfigFile[] = #ifdef JIT_ENABLED {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, + {"JIT_BrancheOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0}, #endif {"", -1, NULL, 0, NULL, 0} diff --git a/src/Config.h b/src/Config.h index c13eae3..0fcefc3 100644 --- a/src/Config.h +++ b/src/Config.h @@ -49,6 +49,7 @@ extern int GL_Antialias; #ifdef JIT_ENABLED extern bool JIT_Enable; extern int JIT_MaxBlockSize; +extern bool JIT_BrancheOptimisations; #endif } diff --git a/src/NDS.cpp b/src/NDS.cpp index 1baa308..e9e6795 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -536,7 +536,7 @@ void Reset() RCnt = 0; #ifdef JIT_ENABLED - ARMJIT::InvalidateBlockCache(); + ARMJIT::ResetBlockCache(); #endif NDSCart::Reset(); @@ -757,7 +757,7 @@ bool DoSavestate(Savestate* file) #ifdef JIT_ENABLED if (!file->Saving) { - ARMJIT::InvalidateBlockCache(); + ARMJIT::ResetBlockCache(); } #endif @@ -1870,10 +1870,6 @@ u32 ARM9Read32(u32 addr) void ARM9Write8(u32 addr, u8 val) { -#ifdef JIT_ENABLED - ARMJIT::Invalidate16<0>(addr); -#endif - switch (addr & 0xFF000000) { case 0x02000000: @@ -1924,10 +1920,6 @@ void ARM9Write8(u32 addr, u8 val) void ARM9Write16(u32 addr, u16 val) { -#ifdef JIT_ENABLED - ARMJIT::Invalidate16<0>(addr); -#endif - switch (addr & 0xFF000000) { case 0x02000000: @@ -1994,10 +1986,6 @@ void ARM9Write16(u32 addr, u16 val) void ARM9Write32(u32 addr, u32 val) { -#ifdef JIT_ENABLED - ARMJIT::Invalidate32<0>(addr); -#endif - switch (addr & 0xFF000000) { case 0x02000000: @@ -2292,7 +2280,7 @@ u32 ARM7Read32(u32 addr) void ARM7Write8(u32 addr, u8 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate16<1>(addr); + ARMJIT::InvalidateByAddr7(addr); #endif switch (addr & 0xFF800000) @@ -2355,7 +2343,7 @@ void ARM7Write8(u32 addr, u8 val) void ARM7Write16(u32 addr, u16 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate16<1>(addr); + ARMJIT::InvalidateByAddr7(addr); #endif switch (addr & 0xFF800000) @@ -2428,7 +2416,7 @@ void ARM7Write16(u32 addr, u16 val) void ARM7Write32(u32 addr, u32 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate32<1>(addr); + ARMJIT::InvalidateByAddr7(addr); #endif switch (addr & 0xFF800000) diff --git a/src/libui_sdl/DlgEmuSettings.cpp b/src/libui_sdl/DlgEmuSettings.cpp index 09ea8eb..45e8e0c 100644 --- a/src/libui_sdl/DlgEmuSettings.cpp +++ b/src/libui_sdl/DlgEmuSettings.cpp @@ -42,6 +42,7 @@ uiCheckbox* cbDirectBoot; #ifdef JIT_ENABLED uiCheckbox* cbJITEnabled; uiEntry* enJITMaxBlockSize; +uiCheckbox* cbJITBranchOptimisations; #endif int OnCloseWindow(uiWindow* window, void* blarg) @@ -64,13 +65,15 @@ void OnOk(uiButton* btn, void* blarg) bool enableJit = uiCheckboxChecked(cbJITEnabled); char* maxBlockSizeStr = uiEntryText(enJITMaxBlockSize); long blockSize = strtol(maxBlockSizeStr, NULL, 10); + bool branchOptimisations = uiCheckboxChecked(cbJITBranchOptimisations); uiFreeText(maxBlockSizeStr); if (blockSize < 1) blockSize = 1; if (blockSize > 32) blockSize = 32; - if (enableJit != Config::JIT_Enable || blockSize != Config::JIT_MaxBlockSize) + if (enableJit != Config::JIT_Enable || blockSize != Config::JIT_MaxBlockSize || + branchOptimisations != Config::JIT_BrancheOptimisations) { if (RunningSomething && !uiMsgBoxConfirm(win, "Reset emulator", @@ -79,6 +82,7 @@ void OnOk(uiButton* btn, void* blarg) Config::JIT_Enable = enableJit; Config::JIT_MaxBlockSize = blockSize; + Config::JIT_BrancheOptimisations = uiCheckboxChecked(cbJITBranchOptimisations); restart = true; } @@ -101,9 +105,15 @@ void OnOk(uiButton* btn, void* blarg) void OnJITStateChanged(uiCheckbox* cb, void* blarg) { if (uiCheckboxChecked(cb)) + { uiControlEnable(uiControl(enJITMaxBlockSize)); + uiControlEnable(uiControl(cbJITBranchOptimisations)); + } else + { uiControlDisable(uiControl(enJITMaxBlockSize)); + uiControlDisable(uiControl(cbJITBranchOptimisations)); + } } #endif @@ -159,6 +169,14 @@ void Open() enJITMaxBlockSize = uiNewEntry(); uiBoxAppend(row, uiControl(enJITMaxBlockSize), 0); } + + { + uiBox* row = uiNewHorizontalBox(); + uiBoxAppend(in_ctrl, uiControl(row), 0); + + cbJITBranchOptimisations = uiNewCheckbox("Branch optimisations (breaks in rare cases games!)"); + uiBoxAppend(row, uiControl(cbJITBranchOptimisations), 0); + } } #endif @@ -194,6 +212,8 @@ void Open() uiEntrySetText(enJITMaxBlockSize, maxBlockSizeStr); } OnJITStateChanged(cbJITEnabled, NULL); + + uiCheckboxSetChecked(cbJITBranchOptimisations, Config::JIT_BrancheOptimisations); #endif uiControlShow(uiControl(win)); -- cgit v1.2.3 From 81f38c14be0d9ba5a3da8f67d9719ed2c47279c5 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 18 Oct 2019 13:29:17 +0200 Subject: integrate changes from ARM64 backend and more - better handle LDM/STM in reg alloc - unify Halted and IRQ in anticipation for branch inlining - literal optimisations can be disabled in gui - jit blocks follow simple returns - fix idle loop detection - break jit blocks on IRQ (fixes saving in Pokemon White) --- src/ARM.cpp | 40 ++++++++++++++++++----------- src/ARM.h | 13 +++++++--- src/ARMJIT.cpp | 50 +++++++++++++++++++++++++++++++------ src/ARMJIT_RegisterCache.h | 33 +++++++++++++++++++----- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 7 +++--- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 16 ++++++++---- src/ARM_InstrInfo.cpp | 28 +++++++++++++++++++++ src/ARM_InstrInfo.h | 2 +- src/Config.cpp | 2 ++ src/Config.h | 1 + src/NDS.cpp | 2 +- src/libui_sdl/DlgEmuSettings.cpp | 31 ++++++++++++++++++++--- src/libui_sdl/main.cpp | 2 -- 13 files changed, 179 insertions(+), 48 deletions(-) (limited to 'src/ARMJIT_RegisterCache.h') diff --git a/src/ARM.cpp b/src/ARM.cpp index 423c940..4fab60e 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -113,7 +113,7 @@ void ARM::DoSavestate(Savestate* file) file->Var32((u32*)&Cycles); //file->Var32((u32*)&CyclesToRun); - file->Var32(&Halted); + file->Var32(&StopExecution); file->VarArray(R, 16*sizeof(u32)); file->Var32(&CPSR); @@ -589,16 +589,21 @@ void ARMv5::ExecuteJIT() NDS::ARM9Timestamp += Cycles; Cycles = 0; - if (IRQ) TriggerIRQ(); - if (Halted) + if (StopExecution) { - bool idleLoop = Halted & 0x20; - Halted &= ~0x20; - if ((Halted == 1 || idleLoop) && NDS::ARM9Timestamp < NDS::ARM9Target) + if (IRQ) + TriggerIRQ(); + + if (Halted || IdleLoop) { - NDS::ARM9Timestamp = NDS::ARM9Target; + bool idleLoop = IdleLoop; + IdleLoop = 0; + if ((Halted == 1 || idleLoop) && NDS::ARM9Timestamp < NDS::ARM9Target) + { + NDS::ARM9Timestamp = NDS::ARM9Target; + } + break; } - break; } } @@ -726,16 +731,21 @@ void ARMv4::ExecuteJIT() Cycles = 0; // TODO optimize this shit!!! - if (IRQ) TriggerIRQ(); - if (Halted) + if (StopExecution) { - bool idleLoop = Halted & 0x20; - Halted &= ~0x20; - if ((Halted == 1 || idleLoop) && NDS::ARM7Timestamp < NDS::ARM7Target) + if (IRQ) + TriggerIRQ(); + + if (Halted || IdleLoop) { - NDS::ARM7Timestamp = NDS::ARM7Target; + bool idleLoop = IdleLoop; + IdleLoop = 0; + if ((Halted == 1 || idleLoop) && NDS::ARM7Timestamp < NDS::ARM7Target) + { + NDS::ARM7Timestamp = NDS::ARM7Target; + } + break; } - break; } } diff --git a/src/ARM.h b/src/ARM.h index 8a01068..e252d23 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -112,9 +112,16 @@ public: u32 Num; s32 Cycles; - u32 Halted; - - u32 IRQ; // nonzero to trigger IRQ + union + { + struct + { + u8 Halted; + u8 IRQ; // nonzero to trigger IRQ + u8 IdleLoop; + }; + u32 StopExecution; + }; u32 CodeRegion; s32 CodeCycles; diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 19a5e70..0695b85 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -16,11 +16,13 @@ #include "GPU3D.h" #include "SPU.h" #include "Wifi.h" +#include "NDSCart.h" namespace ARMJIT { #define JIT_DEBUGPRINT(msg, ...) +//#define JIT_DEBUGPRINT(msg, ...) printf(msg, ## __VA_ARGS__) Compiler* compiler; @@ -159,13 +161,17 @@ void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) } } -bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetAddr) +bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, bool hasLink, u32 lr, bool& link, + u32& linkAddr, u32& targetAddr) { if (thumb) { u32 r15 = instr.Addr + 4; cond = 0xE; + link = instr.Info.Kind == ARMInstrInfo::tk_BL_LONG; + linkAddr = instr.Addr + 4; + if (instr.Info.Kind == ARMInstrInfo::tk_BL_LONG && !(instr.Instr & (1 << 12))) { targetAddr = r15 + ((s32)((instr.Instr & 0x7FF) << 21) >> 9); @@ -185,9 +191,18 @@ bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetA targetAddr = r15 + offset; return true; } + else if (hasLink && instr.Info.Kind == ARMInstrInfo::tk_BX && instr.A_Reg(3) == 14) + { + JIT_DEBUGPRINT("returning!\n"); + targetAddr = lr; + return true; + } } else { + link = instr.Info.Kind == ARMInstrInfo::ak_BL; + linkAddr = instr.Addr + 4; + cond = instr.Cond(); if (instr.Info.Kind == ARMInstrInfo::ak_BL || instr.Info.Kind == ARMInstrInfo::ak_B) @@ -197,6 +212,12 @@ bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetA targetAddr = r15 + offset; return true; } + else if (hasLink && instr.Info.Kind == ARMInstrInfo::ak_BX && instr.A_Reg(0) == 14) + { + JIT_DEBUGPRINT("returning!\n"); + targetAddr = lr; + return true; + } } return false; } @@ -351,6 +372,8 @@ void CompileBlock(ARM* cpu) CodeRanges[pseudoPhysicalAddr / 512].TimesInvalidated); u32 lastSegmentStart = blockAddr; + u32 lr; + bool hasLink = false; do { @@ -413,6 +436,9 @@ void CompileBlock(ARM* cpu) cpu->CurInstr = instrs[i].Instr; cpu->CodeCycles = instrs[i].CodeCycles; + if (instrs[i].Info.DstRegs & (1 << 14)) + hasLink = false; + if (thumb) { InterpretTHUMB[instrs[i].Info.Kind](cpu); @@ -452,8 +478,9 @@ void CompileBlock(ARM* cpu) { bool hasBranched = cpu->R[15] != r15; - u32 cond, target; - bool staticBranch = DecodeBranch(thumb, instrs[i], cond, target); + bool link; + u32 cond, target, linkAddr; + bool staticBranch = DecodeBranch(thumb, instrs[i], cond, hasLink, lr, link, linkAddr, target); JIT_DEBUGPRINT("branch cond %x target %x (%d)\n", cond, target, hasBranched); if (staticBranch) @@ -474,18 +501,24 @@ void CompileBlock(ARM* cpu) if (cond < 0xE && target < instrs[i].Addr && target >= lastSegmentStart) { // we might have an idle loop - u32 offset = (target - blockAddr) / (thumb ? 2 : 4); - if (IsIdleLoop(instrs + offset, i - offset + 1)) + u32 backwardsOffset = (instrs[i].Addr - target) / (thumb ? 2 : 4); + if (IsIdleLoop(&instrs[i - backwardsOffset], backwardsOffset + 1)) { instrs[i].BranchFlags |= branch_IdleBranch; JIT_DEBUGPRINT("found %s idle loop %d in block %x\n", thumb ? "thumb" : "arm", cpu->Num, blockAddr); } } - else if (hasBranched && (!thumb || cond == 0xE) && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) + else if (hasBranched && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) { u32 targetPseudoPhysical = cpu->Num == 0 ? TranslateAddr<0>(target) : TranslateAddr<1>(target); + + if (link) + { + lr = linkAddr; + hasLink = true; + } r15 = target + (thumb ? 2 : 4); assert(r15 == cpu->R[15]); @@ -520,7 +553,7 @@ void CompileBlock(ARM* cpu) bool secondaryFlagReadCond = !canCompile || (instrs[i - 1].BranchFlags & (branch_FollowCondTaken | branch_FollowCondNotTaken)); if (instrs[i - 1].Info.ReadFlags != 0 || secondaryFlagReadCond) FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? instrs[i - 1].Info.ReadFlags : 0xF); - } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted); + } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted && (!cpu->IRQ || (cpu->CPSR & 0x80))); u32 restoreSlot = HashRestoreCandidate(pseudoPhysicalAddr); JitBlock* prevBlock = RestoreCandidates[restoreSlot]; @@ -713,6 +746,9 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) { if ((addr & 0xFF000000) == 0x04000000) { + if (!store && size == 32 && addr == 0x04100010 && NDS::ExMemCnt[0] & (1<<11)) + return (void*)NDSCart::ReadROMData; + /* unfortunately we can't map GPU2D this way since it's hidden inside an object diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index ed6a2b7..2222bc2 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -93,10 +93,12 @@ public: void Prepare(bool thumb, int i) { + FetchedInstr instr = Instrs[i]; + if (LoadedRegs & (1 << 15)) UnloadRegister(15); - BitSet16 invalidedLiterals(LiteralsLoaded & Instrs[i].Info.DstRegs); + BitSet16 invalidedLiterals(LiteralsLoaded & instr.Info.DstRegs); for (int reg : invalidedLiterals) UnloadLiteral(reg); @@ -108,6 +110,7 @@ public: { BitSet16 regsNeeded((Instrs[j].Info.SrcRegs & ~(1 << 15)) | Instrs[j].Info.DstRegs); futureNeeded |= regsNeeded.m_val; + regsNeeded &= BitSet16(~Instrs[j].Info.NotStrictlyNeeded); for (int reg : regsNeeded) ranking[reg]++; } @@ -117,8 +120,8 @@ public: for (int reg : neverNeededAgain) UnloadRegister(reg); - FetchedInstr Instr = Instrs[i]; - u16 necessaryRegs = (Instr.Info.SrcRegs & ~(1 << 15)) | Instr.Info.DstRegs; + u16 necessaryRegs = ((instr.Info.SrcRegs & ~(1 << 15)) | instr.Info.DstRegs) & ~instr.Info.NotStrictlyNeeded; + u16 writeRegs = instr.Info.DstRegs & ~instr.Info.NotStrictlyNeeded; BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); if (needToBeLoaded != BitSet16(0)) { @@ -143,13 +146,31 @@ public: loadedSet.m_val = LoadedRegs; } + // we don't need to load a value which is always going to be overwritten BitSet16 needValueLoaded(needToBeLoaded); - if (thumb || Instr.Cond() >= 0xE) - needValueLoaded = BitSet16(Instr.Info.SrcRegs); + if (thumb || instr.Cond() >= 0xE) + needValueLoaded = BitSet16(instr.Info.SrcRegs); for (int reg : needToBeLoaded) LoadRegister(reg, needValueLoaded[reg]); + } + { + BitSet16 loadedSet(LoadedRegs); + BitSet16 loadRegs(instr.Info.NotStrictlyNeeded & futureNeeded & ~LoadedRegs); + if (loadRegs && loadedSet.Count() < NativeRegsAvailable) + { + int left = NativeRegsAvailable - loadedSet.Count(); + for (int reg : loadRegs) + { + if (left-- == 0) + break; + + writeRegs |= (1 << reg) & instr.Info.DstRegs; + LoadRegister(reg, !(thumb || instr.Cond() >= 0xE) || (1 << reg) & instr.Info.SrcRegs); + } + } } - DirtyRegs |= Instr.Info.DstRegs & ~(1 << 15); + + DirtyRegs |= writeRegs & ~(1 << 15); } static const Reg NativeRegAllocOrder[]; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index a994d34..fd38724 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -364,7 +364,7 @@ void Compiler::Reset() void Compiler::Comp_SpecialBranchBehaviour() { if (CurInstr.BranchFlags & branch_IdleBranch) - OR(32, MDisp(RCPU, offsetof(ARM, Halted)), Imm8(0x20)); + OR(32, MDisp(RCPU, offsetof(ARM, IdleLoop)), Imm8(0x1)); if (CurInstr.BranchFlags & branch_FollowCondNotTaken) { @@ -402,6 +402,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] { CurInstr = instrs[i]; R15 = CurInstr.Addr + (Thumb ? 4 : 8); + CodeRegion = R15 >> 24; Exit = i == instrsCount - 1 || (CurInstr.BranchFlags & branch_FollowCondNotTaken); @@ -571,8 +572,6 @@ void Compiler::Comp_AddCycles_CDI() Comp_AddCycles_CD(); else { - IrregularCycles = true; - s32 cycles; s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; @@ -642,7 +641,7 @@ void Compiler::Comp_AddCycles_CD() IrregularCycles = true; } - if (!Thumb && CurInstr.Cond() < 0xE) + if (IrregularCycles && !Thumb && CurInstr.Cond() < 0xE) ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index eb01c87..3799774 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -1,5 +1,6 @@ #include "ARMJIT_Compiler.h" +#include "../Config.h" using namespace Gen; @@ -290,7 +291,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz if (size == 16) addressMask = ~1; - if (rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_SignExtend|memop_Post|memop_Store|memop_Writeback))) + if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_SignExtend|memop_Post|memop_Store|memop_Writeback))) { u32 addr = R15 + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); Comp_MemLoadLiteral(size, rd, addr); @@ -309,6 +310,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz OpArg rdMapped = MapReg(rd); OpArg rnMapped = MapReg(rn); + if (Thumb && rn == 15) + rnMapped = Imm32(R15 & ~0x2); bool inlinePreparation = Num == 1; u32 constLocalROR32 = 4; @@ -317,7 +320,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz ? MemoryFuncs9[size >> 4][!!(flags & memop_Store)] : MemoryFuncs7[size >> 4][!!((flags & memop_Store))]; - if ((rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn)) + if (Config::JIT_LiteralOptimisations && (rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn)) { u32 addr = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); @@ -749,9 +752,12 @@ void Compiler::T_Comp_MemImmHalf() void Compiler::T_Comp_LoadPCRel() { - u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2); - - Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr); + u32 offset = (CurInstr.Instr & 0xFF) << 2; + u32 addr = (R15 & ~0x2) + offset; + if (Config::JIT_LiteralOptimisations) + Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr); + else + Comp_MemAccess(CurInstr.T_Reg(8), 15, ComplexOperand(offset), 32, 0); } void Compiler::T_Comp_MemSPRel() diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 1261bbe..8f8bd35 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -365,6 +365,21 @@ Info Decode(bool thumb, u32 num, u32 instr) if (res.Kind == ARMInstrInfo::tk_LDR_PCREL) res.SpecialKind = special_LoadLiteral; + if (res.Kind == tk_LDMIA || res.Kind == tk_POP) + { + u32 set = (instr & 0xFF) & ~(res.DstRegs|res.SrcRegs); + res.NotStrictlyNeeded |= set; + res.DstRegs |= set; + } + if (res.Kind == tk_STMIA || res.Kind == tk_PUSH) + { + u32 set = (instr & 0xFF) & ~(res.DstRegs|res.SrcRegs); + if (res.Kind == tk_PUSH && instr & (1 << 8)) + set |= (1 << 14); + res.NotStrictlyNeeded |= set; + res.SrcRegs |= set; + } + res.EndBlock |= res.Branches(); if (res.Kind == tk_BCOND) @@ -466,6 +481,19 @@ Info Decode(bool thumb, u32 num, u32 instr) if ((data & A_LoadMem) && res.SrcRegs == (1 << 15)) res.SpecialKind = special_LoadLiteral; + + if (res.Kind == ak_LDM) + { + u16 set = (instr & 0xFFFF) & ~(res.SrcRegs|res.DstRegs|(1<<15)); + res.DstRegs |= set; + res.NotStrictlyNeeded |= set; + } + if (res.Kind == ak_STM) + { + u16 set = (instr & 0xFFFF) & ~(res.SrcRegs|res.DstRegs|(1<<15)); + res.SrcRegs |= set; + res.NotStrictlyNeeded |= set; + } if ((instr >> 28) < 0xE) { diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index c032a4f..2732181 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -236,7 +236,7 @@ enum struct Info { - u16 DstRegs, SrcRegs; + u16 DstRegs, SrcRegs, NotStrictlyNeeded; u16 Kind; u8 SpecialKind; diff --git a/src/Config.cpp b/src/Config.cpp index 63d61a3..eb5bfcc 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -38,6 +38,7 @@ int GL_Antialias; bool JIT_Enable = false; int JIT_MaxBlockSize = 12; bool JIT_BrancheOptimisations = true; +bool JIT_LiteralOptimisations = true; #endif ConfigEntry ConfigFile[] = @@ -52,6 +53,7 @@ ConfigEntry ConfigFile[] = {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, {"JIT_BrancheOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0}, + {"JIT_BrancheOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0}, #endif {"", -1, NULL, 0, NULL, 0} diff --git a/src/Config.h b/src/Config.h index 0fcefc3..723ab13 100644 --- a/src/Config.h +++ b/src/Config.h @@ -50,6 +50,7 @@ extern int GL_Antialias; extern bool JIT_Enable; extern int JIT_MaxBlockSize; extern bool JIT_BrancheOptimisations; +extern bool JIT_LiteralOptimisations; #endif } diff --git a/src/NDS.cpp b/src/NDS.cpp index e9e6795..141c565 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1142,7 +1142,7 @@ void UpdateIRQ(u32 cpu) if (IME[cpu] & 0x1) { - arm->IRQ = IE[cpu] & IF[cpu]; + arm->IRQ = !!(IE[cpu] & IF[cpu]); } else { diff --git a/src/libui_sdl/DlgEmuSettings.cpp b/src/libui_sdl/DlgEmuSettings.cpp index 45e8e0c..0df9c6c 100644 --- a/src/libui_sdl/DlgEmuSettings.cpp +++ b/src/libui_sdl/DlgEmuSettings.cpp @@ -43,6 +43,7 @@ uiCheckbox* cbDirectBoot; uiCheckbox* cbJITEnabled; uiEntry* enJITMaxBlockSize; uiCheckbox* cbJITBranchOptimisations; +uiCheckbox* cbJITLiteralOptimisations; #endif int OnCloseWindow(uiWindow* window, void* blarg) @@ -66,14 +67,16 @@ void OnOk(uiButton* btn, void* blarg) char* maxBlockSizeStr = uiEntryText(enJITMaxBlockSize); long blockSize = strtol(maxBlockSizeStr, NULL, 10); bool branchOptimisations = uiCheckboxChecked(cbJITBranchOptimisations); + bool literalOptimisations = uiCheckboxChecked(cbJITLiteralOptimisations); uiFreeText(maxBlockSizeStr); if (blockSize < 1) blockSize = 1; if (blockSize > 32) blockSize = 32; - if (enableJit != Config::JIT_Enable || blockSize != Config::JIT_MaxBlockSize || - branchOptimisations != Config::JIT_BrancheOptimisations) + if (enableJit != Config::JIT_Enable || blockSize != Config::JIT_MaxBlockSize + || branchOptimisations != Config::JIT_BrancheOptimisations + || literalOptimisations != Config::JIT_LiteralOptimisations) { if (RunningSomething && !uiMsgBoxConfirm(win, "Reset emulator", @@ -82,7 +85,8 @@ void OnOk(uiButton* btn, void* blarg) Config::JIT_Enable = enableJit; Config::JIT_MaxBlockSize = blockSize; - Config::JIT_BrancheOptimisations = uiCheckboxChecked(cbJITBranchOptimisations); + Config::JIT_BrancheOptimisations = branchOptimisations; + Config::JIT_LiteralOptimisations = literalOptimisations; restart = true; } @@ -108,11 +112,13 @@ void OnJITStateChanged(uiCheckbox* cb, void* blarg) { uiControlEnable(uiControl(enJITMaxBlockSize)); uiControlEnable(uiControl(cbJITBranchOptimisations)); + uiControlEnable(uiControl(cbJITLiteralOptimisations)); } else { uiControlDisable(uiControl(enJITMaxBlockSize)); uiControlDisable(uiControl(cbJITBranchOptimisations)); + uiControlDisable(uiControl(cbJITLiteralOptimisations)); } } #endif @@ -174,9 +180,25 @@ void Open() uiBox* row = uiNewHorizontalBox(); uiBoxAppend(in_ctrl, uiControl(row), 0); - cbJITBranchOptimisations = uiNewCheckbox("Branch optimisations (breaks in rare cases games!)"); + uiLabel* lbl = uiNewLabel("If you experience problems with a certain game, you can try disabling these options:"); + uiBoxAppend(row, uiControl(lbl), 0); + } + + { + uiBox* row = uiNewHorizontalBox(); + uiBoxAppend(in_ctrl, uiControl(row), 0); + + cbJITBranchOptimisations = uiNewCheckbox("Branch optimisations"); uiBoxAppend(row, uiControl(cbJITBranchOptimisations), 0); } + + { + uiBox* row = uiNewHorizontalBox(); + uiBoxAppend(in_ctrl, uiControl(row), 0); + + cbJITLiteralOptimisations = uiNewCheckbox("Literal optimisations"); + uiBoxAppend(row, uiControl(cbJITLiteralOptimisations), 0); + } } #endif @@ -214,6 +236,7 @@ void Open() OnJITStateChanged(cbJITEnabled, NULL); uiCheckboxSetChecked(cbJITBranchOptimisations, Config::JIT_BrancheOptimisations); + uiCheckboxSetChecked(cbJITLiteralOptimisations, Config::JIT_LiteralOptimisations); #endif uiControlShow(uiControl(win)); diff --git a/src/libui_sdl/main.cpp b/src/libui_sdl/main.cpp index c3db88d..0066668 100644 --- a/src/libui_sdl/main.cpp +++ b/src/libui_sdl/main.cpp @@ -2675,8 +2675,6 @@ void RecreateMainWindow(bool opengl) int main(int argc, char** argv) { - freopen("miauz.txt", "w", stdout); - srand(time(NULL)); printf("melonDS " MELONDS_VERSION "\n"); -- cgit v1.2.3 From 9b98b8816a1dc1373ce9a57aef845263456702c3 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Tue, 4 Feb 2020 17:28:51 +0100 Subject: improve nop handling and proper behaviour for LDM^ fixes dslinux --- src/ARM.cpp | 2 ++ src/ARMJIT.cpp | 13 +++++++++---- src/ARMJIT_RegisterCache.h | 2 +- src/ARMJIT_x64/ARMJIT_Branch.cpp | 6 +++--- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 1 + src/ARMJIT_x64/ARMJIT_Compiler.h | 2 ++ src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 5 +++-- src/ARM_InstrInfo.cpp | 2 ++ src/ARM_InstrInfo.h | 2 ++ 9 files changed, 25 insertions(+), 10 deletions(-) (limited to 'src/ARMJIT_RegisterCache.h') diff --git a/src/ARM.cpp b/src/ARM.cpp index 9ab9546..07cc472 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -725,6 +725,8 @@ void ARMv4::ExecuteJIT() return; } + //printf("executing armv4 at %08x\n", instrAddr); + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<1>(instrAddr); if (block) Cycles += block(); diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index c7387c9..8fd7708 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -273,6 +273,8 @@ bool IsIdleLoop(FetchedInstr* instrs, int instrsCount) typedef void (*InterpreterFunc)(ARM* cpu); +void NOP(ARM* cpu) {} + #define F(x) &ARMInterpreter::A_##x #define F_ALU(name, s) \ F(name##_REG_LSL_IMM##s), F(name##_REG_LSR_IMM##s), F(name##_REG_ASR_IMM##s), F(name##_REG_ROR_IMM##s), \ @@ -320,7 +322,8 @@ InterpreterFunc InterpretARM[ARMInstrInfo::ak_Count] = F(LDM), F(STM), F(B), F(BL), F(BLX_IMM), F(BX), F(BLX_REG), - F(UNK), F(MSR_IMM), F(MSR_REG), F(MRS), F(MCR), F(MRC), F(SVC) + F(UNK), F(MSR_IMM), F(MSR_REG), F(MRS), F(MCR), F(MRC), F(SVC), + NOP }; #undef F_ALU #undef F_MEM_WB @@ -387,8 +390,8 @@ void CompileBlock(ARM* cpu) u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; u32 nextInstrAddr[2] = {blockAddr, r15}; - JIT_DEBUGPRINT("start block %x (%x) %p %p (region invalidates %dx)\n", - blockAddr, pseudoPhysicalAddr, FastBlockAccess[pseudoPhysicalAddr / 2], + JIT_DEBUGPRINT("start block %x %08x (%x) %p %p (region invalidates %dx)\n", + blockAddr, cpu->CPSR, pseudoPhysicalAddr, FastBlockAccess[pseudoPhysicalAddr / 2], cpu->Num == 0 ? LookUpBlock<0>(blockAddr) : LookUpBlock<1>(blockAddr), CodeRanges[pseudoPhysicalAddr / 512].TimesInvalidated); @@ -473,7 +476,9 @@ void CompileBlock(ARM* cpu) else { u32 icode = ((instrs[i].Instr >> 4) & 0xF) | ((instrs[i].Instr >> 16) & 0xFF0); - assert(InterpretARM[instrs[i].Info.Kind] == ARMInterpreter::ARMInstrTable[icode] || instrs[i].Info.Kind == ARMInstrInfo::ak_MOV_REG_LSL_IMM); + assert(InterpretARM[instrs[i].Info.Kind] == ARMInterpreter::ARMInstrTable[icode] + || instrs[i].Info.Kind == ARMInstrInfo::ak_MOV_REG_LSL_IMM + || instrs[i].Info.Kind == ARMInstrInfo::ak_Nop); if (cpu->CheckCondition(instrs[i].Cond())) InterpretARM[instrs[i].Info.Kind](cpu); else diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index 2222bc2..b894657 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -152,7 +152,7 @@ public: needValueLoaded = BitSet16(instr.Info.SrcRegs); for (int reg : needToBeLoaded) LoadRegister(reg, needValueLoaded[reg]); - } + } { BitSet16 loadedSet(LoadedRegs); BitSet16 loadRegs(instr.Info.NotStrictlyNeeded & futureNeeded & ~LoadedRegs); diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index 0dedb3f..e02865d 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -134,7 +134,7 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) { IrregularCycles = true; - BitSet16 hiRegsLoaded(RegCache.DirtyRegs & 0xFF00); + BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); bool previouslyDirty = CPSRDirty; SaveCPSR(); @@ -156,12 +156,12 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) if (!restoreCPSR) XOR(32, R(ABI_PARAM3), R(ABI_PARAM3)); else - MOV(32, R(ABI_PARAM3), Imm32(restoreCPSR)); + MOV(32, R(ABI_PARAM3), Imm32(true)); // what a waste if (Num == 0) CALL((void*)&ARMv5::JumpTo); else CALL((void*)&ARMv4::JumpTo); - + if (!Thumb && restoreCPSR && CurInstr.Cond() < 0xE) { for (int reg : hiRegsLoaded) diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index fd38724..5afe842 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -308,6 +308,7 @@ const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] = F(A_Comp_BranchImm), F(A_Comp_BranchImm), F(A_Comp_BranchImm), F(A_Comp_BranchXchangeReg), F(A_Comp_BranchXchangeReg), // system stuff NULL, NULL, NULL, NULL, NULL, NULL, NULL, + F(Nop) }; const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 792ff66..2cb57dc 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -79,6 +79,8 @@ public: opInvertOp2 = 1 << 5, }; + void Nop() {} + void A_Comp_Arith(); void A_Comp_MovOp(); void A_Comp_CmpOp(); diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index b66f304..4cafc1c 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -531,7 +531,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc { if (regs[reg]) { - if (usermode && reg >= 8 && reg < 15) + if (usermode && !regs[15] && reg >= 8 && reg < 15) { if (firstUserMode) { @@ -545,7 +545,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc FixupBranch sucessfulWritten = J_CC(CC_NC); if (RegCache.Mapping[reg] != INVALID_REG) MOV(32, R(RegCache.Mapping[reg]), R(ABI_PARAM3)); - SaveReg(reg, ABI_PARAM3); + else + SaveReg(reg, ABI_PARAM3); SetJumpTarget(sucessfulWritten); } else if (RegCache.Mapping[reg] == INVALID_REG) diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 8f8bd35..08e2f0a 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -392,6 +392,8 @@ Info Decode(bool thumb, u32 num, u32 instr) u32 data = ARMInstrTable[((instr >> 4) & 0xF) | ((instr >> 16) & 0xFF0)]; if (num == 0 && (instr & 0xFE000000) == 0xFA000000) data = A_BLX_IMM; + else if ((instr >> 28) == 0xF) + data = ak(ak_Nop); if (data & A_UnkOnARM7 && num != 0) data = A_UNK; diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index 2732181..6ab4929 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -139,6 +139,8 @@ enum ak_MRC, ak_SVC, + ak_Nop, + ak_Count, tk_LSL_IMM = 0, -- cgit v1.2.3 From 266fd20ea536e1c2cd98fce49ef23dbd01f3a8cd Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 7 Feb 2020 00:08:29 +0100 Subject: fixup for aarch64 JIT --- src/ARMJIT_A64/ARMJIT_Compiler.cpp | 1 - src/ARMJIT_RegisterCache.h | 10 ++++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'src/ARMJIT_RegisterCache.h') diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.cpp b/src/ARMJIT_A64/ARMJIT_Compiler.cpp index d61cc9c..2033307 100644 --- a/src/ARMJIT_A64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_A64/ARMJIT_Compiler.cpp @@ -83,7 +83,6 @@ Compiler::Compiler() JitMemUseableSize = JitMemSize; Reset(); #else - #else u64 pageSize = sysconf(_SC_PAGE_SIZE); u8* pageAligned = (u8*)(((u64)JitMem & ~(pageSize - 1)) + pageSize); u64 alignedSize = (((u64)JitMem + sizeof(JitMem)) & ~(pageSize - 1)) - (u64)pageAligned; diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index b894657..8460825 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -18,11 +18,15 @@ public: RegisterCache() {} - RegisterCache(T* compiler, FetchedInstr instrs[], int instrsCount) + RegisterCache(T* compiler, FetchedInstr instrs[], int instrsCount, bool pcAllocatableAsSrc = false) : Compiler(compiler), Instrs(instrs), InstrsCount(instrsCount) { for (int i = 0; i < 16; i++) Mapping[i] = (Reg)-1; + + PCAllocatableAsSrc = ~(pcAllocatableAsSrc + ? 0 + : (1 << 15)); } void UnloadRegister(int reg) @@ -120,7 +124,7 @@ public: for (int reg : neverNeededAgain) UnloadRegister(reg); - u16 necessaryRegs = ((instr.Info.SrcRegs & ~(1 << 15)) | instr.Info.DstRegs) & ~instr.Info.NotStrictlyNeeded; + u16 necessaryRegs = ((instr.Info.SrcRegs & PCAllocatableAsSrc) | instr.Info.DstRegs) & ~instr.Info.NotStrictlyNeeded; u16 writeRegs = instr.Info.DstRegs & ~instr.Info.NotStrictlyNeeded; BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); if (needToBeLoaded != BitSet16(0)) @@ -184,6 +188,8 @@ public: u16 LoadedRegs = 0; u16 DirtyRegs = 0; + u16 PCAllocatableAsSrc = 0; + T* Compiler; FetchedInstr* Instrs; -- cgit v1.2.3 From 2e6e6aa75094bfb3efbae805006249c26c0c4726 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 7 Feb 2020 00:12:09 +0100 Subject: this it should work --- src/ARMJIT_RegisterCache.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'src/ARMJIT_RegisterCache.h') diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index 8460825..d4e5539 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -95,6 +95,20 @@ public: LiteralsLoaded = 0; } + BitSet32 GetPushRegs() + { + BitSet16 used; + for (int i = 0; i < InstrsCount; i++) + used |= BitSet16(Instrs[i].Info.SrcRegs | Instrs[i].Info.DstRegs); + + BitSet32 res; + u32 registersMax = std::min((int)used.Count(), NativeRegsAvailable); + for (int i = 0; i < registersMax; i++) + res |= BitSet32(1 << (int)NativeRegAllocOrder[i]); + + return res; + } + void Prepare(bool thumb, int i) { FetchedInstr instr = Instrs[i]; @@ -111,7 +125,7 @@ public: for (int j = 0; j < 16; j++) ranking[j] = 0; for (int j = i; j < InstrsCount; j++) - { + {s BitSet16 regsNeeded((Instrs[j].Info.SrcRegs & ~(1 << 15)) | Instrs[j].Info.DstRegs); futureNeeded |= regsNeeded.m_val; regsNeeded &= BitSet16(~Instrs[j].Info.NotStrictlyNeeded); -- cgit v1.2.3 From e9760c941b1e08d4908bf8697e1fa427f6ed8b85 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 7 Feb 2020 00:21:08 +0100 Subject: git played a prank on me haha very funny --- src/ARMJIT_RegisterCache.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src/ARMJIT_RegisterCache.h') diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index d4e5539..5e18e84 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -125,7 +125,7 @@ public: for (int j = 0; j < 16; j++) ranking[j] = 0; for (int j = i; j < InstrsCount; j++) - {s + { BitSet16 regsNeeded((Instrs[j].Info.SrcRegs & ~(1 << 15)) | Instrs[j].Info.DstRegs); futureNeeded |= regsNeeded.m_val; regsNeeded &= BitSet16(~Instrs[j].Info.NotStrictlyNeeded); @@ -212,4 +212,4 @@ public: } -#endif \ No newline at end of file +#endif -- cgit v1.2.3 From 0f53a34551d60964345debb1766f81ca4686eb17 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 9 May 2020 00:45:05 +0200 Subject: rewrite JIT memory emulation --- src/ARM.cpp | 10 +- src/ARM.h | 24 +- src/ARMJIT.cpp | 905 +++++++++++++++++++++++++--------- src/ARMJIT.h | 65 ++- src/ARMJIT_A64/ARMJIT_Compiler.cpp | 4 +- src/ARMJIT_Internal.h | 68 ++- src/ARMJIT_RegisterCache.h | 18 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 43 +- src/ARMJIT_x64/ARMJIT_Compiler.h | 34 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 935 +++++++++++++++++++----------------- src/ARM_InstrInfo.cpp | 16 +- src/CP15.cpp | 44 +- src/NDS.cpp | 105 +++- src/NDS.h | 8 + 14 files changed, 1465 insertions(+), 814 deletions(-) (limited to 'src/ARMJIT_RegisterCache.h') diff --git a/src/ARM.cpp b/src/ARM.cpp index 95d2b8b..205332d 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -579,7 +579,8 @@ void ARMv5::ExecuteJIT() while (NDS::ARM9Timestamp < NDS::ARM9Target) { u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); - if (!ARMJIT::IsMapped<0>(instrAddr)) + u32 translatedAddr = ARMJIT::TranslateAddr9(instrAddr); + if (!translatedAddr) { NDS::ARM9Timestamp = NDS::ARM9Target; printf("ARMv5 PC in non executable region %08X\n", R[15]); @@ -589,7 +590,7 @@ void ARMv5::ExecuteJIT() // hack so Cycles <= 0 becomes Cycles < 0 Cycles = NDS::ARM9Target - NDS::ARM9Timestamp - 1; - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry(ARMJIT::TranslateAddr<0>(instrAddr)); + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry<0>(translatedAddr); if (block) ARM_Dispatch(this, block); else @@ -722,7 +723,8 @@ void ARMv4::ExecuteJIT() while (NDS::ARM7Timestamp < NDS::ARM7Target) { u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); - if (!ARMJIT::IsMapped<1>(instrAddr)) + u32 translatedAddr = ARMJIT::TranslateAddr7(instrAddr); + if (!translatedAddr) { NDS::ARM7Timestamp = NDS::ARM7Target; printf("ARMv4 PC in non executable region %08X\n", R[15]); @@ -731,7 +733,7 @@ void ARMv4::ExecuteJIT() Cycles = NDS::ARM7Target - NDS::ARM7Timestamp - 1; - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry(ARMJIT::TranslateAddr<1>(instrAddr)); + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry<1>(translatedAddr); if (block) ARM_Dispatch(this, block); else diff --git a/src/ARM.h b/src/ARM.h index 4877956..f64b7fe 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -308,7 +308,7 @@ public: void DataRead8(u32 addr, u32* val) { *val = NDS::ARM7Read8(addr); - DataRegion = addr >> 20; + DataRegion = addr; DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } @@ -317,7 +317,7 @@ public: addr &= ~1; *val = NDS::ARM7Read16(addr); - DataRegion = addr >> 20; + DataRegion = addr; DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } @@ -326,7 +326,7 @@ public: addr &= ~3; *val = NDS::ARM7Read32(addr); - DataRegion = addr >> 20; + DataRegion = addr; DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } @@ -341,7 +341,7 @@ public: void DataWrite8(u32 addr, u8 val) { NDS::ARM7Write8(addr, val); - DataRegion = addr >> 20; + DataRegion = addr; DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } @@ -350,7 +350,7 @@ public: addr &= ~1; NDS::ARM7Write16(addr, val); - DataRegion = addr >> 20; + DataRegion = addr; DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } @@ -359,7 +359,7 @@ public: addr &= ~3; NDS::ARM7Write32(addr, val); - DataRegion = addr >> 20; + DataRegion = addr; DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } @@ -390,7 +390,7 @@ public: s32 numC = NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2]; s32 numD = DataCycles; - if ((DataRegion >> 4) == 0x02) // mainRAM + if ((DataRegion >> 24) == 0x02) // mainRAM { if (CodeRegion == 0x02) Cycles -= numC + numD; @@ -417,7 +417,7 @@ public: s32 numC = NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2]; s32 numD = DataCycles; - if ((DataRegion >> 4) == 0x02) + if ((DataRegion >> 24) == 0x02) { if (CodeRegion == 0x02) Cycles -= numC + numD; @@ -443,4 +443,12 @@ void T_UNK(ARM* cpu); } +namespace NDS +{ + +extern ARMv5* ARM9; +extern ARMv4* ARM7; + +} + #endif // ARM_H diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 46f71f1..9602aed 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -23,6 +23,7 @@ #include "ARMInterpreter_Branch.h" #include "ARMInterpreter.h" +#include "GPU.h" #include "GPU3D.h" #include "SPU.h" #include "Wifi.h" @@ -34,9 +35,10 @@ namespace ARMJIT #define JIT_DEBUGPRINT(msg, ...) //#define JIT_DEBUGPRINT(msg, ...) printf(msg, ## __VA_ARGS__) -Compiler* compiler; +Compiler* JITCompiler; -const u32 ExeMemRegionSizes[] = { +const u32 ExeMemRegionSizes[] = +{ 0x8000, // Unmapped Region (dummy) 0x8000, // ITCM 4*1024*1024, // Main RAM @@ -48,7 +50,8 @@ const u32 ExeMemRegionSizes[] = { 0x40000 // ARM7 WVRAM }; -const u32 ExeMemRegionOffsets[] = { +const u32 ExeMemRegionOffsets[] = +{ 0, 0x8000, 0x10000, @@ -61,65 +64,391 @@ const u32 ExeMemRegionOffsets[] = { 0x518000, }; -#define DUP2(x) x, x - -const static ExeMemKind JIT_MEM[2][32] = { - //arm9 - { - /* 0X*/ DUP2(exeMem_ITCM), - /* 1X*/ DUP2(exeMem_ITCM), // mirror - /* 2X*/ DUP2(exeMem_MainRAM), - /* 3X*/ DUP2(exeMem_SWRAM), - /* 4X*/ DUP2(exeMem_Unmapped), - /* 5X*/ DUP2(exeMem_Unmapped), - /* 6X*/ exeMem_Unmapped, - exeMem_LCDC, // Plain ARM9-CPU Access (LCDC mode) (max 656KB) - /* 7X*/ DUP2(exeMem_Unmapped), - /* 8X*/ DUP2(exeMem_Unmapped), - /* 9X*/ DUP2(exeMem_Unmapped), - /* AX*/ DUP2(exeMem_Unmapped), - /* BX*/ DUP2(exeMem_Unmapped), - /* CX*/ DUP2(exeMem_Unmapped), - /* DX*/ DUP2(exeMem_Unmapped), - /* EX*/ DUP2(exeMem_Unmapped), - /* FX*/ DUP2(exeMem_ARM9_BIOS) - }, - //arm7 - { - /* 0X*/ DUP2(exeMem_ARM7_BIOS), - /* 1X*/ DUP2(exeMem_Unmapped), - /* 2X*/ DUP2(exeMem_MainRAM), - /* 3X*/ exeMem_SWRAM, - exeMem_ARM7_WRAM, - /* 4X*/ DUP2(exeMem_Unmapped), - /* 5X*/ DUP2(exeMem_Unmapped), - /* 6X*/ DUP2(exeMem_ARM7_WVRAM), /* contrary to Gbatek, melonDS and itself, - DeSmuME doesn't mirror the 64 MB region at 0x6800000 */ - /* 7X*/ DUP2(exeMem_Unmapped), - /* 8X*/ DUP2(exeMem_Unmapped), - /* 9X*/ DUP2(exeMem_Unmapped), - /* AX*/ DUP2(exeMem_Unmapped), - /* BX*/ DUP2(exeMem_Unmapped), - /* CX*/ DUP2(exeMem_Unmapped), - /* DX*/ DUP2(exeMem_Unmapped), - /* EX*/ DUP2(exeMem_Unmapped), - /* FX*/ DUP2(exeMem_Unmapped) - } -}; - -#undef DUP2 - /* translates address to pseudo physical address - more compact, eliminates mirroring, everything comes in a row - we only need one translation table */ -u32 AddrTranslate9[0x2000]; -u32 AddrTranslate7[0x4000]; + +u32 TranslateAddr9(u32 addr) +{ + switch (ClassifyAddress9(addr)) + { + case memregion_MainRAM: return ExeMemRegionOffsets[exeMem_MainRAM] + (addr & (MAIN_RAM_SIZE - 1)); + case memregion_SWRAM9: + if (NDS::SWRAM_ARM9) + return ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM9 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM9Mask); + else + return 0; + case memregion_ITCM: return ExeMemRegionOffsets[exeMem_ITCM] + (addr & 0x7FFF); + case memregion_VRAM: return (addr >= 0x6800000 && addr < 0x68A4000) ? ExeMemRegionOffsets[exeMem_LCDC] + (addr - 0x6800000) : 0; + case memregion_BIOS9: return ExeMemRegionOffsets[exeMem_ARM9_BIOS] + (addr & 0xFFF); + default: return 0; + } +} + +u32 TranslateAddr7(u32 addr) +{ + switch (ClassifyAddress7(addr)) + { + case memregion_MainRAM: return ExeMemRegionOffsets[exeMem_MainRAM] + (addr & (MAIN_RAM_SIZE - 1)); + case memregion_SWRAM7: + if (NDS::SWRAM_ARM7) + return ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM7 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM7Mask); + else + return 0; + case memregion_BIOS7: return ExeMemRegionOffsets[exeMem_ARM7_BIOS] + addr; + case memregion_WRAM7: return ExeMemRegionOffsets[exeMem_ARM7_WRAM] + (addr & 0xFFFF); + case memregion_VWRAM: return ExeMemRegionOffsets[exeMem_ARM7_WVRAM] + (addr & 0x1FFFF); + default: return 0; + } +} AddressRange CodeRanges[ExeMemSpaceSize / 512]; -std::unordered_map JitBlocks; +TinyVector InvalidLiterals; + +std::unordered_map JitBlocks9; +std::unordered_map JitBlocks7; + +u8 MemoryStatus9[0x800000]; +u8 MemoryStatus7[0x800000]; + +int ClassifyAddress9(u32 addr) +{ + if (addr < NDS::ARM9->ITCMSize) + return memregion_ITCM; + else if (addr >= NDS::ARM9->DTCMBase && addr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) + return memregion_DTCM; + else if ((addr & 0xFFFFF000) == 0xFFFF0000) + return memregion_BIOS9; + else + { + switch (addr & 0xFF000000) + { + case 0x02000000: + return memregion_MainRAM; + case 0x03000000: + return memregion_SWRAM9; + case 0x04000000: + return memregion_IO9; + case 0x06000000: + return memregion_VRAM; + } + } + return memregion_Other; +} + +int ClassifyAddress7(u32 addr) +{ + if (addr < 0x00004000) + return memregion_BIOS7; + else + { + switch (addr & 0xFF800000) + { + case 0x02000000: + case 0x02800000: + return memregion_MainRAM; + case 0x03000000: + if (NDS::SWRAM_ARM7) + return memregion_SWRAM7; + else + return memregion_WRAM7; + case 0x03800000: + return memregion_WRAM7; + case 0x04000000: + return memregion_IO7; + case 0x04800000: + return memregion_Wifi; + case 0x06000000: + case 0x06800000: + return memregion_VWRAM; + } + } + return memregion_Other; +} + +void UpdateMemoryStatus9(u32 start, u32 end) +{ + start >>= 12; + end >>= 12; + + if (end == 0xFFFFF) + end++; + + for (u32 i = start; i < end; i++) + { + u32 addr = i << 12; + + int region = ClassifyAddress9(addr); + u32 pseudoPhyisical = TranslateAddr9(addr); + + for (u32 j = 0; j < 8; j++) + { + u8 val = region; + if (CodeRanges[(pseudoPhyisical + (j << 12)) / 512].Blocks.Length) + val |= 0x80; + MemoryStatus9[i * 8 + j] = val; + } + } +} + +void UpdateMemoryStatus7(u32 start, u32 end) +{ + start >>= 12; + end >>= 12; + + if (end == 0xFFFFF) + end++; + + for (u32 i = start; i < end; i++) + { + u32 addr = i << 12; + + int region = ClassifyAddress7(addr); + u32 pseudoPhyisical = TranslateAddr7(addr); + + for (u32 j = 0; j < 8; j++) + { + u8 val = region; + if (CodeRanges[(pseudoPhyisical + (j << 12)) / 512].Blocks.Length) + val |= 0x80; + MemoryStatus7[i * 8 + j] = val; + } + } +} + +void UpdateRegionByPseudoPhyiscal(u32 addr, bool invalidate) +{ + for (u32 i = 1; i < exeMem_Count; i++) + { + if (addr >= ExeMemRegionOffsets[i] && addr < ExeMemRegionOffsets[i] + ExeMemRegionSizes[i]) + { + for (u32 num = 0; num < 2; num++) + { + u32 physSize = ExeMemRegionSizes[i]; + u32 mapSize = 0; + u32 mapStart = 0; + switch (i) + { + case exeMem_ITCM: + if (num == 0) + mapStart = 0; mapSize = NDS::ARM9->ITCMSize; + break; + case exeMem_MainRAM: mapStart = 0x2000000; mapSize = 0x1000000; break; + case exeMem_SWRAM: + if (num == 0) + { + if (NDS::SWRAM_ARM9) + mapStart = 0x3000000, mapSize = 0x1000000; + else + mapStart = mapSize = 0; + } + else + { + if (NDS::SWRAM_ARM7) + mapStart = 0x3000000, mapSize = 0x800000; + else + mapStart = mapSize = 0; + } + break; + case exeMem_LCDC: + if (num == 0) + mapStart = 0x6800000, mapSize = 0xA4000; + break; + case exeMem_ARM9_BIOS: + if (num == 0) + mapStart = 0xFFFF0000, mapSize = 0x10000; + break; + case exeMem_ARM7_BIOS: + if (num == 1) + mapStart = 0; mapSize = 0x4000; + break; + case exeMem_ARM7_WRAM: + if (num == 1) + { + if (NDS::SWRAM_ARM7) + mapStart = 0x3800000, mapSize = 0x800000; + else + mapStart = 0x3000000, mapSize = 0x1000000; + } + break; + case exeMem_ARM7_WVRAM: + if (num == 1) + mapStart = 0x6000000, mapSize = 0x1000000; + break; + } + + for (u32 j = 0; j < mapSize / physSize; j++) + { + u32 virtAddr = mapStart + physSize * j + (addr - ExeMemRegionOffsets[i]); + if (num == 0 + && virtAddr >= NDS::ARM9->DTCMBase && virtAddr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) + continue; + if (invalidate) + { + if (num == 0) + MemoryStatus9[virtAddr / 512] |= 0x80; + else + MemoryStatus7[virtAddr / 512] |= 0x80; + } + else + { + if (num == 0) + MemoryStatus9[virtAddr / 512] &= ~0x80; + else + MemoryStatus7[virtAddr / 512] &= ~0x80; + } + } + + } + return; + } + } + + assert(false); +} + +template +T SlowRead9(ARMv5* cpu, u32 addr) +{ + u32 offset = addr & 0x3; + addr &= ~(sizeof(T) - 1); + + T val; + if (addr < cpu->ITCMSize) + val = *(T*)&cpu->ITCM[addr & 0x7FFF]; + else if (addr >= cpu->DTCMBase && addr < (cpu->DTCMBase + cpu->DTCMSize)) + val = *(T*)&cpu->DTCM[(addr - cpu->DTCMBase) & 0x3FFF]; + else if (std::is_same::value) + val = NDS::ARM9Read32(addr); + else if (std::is_same::value) + val = NDS::ARM9Read16(addr); + else + val = NDS::ARM9Read8(addr); + + if (std::is_same::value) + return ROR(val, offset << 3); + else + return val; +} + +template +void SlowWrite9(ARMv5* cpu, u32 addr, T val) +{ + addr &= ~(sizeof(T) - 1); + + if (addr < cpu->ITCMSize) + { + InvalidateITCMIfNecessary(addr); + *(T*)&cpu->ITCM[addr & 0x7FFF] = val; + } + else if (addr >= cpu->DTCMBase && addr < (cpu->DTCMBase + cpu->DTCMSize)) + { + *(T*)&cpu->DTCM[(addr - cpu->DTCMBase) & 0x3FFF] = val; + } + else if (std::is_same::value) + { + NDS::ARM9Write32(addr, val); + } + else if (std::is_same::value) + { + NDS::ARM9Write16(addr, val); + } + else + { + NDS::ARM9Write8(addr, val); + } +} + +template void SlowWrite9(ARMv5*, u32, u32); +template void SlowWrite9(ARMv5*, u32, u16); +template void SlowWrite9(ARMv5*, u32, u8); + +template u32 SlowRead9(ARMv5*, u32); +template u16 SlowRead9(ARMv5*, u32); +template u8 SlowRead9(ARMv5*, u32); + +template +T SlowRead7(u32 addr) +{ + u32 offset = addr & 0x3; + addr &= ~(sizeof(T) - 1); + + T val; + if (std::is_same::value) + val = NDS::ARM7Read32(addr); + else if (std::is_same::value) + val = NDS::ARM7Read16(addr); + else + val = NDS::ARM7Read8(addr); + + if (std::is_same::value) + return ROR(val, offset << 3); + else + return val; +} + +template +void SlowWrite7(u32 addr, T val) +{ + addr &= ~(sizeof(T) - 1); + + if (std::is_same::value) + NDS::ARM7Write32(addr, val); + else if (std::is_same::value) + NDS::ARM7Write16(addr, val); + else + NDS::ARM7Write8(addr, val); +} + +template +void SlowBlockTransfer9(u32 addr, u64* data, u32 num, ARMv5* cpu) +{ + addr &= ~0x3; + for (int i = 0; i < num; i++) + { + addr += PreInc * 4; + if (Write) + SlowWrite9(cpu, addr, data[i]); + else + data[i] = SlowRead9(cpu, addr); + addr += !PreInc * 4; + } +} + +template +void SlowBlockTransfer7(u32 addr, u64* data, u32 num) +{ + addr &= ~0x3; + for (int i = 0; i < num; i++) + { + addr += PreInc * 4; + if (Write) + SlowWrite7(addr, data[i]); + else + data[i] = SlowRead7(addr); + addr += !PreInc * 4; + } +} + +template void SlowWrite7(u32, u32); +template void SlowWrite7(u32, u16); +template void SlowWrite7(u32, u8); + +template u32 SlowRead7(u32); +template u16 SlowRead7(u32); +template u8 SlowRead7(u32); + +template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); +template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); +template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); +template void SlowBlockTransfer9(u32, u64*, u32, ARMv5*); +template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); +template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); +template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); +template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); template struct UnreliableHashTable @@ -211,31 +540,25 @@ struct UnreliableHashTable }; UnreliableHashTable RestoreCandidates; -UnreliableHashTable FastBlockLookUp; +UnreliableHashTable FastBlockLookUp9; +UnreliableHashTable FastBlockLookUp7; void Init() { - for (int i = 0; i < 0x2000; i++) - { - ExeMemKind kind = JIT_MEM[0][i >> 8]; - u32 size = ExeMemRegionSizes[kind]; - - AddrTranslate9[i] = ExeMemRegionOffsets[kind] + ((i << 15) & (size - 1)); - } - for (int i = 0; i < 0x4000; i++) - { - ExeMemKind kind = JIT_MEM[1][i >> 9]; - u32 size = ExeMemRegionSizes[kind]; - - AddrTranslate7[i] = ExeMemRegionOffsets[kind] + ((i << 14) & (size - 1)); - } - - compiler = new Compiler(); + JITCompiler = new Compiler(); } void DeInit() { - delete compiler; + delete JITCompiler; +} + +void Reset() +{ + ResetBlockCache(); + + UpdateMemoryStatus9(0, 0xFFFFFFFF); + UpdateMemoryStatus7(0, 0xFFFFFFFF); } void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) @@ -256,25 +579,31 @@ void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) } } -bool DecodeLiteral(const FetchedInstr& instr, u32& addr) +bool DecodeLiteral(bool thumb, const FetchedInstr& instr, u32& addr) { - switch (instr.Info.Kind) + if (!thumb) { - case ARMInstrInfo::ak_STR_IMM: - case ARMInstrInfo::ak_STRB_IMM: - addr = (instr.Addr + 8) + ((instr.Instr & 0xFFF) * (instr.Instr & (1 << 23) ? 1 : -1)); - return true; - case ARMInstrInfo::ak_STRD_IMM: - case ARMInstrInfo::ak_STRH_IMM: - addr = (instr.Addr + 8) + (((instr.Instr & 0xF00) >> 4 | (instr.Instr & 0xF)) * (instr.Instr & (1 << 23) ? 1 : -1)); - return true; - case ARMInstrInfo::ak_STM: // I honestly hope noone was ever crazy enough to do stm pc, {whatever} - addr = instr.Addr + 8; + switch (instr.Info.Kind) + { + case ARMInstrInfo::ak_LDR_IMM: + case ARMInstrInfo::ak_LDRB_IMM: + addr = (instr.Addr + 8) + ((instr.Instr & 0xFFF) * (instr.Instr & (1 << 23) ? 1 : -1)); + return true; + case ARMInstrInfo::ak_LDRH_IMM: + addr = (instr.Addr + 8) + (((instr.Instr & 0xF00) >> 4 | (instr.Instr & 0xF)) * (instr.Instr & (1 << 23) ? 1 : -1)); + return true; + default: + break; + } + } + else if (instr.Info.Kind == ARMInstrInfo::tk_LDR_PCREL) + { + addr = ((instr.Addr + 4) & ~0x2) + ((instr.Instr & 0xFF) << 2); return true; - default: - JIT_DEBUGPRINT("Literal %08x %x not recognised\n", instr.Instr, instr.Addr); - return false; } + + JIT_DEBUGPRINT("Literal %08x %x not recognised %d\n", instr.Instr, instr.Addr, instr.Info.Kind); + return false; } bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, bool hasLink, u32 lr, bool& link, @@ -453,6 +782,8 @@ InterpreterFunc InterpretTHUMB[ARMInstrInfo::tk_Count] = }; #undef F + +extern u32 literalsPerBlock; void CompileBlock(ARM* cpu) { bool thumb = cpu->CPSR & 0x20; @@ -463,31 +794,33 @@ void CompileBlock(ARM* cpu) Config::JIT_MaxBlockSize = 32; u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4); - if (!(cpu->Num == 0 - ? IsMapped<0>(blockAddr) - : IsMapped<1>(blockAddr))) + u32 pseudoPhysicalAddr = cpu->Num == 0 + ? TranslateAddr9(blockAddr) + : TranslateAddr7(blockAddr); + if (pseudoPhysicalAddr < ExeMemRegionSizes[exeMem_Unmapped]) { printf("Trying to compile a block in unmapped memory: %x\n", blockAddr); } - u32 pseudoPhysicalAddr = cpu->Num == 0 - ? TranslateAddr<0>(blockAddr) - : TranslateAddr<1>(blockAddr); - FetchedInstr instrs[Config::JIT_MaxBlockSize]; int i = 0; u32 r15 = cpu->R[15]; - u32 addresseRanges[32] = {}; + u32 addressRanges[Config::JIT_MaxBlockSize]; + u32 addressMasks[Config::JIT_MaxBlockSize] = {0}; u32 numAddressRanges = 0; + u32 numLiterals = 0; + u32 literalLoadAddrs[Config::JIT_MaxBlockSize]; + // they are going to be hashed + u32 literalValues[Config::JIT_MaxBlockSize]; + u32 instrValues[Config::JIT_MaxBlockSize]; + cpu->FillPipeline(); u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; u32 nextInstrAddr[2] = {blockAddr, r15}; - JIT_DEBUGPRINT("start block %x %08x (%x) (region invalidates %dx)\n", - blockAddr, cpu->CPSR, pseudoPhysicalAddr, - CodeRanges[pseudoPhysicalAddr / 512].TimesInvalidated); + JIT_DEBUGPRINT("start block %x %08x (%x)\n", blockAddr, cpu->CPSR, pseudoPhysicalAddr); u32 lastSegmentStart = blockAddr; u32 lr; @@ -507,23 +840,29 @@ void CompileBlock(ARM* cpu) nextInstrAddr[1] = r15; JIT_DEBUGPRINT("instr %08x %x\n", instrs[i].Instr & (thumb ? 0xFFFF : ~0), instrs[i].Addr); - u32 translatedAddr = (cpu->Num == 0 - ? TranslateAddr<0>(instrs[i].Addr) - : TranslateAddr<1>(instrs[i].Addr)) & ~0x1FF; - if (i == 0 || translatedAddr != addresseRanges[numAddressRanges - 1]) + instrValues[i] = instrs[i].Instr; + + u32 translatedAddr = cpu->Num == 0 + ? TranslateAddr9(instrs[i].Addr) + : TranslateAddr7(instrs[i].Addr); + u32 translatedAddrRounded = translatedAddr & ~0x1FF; + if (i == 0 || translatedAddrRounded != addressRanges[numAddressRanges - 1]) { bool returning = false; for (int j = 0; j < numAddressRanges; j++) { - if (addresseRanges[j] == translatedAddr) + if (addressRanges[j] == translatedAddrRounded) { + std::swap(addressRanges[j], addressRanges[numAddressRanges - 1]); + std::swap(addressMasks[j], addressMasks[numAddressRanges - 1]); returning = true; break; } } if (!returning) - addresseRanges[numAddressRanges++] = translatedAddr; + addressRanges[numAddressRanges++] = translatedAddrRounded; } + addressMasks[numAddressRanges - 1] |= 1 << ((translatedAddr & 0x1FF) / 16); if (cpu->Num == 0) { @@ -572,7 +911,8 @@ void CompileBlock(ARM* cpu) u32 icode = ((instrs[i].Instr >> 4) & 0xF) | ((instrs[i].Instr >> 16) & 0xFF0); assert(InterpretARM[instrs[i].Info.Kind] == ARMInterpreter::ARMInstrTable[icode] || instrs[i].Info.Kind == ARMInstrInfo::ak_MOV_REG_LSL_IMM - || instrs[i].Info.Kind == ARMInstrInfo::ak_Nop); + || instrs[i].Info.Kind == ARMInstrInfo::ak_Nop + || instrs[i].Info.Kind == ARMInstrInfo::ak_UNK); if (cpu->CheckCondition(instrs[i].Cond())) InterpretARM[instrs[i].Info.Kind](cpu); else @@ -583,21 +923,26 @@ void CompileBlock(ARM* cpu) instrs[i].DataCycles = cpu->DataCycles; instrs[i].DataRegion = cpu->DataRegion; - if (instrs[i].Info.SpecialKind == ARMInstrInfo::special_WriteMem - && instrs[i].Info.SrcRegs == (1 << 15) - && instrs[i].Info.DstRegs == 0) + u32 literalAddr; + if (Config::JIT_LiteralOptimisations + && instrs[i].Info.SpecialKind == ARMInstrInfo::special_LoadLiteral + && DecodeLiteral(thumb, instrs[i], literalAddr)) { - assert (!thumb); - - u32 addr; - if (DecodeLiteral(instrs[i], addr)) - { - JIT_DEBUGPRINT("pc relative write detected\n"); - u32 translatedAddr = cpu->Num == 0 ? TranslateAddr<0>(addr) : TranslateAddr<1>(addr); - - ARMJIT::InvalidateByAddr(translatedAddr, false); - CodeRanges[translatedAddr / 512].InvalidLiterals |= (1 << ((translatedAddr & 0x1FF) / 16)); - } + u32 translatedAddr = cpu->Num == 0 + ? TranslateAddr9(literalAddr) + : TranslateAddr7(literalAddr); + u32 translatedAddrRounded = translatedAddr & ~0x1FF; + + u32 j = 0; + for (; j < numAddressRanges; j++) + if (addressRanges[j] == translatedAddrRounded) + break; + if (j == numAddressRanges) + addressRanges[numAddressRanges++] = translatedAddrRounded; + addressMasks[j] |= 1 << ((translatedAddr & 0x1FF) / 16); + JIT_DEBUGPRINT("literal loading %08x %08x %08x %08x\n", literalAddr, translatedAddr, addressMasks[j], addressRanges[j]); + cpu->DataRead32(literalAddr, &literalValues[numLiterals]); + literalLoadAddrs[numLiterals++] = translatedAddr; } if (thumb && instrs[i].Info.Kind == ARMInstrInfo::tk_BL_LONG_2 && i > 0 @@ -650,8 +995,8 @@ void CompileBlock(ARM* cpu) else if (hasBranched && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) { u32 targetPseudoPhysical = cpu->Num == 0 - ? TranslateAddr<0>(target) - : TranslateAddr<1>(target); + ? TranslateAddr9(target) + : TranslateAddr7(target); if (link) { @@ -688,36 +1033,29 @@ void CompileBlock(ARM* cpu) i++; - bool canCompile = compiler->CanCompile(thumb, instrs[i - 1].Info.Kind); + bool canCompile = JITCompiler->CanCompile(thumb, instrs[i - 1].Info.Kind); bool secondaryFlagReadCond = !canCompile || (instrs[i - 1].BranchFlags & (branch_FollowCondTaken | branch_FollowCondNotTaken)); if (instrs[i - 1].Info.ReadFlags != 0 || secondaryFlagReadCond) FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? instrs[i - 1].Info.ReadFlags : 0xF); } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted && (!cpu->IRQ || (cpu->CPSR & 0x80))); + u32 literalHash = (u32)XXH3_64bits(literalValues, numLiterals * 4); + u32 instrHash = (u32)XXH3_64bits(instrValues, i * 4); + JitBlock* prevBlock = RestoreCandidates.LookUp(pseudoPhysicalAddr); bool mayRestore = true; if (prevBlock) { RestoreCandidates.Remove(pseudoPhysicalAddr); - if (prevBlock->NumInstrs == i) - { - for (int j = 0; j < i; j++) - { - if (prevBlock->Instrs()[j] != instrs[j].Instr) - { - mayRestore = false; - break; - } - } - } - else - mayRestore = false; - if (prevBlock->NumAddresses == numAddressRanges) + mayRestore = prevBlock->LiteralHash == literalHash && prevBlock->InstrHash == instrHash; + + if (mayRestore && prevBlock->NumAddresses == numAddressRanges) { for (int j = 0; j < numAddressRanges; j++) { - if (prevBlock->AddressRanges()[j] != addresseRanges[j]) + if (prevBlock->AddressRanges()[j] != addressRanges[j] + || prevBlock->AddressMasks()[j] != addressMasks[j]) { mayRestore = false; break; @@ -739,18 +1077,21 @@ void CompileBlock(ARM* cpu) if (prevBlock) delete prevBlock; - block = new JitBlock(i, numAddressRanges); - for (int j = 0; j < i; j++) - block->Instrs()[j] = instrs[j].Instr; + block = new JitBlock(cpu->Num, i, numAddressRanges, numLiterals); + block->LiteralHash = literalHash; + block->InstrHash = instrHash; + for (int j = 0; j < numAddressRanges; j++) + block->AddressRanges()[j] = addressRanges[j]; for (int j = 0; j < numAddressRanges; j++) - block->AddressRanges()[j] = addresseRanges[j]; + block->AddressMasks()[j] = addressMasks[j]; + for (int j = 0; j < numLiterals; j++) + block->Literals()[j] = literalLoadAddrs[j]; - block->StartAddr = blockAddr; block->PseudoPhysicalAddr = pseudoPhysicalAddr; FloodFillSetFlags(instrs, i - 1, 0xF); - block->EntryPoint = compiler->CompileBlock(pseudoPhysicalAddr, cpu, thumb, instrs, i); + block->EntryPoint = JITCompiler->CompileBlock(pseudoPhysicalAddr, cpu, thumb, instrs, i); } else { @@ -760,23 +1101,73 @@ void CompileBlock(ARM* cpu) for (int j = 0; j < numAddressRanges; j++) { - assert(addresseRanges[j] == block->AddressRanges()[j]); - CodeRanges[addresseRanges[j] / 512].Blocks.Add(block); + assert(addressRanges[j] == block->AddressRanges()[j]); + assert(addressMasks[j] == block->AddressMasks()[j]); + assert(addressMasks[j] != 0); + CodeRanges[addressRanges[j] / 512].Code |= addressMasks[j]; + CodeRanges[addressRanges[j] / 512].Blocks.Add(block); + + UpdateRegionByPseudoPhyiscal(addressRanges[j], true); } - JitBlocks[pseudoPhysicalAddr] = block; - FastBlockLookUp.Insert(pseudoPhysicalAddr, compiler->SubEntryOffset(block->EntryPoint)); + if (cpu->Num == 0) + { + JitBlocks9[pseudoPhysicalAddr] = block; + FastBlockLookUp9.Insert(pseudoPhysicalAddr, JITCompiler->SubEntryOffset(block->EntryPoint)); + } + else + { + JitBlocks7[pseudoPhysicalAddr] = block; + FastBlockLookUp7.Insert(pseudoPhysicalAddr, JITCompiler->SubEntryOffset(block->EntryPoint)); + } } -void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore) +void InvalidateByAddr(u32 pseudoPhysical) { JIT_DEBUGPRINT("invalidating by addr %x\n", pseudoPhysical); AddressRange* range = &CodeRanges[pseudoPhysical / 512]; - int startLength = range->Blocks.Length; - for (int i = 0; i < range->Blocks.Length; i++) + u32 mask = 1 << ((pseudoPhysical & 0x1FF) / 16); + + range->Code = 0; + for (int i = 0; i < range->Blocks.Length;) { - assert(range->Blocks.Length == startLength); JitBlock* block = range->Blocks[i]; + + bool invalidated = false; + u32 mask = 0; + for (int j = 0; j < block->NumAddresses; j++) + { + if (block->AddressRanges()[j] == (pseudoPhysical & ~0x1FF)) + { + mask = block->AddressMasks()[j]; + invalidated = block->AddressMasks()[j] & mask; + break; + } + } + assert(mask); + if (!invalidated) + { + range->Code |= mask; + i++; + continue; + } + range->Blocks.Remove(i); + + bool literalInvalidation = false; + for (int j = 0; j < block->NumLiterals; j++) + { + u32 addr = block->Literals()[j]; + if (addr == pseudoPhysical) + { + if (InvalidLiterals.Find(pseudoPhysical) != -1) + { + InvalidLiterals.Add(pseudoPhysical); + JIT_DEBUGPRINT("found invalid literal %d\n", InvalidLiterals.Length); + } + literalInvalidation = true; + break; + } + } for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; @@ -786,76 +1177,59 @@ void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore) assert(otherRange != range); bool removed = otherRange->Blocks.RemoveByValue(block); assert(removed); + + if (otherRange->Blocks.Length == 0) + { + otherRange->Code = 0; + UpdateRegionByPseudoPhyiscal(addr, false); + } } } for (int j = 0; j < block->NumLinks(); j++) - compiler->UnlinkBlock(block->Links()[j]); + JITCompiler->UnlinkBlock(block->Links()[j]); + block->ResetLinks(); - JitBlocks.erase(block->PseudoPhysicalAddr); - FastBlockLookUp.Remove(block->PseudoPhysicalAddr); + if (block->Num == 0) + { + JitBlocks9.erase(block->PseudoPhysicalAddr); + FastBlockLookUp9.Remove(block->PseudoPhysicalAddr); + } + else + { + JitBlocks7.erase(block->PseudoPhysicalAddr); + FastBlockLookUp7.Remove(block->PseudoPhysicalAddr); + } - if (mayRestore) + if (!literalInvalidation) { JitBlock* prevBlock = RestoreCandidates.Insert(block->PseudoPhysicalAddr, block); if (prevBlock) delete prevBlock; } + else + { + delete block; + } } - if ((range->TimesInvalidated + 1) > range->TimesInvalidated) - range->TimesInvalidated++; - - range->Blocks.Clear(); -} -void InvalidateByAddr7(u32 addr) -{ - u32 pseudoPhysical = TranslateAddr<1>(addr); - if (__builtin_expect(CodeRanges[pseudoPhysical / 512].Blocks.Length > 0, false)) - InvalidateByAddr(pseudoPhysical); + if (range->Blocks.Length == 0) + UpdateRegionByPseudoPhyiscal(pseudoPhysical, false); } -void InvalidateITCM(u32 addr) +void InvalidateRegionIfNecessary(u32 pseudoPhyisical) { - u32 pseudoPhysical = addr + ExeMemRegionOffsets[exeMem_ITCM]; - if (CodeRanges[pseudoPhysical / 512].Blocks.Length > 0) - InvalidateByAddr(pseudoPhysical); -} - -void InvalidateAll() -{ - JIT_DEBUGPRINT("invalidating all %x\n", JitBlocks.size()); - for (auto it : JitBlocks) - { - JitBlock* block = it.second; - - FastBlockLookUp.Remove(block->PseudoPhysicalAddr); - - for (int i = 0; i < block->NumAddresses; i++) - { - u32 addr = block->AddressRanges()[i]; - AddressRange* range = &CodeRanges[addr / 512]; - range->Blocks.Clear(); - if (range->TimesInvalidated + 1 > range->TimesInvalidated) - range->TimesInvalidated++; - } - for (int i = 0; i < block->NumLinks(); i++) - compiler->UnlinkBlock(block->Links()[i]); - block->ResetLinks(); - - JitBlock* prevBlock = RestoreCandidates.Insert(block->PseudoPhysicalAddr, block); - if (prevBlock) - delete prevBlock; - } - - JitBlocks.clear(); + if (CodeRanges[pseudoPhyisical / 512].Code & (1 << ((pseudoPhyisical & 0x1FF) / 16))) + InvalidateByAddr(pseudoPhyisical); } void ResetBlockCache() { printf("Resetting JIT block cache...\n"); - FastBlockLookUp.Reset(); + InvalidLiterals.Clear(); + FastBlockLookUp9.Reset(); + FastBlockLookUp7.Reset(); RestoreCandidates.Reset(); for (int i = 0; i < sizeof(RestoreCandidates.Table)/sizeof(RestoreCandidates.Table[0]); i++) { @@ -870,61 +1244,119 @@ void ResetBlockCache() RestoreCandidates.Table[i].ValB = NULL; } } - for (auto it : JitBlocks) + for (auto it : JitBlocks9) { JitBlock* block = it.second; for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; CodeRanges[addr / 512].Blocks.Clear(); - CodeRanges[addr / 512].TimesInvalidated = 0; - CodeRanges[addr / 512].InvalidLiterals = 0; + CodeRanges[addr / 512].Code = 0; } delete block; } - JitBlocks.clear(); + for (auto it : JitBlocks7) + { + JitBlock* block = it.second; + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + CodeRanges[addr / 512].Blocks.Clear(); + CodeRanges[addr / 512].Code = 0; + } + } + JitBlocks9.clear(); + JitBlocks7.clear(); - compiler->Reset(); + JITCompiler->Reset(); } +template JitBlockEntry LookUpBlockEntry(u32 addr) { - u32 entryOffset = FastBlockLookUp.LookUp(addr); + auto& fastMap = Num == 0 ? FastBlockLookUp9 : FastBlockLookUp7; + u32 entryOffset = fastMap.LookUp(addr); if (entryOffset != UINT32_MAX) - return compiler->AddEntryOffset(entryOffset); + return JITCompiler->AddEntryOffset(entryOffset); - auto block = JitBlocks.find(addr); - if (block != JitBlocks.end()) + auto& slowMap = Num == 0 ? JitBlocks9 : JitBlocks7; + auto block = slowMap.find(addr); + if (block != slowMap.end()) { - FastBlockLookUp.Insert(addr, compiler->SubEntryOffset(block->second->EntryPoint)); + fastMap.Insert(addr, JITCompiler->SubEntryOffset(block->second->EntryPoint)); return block->second->EntryPoint; } return NULL; } +template JitBlockEntry LookUpBlockEntry<0>(u32); +template JitBlockEntry LookUpBlockEntry<1>(u32); + template void LinkBlock(ARM* cpu, u32 codeOffset) { - u32 targetPseudoPhys = TranslateAddr(cpu->R[15] - ((cpu->CPSR&0x20)?2:4)); - auto block = JitBlocks.find(targetPseudoPhys); - if (block == JitBlocks.end()) + auto& blockMap = Num == 0 ? JitBlocks9 : JitBlocks7; + u32 instrAddr = cpu->R[15] - ((cpu->CPSR&0x20)?2:4); + u32 targetPseudoPhys = Num == 0 ? TranslateAddr9(instrAddr) : TranslateAddr7(instrAddr); + auto block = blockMap.find(targetPseudoPhys); + if (block == blockMap.end()) { CompileBlock(cpu); - block = JitBlocks.find(targetPseudoPhys); + block = blockMap.find(targetPseudoPhys); } JIT_DEBUGPRINT("linking to block %08x\n", targetPseudoPhys); block->second->AddLink(codeOffset); - compiler->LinkBlock(codeOffset, block->second->EntryPoint); + JITCompiler->LinkBlock(codeOffset, block->second->EntryPoint); +} + +template void LinkBlock<0>(ARM*, u32); +template void LinkBlock<1>(ARM*, u32); + +void WifiWrite32(u32 addr, u32 val) +{ + Wifi::Write(addr, val & 0xFFFF); + Wifi::Write(addr + 2, val >> 16); +} + +u32 WifiRead32(u32 addr) +{ + return Wifi::Read(addr) | (Wifi::Read(addr + 2) << 16); +} + +template +void VRAMWrite(u32 addr, T val) +{ + switch (addr & 0x00E00000) + { + case 0x00000000: GPU::WriteVRAM_ABG(addr, val); return; + case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; + case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; + case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; + default: GPU::WriteVRAM_LCDC(addr, val); return; + } +} +template +T VRAMRead(u32 addr) +{ + switch (addr & 0x00E00000) + { + case 0x00000000: return GPU::ReadVRAM_ABG(addr); + case 0x00200000: return GPU::ReadVRAM_BBG(addr); + case 0x00400000: return GPU::ReadVRAM_AOBJ(addr); + case 0x00600000: return GPU::ReadVRAM_BOBJ(addr); + default: return GPU::ReadVRAM_LCDC(addr); + } } void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) { if (cpu->Num == 0) { - if ((addr & 0xFF000000) == 0x04000000) + switch (addr & 0xFF000000) { + case 0x04000000: if (!store && size == 32 && addr == 0x04100010 && NDS::ExMemCnt[0] & (1<<11)) return (void*)NDSCart::ReadROMData; @@ -949,13 +1381,25 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) switch (size | store) { - case 8: return (void*)NDS::ARM9IORead8; - case 9: return (void*)NDS::ARM9IOWrite8; + case 8: return (void*)NDS::ARM9IORead8; + case 9: return (void*)NDS::ARM9IOWrite8; case 16: return (void*)NDS::ARM9IORead16; case 17: return (void*)NDS::ARM9IOWrite16; case 32: return (void*)NDS::ARM9IORead32; case 33: return (void*)NDS::ARM9IOWrite32; } + break; + case 0x06000000: + switch (size | store) + { + case 8: return (void*)VRAMRead; + case 9: return NULL; + case 16: return (void*)VRAMRead; + case 17: return (void*)VRAMWrite; + case 32: return (void*)VRAMRead; + case 33: return (void*)VRAMWrite; + } + break; } } else @@ -987,20 +1431,31 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) } break; case 0x04800000: - if (addr < 0x04810000 && size == 16) + if (addr < 0x04810000 && size >= 16) { - if (store) - return (void*)Wifi::Write; - else - return (void*)Wifi::Read; + switch (size | store) + { + case 16: return (void*)Wifi::Read; + case 17: return (void*)Wifi::Write; + case 32: return (void*)WifiRead32; + case 33: return (void*)WifiWrite32; + } } break; + case 0x06000000: + case 0x06800000: + switch (size | store) + { + case 8: return (void*)GPU::ReadVRAM_ARM7; + case 9: return (void*)GPU::WriteVRAM_ARM7; + case 16: return (void*)GPU::ReadVRAM_ARM7; + case 17: return (void*)GPU::WriteVRAM_ARM7; + case 32: return (void*)GPU::ReadVRAM_ARM7; + case 33: return (void*)GPU::WriteVRAM_ARM7; + } } } return NULL; } } - -template void ARMJIT::LinkBlock<0>(ARM*, u32); -template void ARMJIT::LinkBlock<1>(ARM*, u32); diff --git a/src/ARMJIT.h b/src/ARMJIT.h index cab385f..44a6140 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -28,45 +28,60 @@ extern const u32 ExeMemRegionSizes[]; typedef u32 (*JitBlockEntry)(); -extern u32 AddrTranslate9[0x2000]; -extern u32 AddrTranslate7[0x4000]; - const u32 ExeMemSpaceSize = 0x518000; // I hate you C++, sometimes I really hate you... -template -inline bool IsMapped(u32 addr) -{ - if (num == 0) - return AddrTranslate9[(addr & 0xFFFFFFF) >> 15] >= ExeMemRegionSizes[exeMem_Unmapped]; - else - return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] >= ExeMemRegionSizes[exeMem_Unmapped]; -} - -template -inline u32 TranslateAddr(u32 addr) -{ - if (num == 0) - return AddrTranslate9[(addr & 0xFFFFFFF) >> 15] + (addr & 0x7FFF); - else - return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] + (addr & 0x3FFF); -} +u32 TranslateAddr9(u32 addr); +u32 TranslateAddr7(u32 addr); +template JitBlockEntry LookUpBlockEntry(u32 addr); - void Init(); void DeInit(); -void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore = true); -void InvalidateAll(); +void Reset(); + +void InvalidateByAddr(u32 pseudoPhysical); + +void InvalidateRegionIfNecessary(u32 addr); -void InvalidateITCM(u32 addr); -void InvalidateByAddr7(u32 addr); +inline void InvalidateMainRAMIfNecessary(u32 addr) +{ + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_MainRAM] + (addr & (MAIN_RAM_SIZE - 1))); +} +inline void InvalidateITCMIfNecessary(u32 addr) +{ + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_ITCM] + (addr & 0x7FFF)); +} +inline void InvalidateLCDCIfNecessary(u32 addr) +{ + if (addr < 0x68A3FFF) + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_LCDC] + (addr - 0x6800000)); +} +inline void InvalidateSWRAM7IfNecessary(u32 addr) +{ + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM7 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM7Mask)); +} +inline void InvalidateSWRAM9IfNecessary(u32 addr) +{ + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM9 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM9Mask)); +} +inline void InvalidateARM7WRAMIfNecessary(u32 addr) +{ + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_ARM7_WRAM] + (addr & 0xFFFF)); +} +inline void InvalidateARM7WVRAMIfNecessary(u32 addr) +{ + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_ARM7_WVRAM] + (addr & 0x1FFFF)); +} void CompileBlock(ARM* cpu); void ResetBlockCache(); +void UpdateMemoryStatus9(u32 start, u32 end); +void UpdateMemoryStatus7(u32 start, u32 end); + } extern "C" void ARM_Dispatch(ARM* cpu, ARMJIT::JitBlockEntry entry); diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.cpp b/src/ARMJIT_A64/ARMJIT_Compiler.cpp index 00fa436..a67f357 100644 --- a/src/ARMJIT_A64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_A64/ARMJIT_Compiler.cpp @@ -650,7 +650,7 @@ void Compiler::Comp_AddCycles_CDI() s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; s32 numD = CurInstr.DataCycles; - if ((CurInstr.DataRegion >> 4) == 0x02) // mainRAM + if ((CurInstr.DataRegion >> 24) == 0x02) // mainRAM { if (CodeRegion == 0x02) cycles = numC + numD; @@ -695,7 +695,7 @@ void Compiler::Comp_AddCycles_CD() s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; s32 numD = CurInstr.DataCycles; - if ((CurInstr.DataRegion >> 4) == 0x02) + if ((CurInstr.DataRegion >> 24) == 0x02) { if (CodeRegion == 0x02) cycles += numC + numD; diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h index 66d1808..4e45760 100644 --- a/src/ARMJIT_Internal.h +++ b/src/ARMJIT_Internal.h @@ -152,30 +152,34 @@ struct __attribute__((packed)) TinyVector class JitBlock { public: - JitBlock(u32 numInstrs, u32 numAddresses) + JitBlock(u32 num, u32 literalHash, u32 numAddresses, u32 numLiterals) { - NumInstrs = numInstrs; + Num = num; NumAddresses = numAddresses; - Data.SetLength(numInstrs + numAddresses); + NumLiterals = numLiterals; + Data.SetLength(numAddresses * 2 + numLiterals); } - u32 StartAddr; u32 PseudoPhysicalAddr; - - u32 NumInstrs; - u32 NumAddresses; + + u32 InstrHash, LiteralHash; + u8 Num; + u16 NumAddresses; + u16 NumLiterals; JitBlockEntry EntryPoint; - u32* Instrs() - { return &Data[0]; } u32* AddressRanges() - { return &Data[NumInstrs]; } + { return &Data[0]; } + u32* AddressMasks() + { return &Data[NumAddresses]; } + u32* Literals() + { return &Data[NumAddresses * 2]; } u32* Links() - { return &Data[NumInstrs + NumAddresses]; } + { return &Data[NumAddresses * 2 + NumLiterals]; } u32 NumLinks() - { return Data.Length - NumInstrs - NumAddresses; } + { return Data.Length - NumAddresses * 2 - NumLiterals; } void AddLink(u32 link) { @@ -184,7 +188,7 @@ public: void ResetLinks() { - Data.SetLength(NumInstrs + NumAddresses); + Data.SetLength(NumAddresses * 2 + NumLiterals); } private: @@ -200,8 +204,7 @@ private: struct __attribute__((packed)) AddressRange { TinyVector Blocks; - u16 InvalidLiterals; - u16 TimesInvalidated; + u32 Code; }; extern AddressRange CodeRanges[ExeMemSpaceSize / 512]; @@ -210,14 +213,45 @@ typedef void (*InterpreterFunc)(ARM* cpu); extern InterpreterFunc InterpretARM[]; extern InterpreterFunc InterpretTHUMB[]; -extern u8 MemRegion9[0x80000]; -extern u8 MemRegion7[0x80000]; +extern u8 MemoryStatus9[0x800000]; +extern u8 MemoryStatus7[0x800000]; + +extern TinyVector InvalidLiterals; void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size); template void LinkBlock(ARM* cpu, u32 codeOffset); +enum +{ + memregion_Other = 0, + memregion_ITCM, + memregion_DTCM, + memregion_BIOS9, + memregion_MainRAM, + memregion_SWRAM9, + memregion_SWRAM7, + memregion_IO9, + memregion_VRAM, + memregion_BIOS7, + memregion_WRAM7, + memregion_IO7, + memregion_Wifi, + memregion_VWRAM, +}; + +int ClassifyAddress9(u32 addr); +int ClassifyAddress7(u32 addr); + +template T SlowRead9(ARMv5* cpu, u32 addr); +template void SlowWrite9(ARMv5* cpu, u32 addr, T val); +template T SlowRead7(u32 addr); +template void SlowWrite7(u32 addr, T val); + +template void SlowBlockTransfer9(u32 addr, u64* data, u32 num, ARMv5* cpu); +template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); + } #endif \ No newline at end of file diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index 5e18e84..0547c84 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -95,20 +95,6 @@ public: LiteralsLoaded = 0; } - BitSet32 GetPushRegs() - { - BitSet16 used; - for (int i = 0; i < InstrsCount; i++) - used |= BitSet16(Instrs[i].Info.SrcRegs | Instrs[i].Info.DstRegs); - - BitSet32 res; - u32 registersMax = std::min((int)used.Count(), NativeRegsAvailable); - for (int i = 0; i < registersMax; i++) - res |= BitSet32(1 << (int)NativeRegAllocOrder[i]); - - return res; - } - void Prepare(bool thumb, int i) { FetchedInstr instr = Instrs[i]; @@ -139,7 +125,6 @@ public: UnloadRegister(reg); u16 necessaryRegs = ((instr.Info.SrcRegs & PCAllocatableAsSrc) | instr.Info.DstRegs) & ~instr.Info.NotStrictlyNeeded; - u16 writeRegs = instr.Info.DstRegs & ~instr.Info.NotStrictlyNeeded; BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); if (needToBeLoaded != BitSet16(0)) { @@ -182,13 +167,12 @@ public: if (left-- == 0) break; - writeRegs |= (1 << reg) & instr.Info.DstRegs; LoadRegister(reg, !(thumb || instr.Cond() >= 0xE) || (1 << reg) & instr.Info.SrcRegs); } } } - DirtyRegs |= writeRegs & ~(1 << 15); + DirtyRegs |= (LoadedRegs & instr.Info.DstRegs) & ~(1 << 15); } static const Reg NativeRegAllocOrder[]; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index dd20e3c..eee2e0f 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -195,26 +195,6 @@ Compiler::Compiler() Reset(); - for (int i = 0; i < 3; i++) - { - for (int j = 0; j < 2; j++) - MemoryFuncs9[i][j] = Gen_MemoryRoutine9(j, 8 << i); - } - MemoryFuncs7[0][0] = (void*)NDS::ARM7Read8; - MemoryFuncs7[0][1] = (void*)NDS::ARM7Write8; - MemoryFuncs7[1][0] = (void*)NDS::ARM7Read16; - MemoryFuncs7[1][1] = (void*)NDS::ARM7Write16; - MemoryFuncs7[2][0] = (void*)NDS::ARM7Read32; - MemoryFuncs7[2][1] = (void*)NDS::ARM7Write32; - - for (int i = 0; i < 2; i++) - for (int j = 0; j < 2; j++) - { - MemoryFuncsSeq9[i][j] = Gen_MemoryRoutineSeq9(i, j); - MemoryFuncsSeq7[i][j][0] = Gen_MemoryRoutineSeq7(i, j, false); - MemoryFuncsSeq7[i][j][1] = Gen_MemoryRoutineSeq7(i, j, true); - } - { // RSCRATCH mode // RSCRATCH2 reg number @@ -317,6 +297,12 @@ Compiler::Compiler() // move the region forward to prevent overwriting the generated functions CodeMemSize -= GetWritableCodePtr() - ResetStart; ResetStart = GetWritableCodePtr(); + + NearStart = ResetStart; + FarStart = ResetStart + 1024*1024*24; + + NearSize = FarStart - ResetStart; + FarSize = (ResetStart + CodeMemSize) - FarStart; } void Compiler::LoadCPSR() @@ -504,6 +490,9 @@ void Compiler::Reset() { memset(ResetStart, 0xcc, CodeMemSize); SetCodePtr(ResetStart); + + NearCode = NearStart; + FarCode = FarStart; } void Compiler::Comp_SpecialBranchBehaviour(bool taken) @@ -544,8 +533,16 @@ void Compiler::Comp_SpecialBranchBehaviour(bool taken) JitBlockEntry Compiler::CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) { - if (CodeMemSize - (GetWritableCodePtr() - ResetStart) < 1024 * 32) // guess... + if (NearSize - (NearCode - NearStart) < 1024 * 32) // guess... + { + printf("near reset\n"); + ResetBlockCache(); + } + if (FarSize - (FarCode - FarStart) < 1024 * 32) // guess... + { + printf("far reset\n"); ResetBlockCache(); + } ConstantCycles = 0; Thumb = thumb; @@ -762,12 +759,14 @@ void Compiler::Comp_AddCycles_CDI() Comp_AddCycles_CD(); else { + IrregularCycles = true; + s32 cycles; s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; s32 numD = CurInstr.DataCycles; - if ((CurInstr.DataRegion >> 4) == 0x02) // mainRAM + if ((CurInstr.DataRegion >> 24) == 0x02) // mainRAM { if (CodeRegion == 0x02) cycles = numC + numD; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index e0a4978..9df218b 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -140,7 +140,7 @@ public: }; void Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags); s32 Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); - void Comp_MemLoadLiteral(int size, int rd, u32 addr); + bool Comp_MemLoadLiteral(int size, int rd, u32 addr); void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); @@ -154,12 +154,6 @@ public: void Comp_SpecialBranchBehaviour(bool taken); - void* Gen_MemoryRoutine9(bool store, int size); - - void* Gen_MemoryRoutineSeq9(bool store, bool preinc); - void* Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM); - - void* Gen_ChangeCPSRRoutine(); Gen::OpArg Comp_RegShiftImm(int op, int amount, Gen::OpArg rm, bool S, bool& carryUsed); Gen::OpArg Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed); @@ -193,6 +187,26 @@ public: return (u8*)entry - ResetStart; } + void SwitchToNearCode() + { + FarCode = GetWritableCodePtr(); + SetCodePtr(NearCode); + } + + void SwitchToFarCode() + { + NearCode = GetWritableCodePtr(); + SetCodePtr(FarCode); + } + + u8* FarCode; + u8* NearCode; + u32 FarSize; + u32 NearSize; + + u8* NearStart; + u8* FarStart; + u8* ResetStart; u32 CodeMemSize; @@ -201,12 +215,6 @@ public: void* BranchStub[2]; - void* MemoryFuncs9[3][2]; - void* MemoryFuncs7[3][2]; - - void* MemoryFuncsSeq9[2][2]; - void* MemoryFuncsSeq7[2][2][2]; - void* ReadBanked; void* WriteBanked; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index b595e32..c13b779 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -25,236 +25,17 @@ int squeezePointer(T* ptr) improvement. */ -/* - address - ABI_PARAM1 (a.k.a. ECX = RSCRATCH3 on Windows) - store value - ABI_PARAM2 (a.k.a. RDX = RSCRATCH2 on Windows) -*/ -void* Compiler::Gen_MemoryRoutine9(bool store, int size) +bool Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) { - u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 1 : 0)); - AlignCode4(); - void* res = GetWritableCodePtr(); - - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SUB(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); - CMP(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMSize))); - FixupBranch insideDTCM = J_CC(CC_B); - - CMP(32, R(ABI_PARAM1), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); - FixupBranch insideITCM = J_CC(CC_B); - - if (store) - { - if (size > 8) - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - switch (size) - { - case 32: JMP((u8*)NDS::ARM9Write32, true); break; - case 16: JMP((u8*)NDS::ARM9Write16, true); break; - case 8: JMP((u8*)NDS::ARM9Write8, true); break; - } - } - else - { - if (size == 32) - { - ABI_PushRegistersAndAdjustStack({ABI_PARAM1}, 8); - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - // everything's already in the appropriate register - ABI_CallFunction(NDS::ARM9Read32); - ABI_PopRegistersAndAdjustStack({ECX}, 8); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); - RET(); - } - else if (size == 16) - { - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - JMP((u8*)NDS::ARM9Read16, true); - } - else - JMP((u8*)NDS::ARM9Read8, true); - } - - SetJumpTarget(insideDTCM); - AND(32, R(RSCRATCH), Imm32(0x3FFF & addressMask)); - if (store) - MOV(size, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)), R(ABI_PARAM2)); - else - { - MOVZX(32, size, RSCRATCH, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM))); - if (size == 32) - { - if (ABI_PARAM1 != ECX) - MOV(32, R(ECX), R(ABI_PARAM1)); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); - } - } - RET(); + u32 translatedAddr = Num == 0 ? TranslateAddr9(addr) : TranslateAddr7(addr); - SetJumpTarget(insideITCM); - MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); // free up ECX - AND(32, R(ABI_PARAM3), Imm32(0x7FFF & addressMask)); - if (store) - { - MOV(size, MComplex(RCPU, ABI_PARAM3, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM2)); - - // if CodeRanges[pseudoPhysical/256].Blocks.Length > 0 we're writing into code! - static_assert(sizeof(AddressRange) == 16); - LEA(32, ABI_PARAM1, MDisp(ABI_PARAM3, ExeMemRegionOffsets[exeMem_ITCM])); - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(9)); - SHL(32, R(RSCRATCH), Imm8(4)); - CMP(16, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); - FixupBranch noCode = J_CC(CC_Z); - JMP((u8*)InvalidateByAddr, true); - SetJumpTarget(noCode); - } - else + int invalidLiteralIdx = InvalidLiterals.Find(translatedAddr); + if (invalidLiteralIdx != -1) { - MOVZX(32, size, RSCRATCH, MComplex(RCPU, ABI_PARAM3, SCALE_1, offsetof(ARMv5, ITCM))); - if (size == 32) - { - if (ABI_PARAM1 != ECX) - MOV(32, R(ECX), R(ABI_PARAM1)); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); - } + InvalidLiterals.Remove(invalidLiteralIdx); + return false; } - RET(); - - static_assert(RSCRATCH == EAX, "Someone changed RSCRATCH!"); - - return res; -} - -#define MEMORY_SEQ_WHILE_COND \ - if (!store) \ - MOV(32, currentElement, R(EAX));\ - if (!preinc) \ - ADD(32, R(ABI_PARAM1), Imm8(4)); \ - \ - SUB(32, R(ABI_PARAM3), Imm8(1)); \ - J_CC(CC_NZ, repeat); - -/* - ABI_PARAM1 address - ABI_PARAM2 address where registers are stored - ABI_PARAM3 how many values to read/write - - Dolphin x64CodeEmitter is my favourite assembler - */ -void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) -{ - void* res = (void*)GetWritableCodePtr(); - - const u8* repeat = GetCodePtr(); - - if (preinc) - ADD(32, R(ABI_PARAM1), Imm8(4)); - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SUB(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); - CMP(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMSize))); - FixupBranch insideDTCM = J_CC(CC_B); - - CMP(32, R(ABI_PARAM1), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); - FixupBranch insideITCM = J_CC(CC_B); - - OpArg currentElement = MComplex(ABI_PARAM2, ABI_PARAM3, SCALE_8, -8); // wasting stack space like a gangster - - ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); - AND(32, R(ABI_PARAM1), Imm8(~3)); - if (store) - { - MOV(32, R(ABI_PARAM2), currentElement); - CALL((void*)NDS::ARM9Write32); - } - else - CALL((void*)NDS::ARM9Read32); - ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); - - MEMORY_SEQ_WHILE_COND - RET(); - - SetJumpTarget(insideDTCM); - AND(32, R(RSCRATCH), Imm32(0x3FFF & ~3)); - if (store) - { - MOV(32, R(ABI_PARAM4), currentElement); - MOV(32, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)), R(ABI_PARAM4)); - } - else - MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM))); - - MEMORY_SEQ_WHILE_COND - RET(); - - SetJumpTarget(insideITCM); - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - AND(32, R(RSCRATCH), Imm32(0x7FFF & ~3)); - if (store) - { - MOV(32, R(ABI_PARAM4), currentElement); - MOV(32, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM4)); - - ADD(32, R(RSCRATCH), Imm32(ExeMemRegionOffsets[exeMem_ITCM])); - MOV(32, R(ABI_PARAM4), R(RSCRATCH)); - SHR(32, R(RSCRATCH), Imm8(9)); - SHL(32, R(RSCRATCH), Imm8(4)); - CMP(16, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); - FixupBranch noCode = J_CC(CC_Z); - ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); - MOV(32, R(ABI_PARAM1), R(ABI_PARAM4)); - CALL((u8*)InvalidateByAddr); - ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); - SetJumpTarget(noCode); - } - else - MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM))); - - MEMORY_SEQ_WHILE_COND - RET(); - - return res; -} - -void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM) -{ - void* res = (void*)GetWritableCodePtr(); - - const u8* repeat = GetCodePtr(); - - if (preinc) - ADD(32, R(ABI_PARAM1), Imm8(4)); - - OpArg currentElement = MComplex(ABI_PARAM2, ABI_PARAM3, SCALE_8, -8); - - ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); - AND(32, R(ABI_PARAM1), Imm8(~3)); - if (store) - { - MOV(32, R(ABI_PARAM2), currentElement); - CALL((void*)NDS::ARM7Write32); - } - else - CALL((void*)NDS::ARM7Read32); - ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); - - MEMORY_SEQ_WHILE_COND - RET(); - - return res; -} - -#undef MEMORY_SEQ_WHILE_COND - -void Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) -{ u32 val; // make sure arm7 bios is accessible u32 tmpR15 = CurCPU->R[15]; @@ -276,12 +57,10 @@ void Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) RegCache.PutLiteral(rd, val); Comp_AddCycles_CDI(); + + return true; } -/*void fault(u32 a, u32 b, u32 c, u32 d) -{ - printf("actually not static! %x %x %x %x\n", a, b, c, d); -}*/ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags) { @@ -291,17 +70,12 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz if (size == 16) addressMask = ~1; - //bool check = false; if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_SignExtend|memop_Post|memop_Store|memop_Writeback))) { u32 addr = R15 + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); - u32 translatedAddr = Num == 0 ? TranslateAddr<0>(addr) : TranslateAddr<1>(addr); - - if (!(CodeRanges[translatedAddr / 512].InvalidLiterals & (1 << ((translatedAddr & 0x1FF) / 16)))) - { - Comp_MemLoadLiteral(size, rd, addr); + + if (Comp_MemLoadLiteral(size, rd, addr)) return; - } } { @@ -314,173 +88,334 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz Comp_AddCycles_CDI(); } + bool addrIsStatic = Config::JIT_LiteralOptimisations + && RegCache.IsLiteral(rn) && op2.IsImm && !(flags & (memop_Writeback|memop_Post)); + u32 staticAddress; + if (addrIsStatic) + staticAddress = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); OpArg rdMapped = MapReg(rd); - OpArg rnMapped = MapReg(rn); - if (Thumb && rn == 15) - rnMapped = Imm32(R15 & ~0x2); - - bool inlinePreparation = Num == 1; - u32 constLocalROR32 = 4; - - void* memoryFunc = Num == 0 - ? MemoryFuncs9[size >> 4][!!(flags & memop_Store)] - : MemoryFuncs7[size >> 4][!!((flags & memop_Store))]; - if (Config::JIT_LiteralOptimisations && (rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn)) + if (!addrIsStatic) { - u32 addr = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); - - /*MOV(32, R(ABI_PARAM1), Imm32(CurInstr.Instr)); - MOV(32, R(ABI_PARAM1), Imm32(R15)); - MOV_sum(32, RSCRATCH, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1))); - CMP(32, R(RSCRATCH), Imm32(addr)); - FixupBranch eq = J_CC(CC_E); - CALL((void*)fault); - SetJumpTarget(eq);*/ - - NDS::MemRegion region; - region.Mem = NULL; - if (Num == 0) + OpArg rnMapped = MapReg(rn); + if (Thumb && rn == 15) + rnMapped = Imm32(R15 & ~0x2); + + X64Reg finalAddr = RSCRATCH3; + if (flags & memop_Post) { - ARMv5* cpu5 = (ARMv5*)CurCPU; + MOV(32, R(RSCRATCH3), rnMapped); - // stupid dtcm... - if (addr >= cpu5->DTCMBase && addr < (cpu5->DTCMBase + cpu5->DTCMSize)) - { - // disable this for now as DTCM is located in heap - // which might excced the RIP-addressable range - //region.Mem = cpu5->DTCM; - //region.Mask = 0x3FFF; - } - else - { - NDS::ARM9GetMemRegion(addr, flags & memop_Store, ®ion); - } + finalAddr = rnMapped.GetSimpleReg(); } - else - NDS::ARM7GetMemRegion(addr, flags & memop_Store, ®ion); - if (region.Mem != NULL) + if (op2.IsImm) + { + MOV_sum(32, finalAddr, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1))); + } + else { - void* ptr = ®ion.Mem[addr & addressMask & region.Mask]; + OpArg rm = MapReg(op2.Reg.Reg); - if (flags & memop_Store) + if (!(flags & memop_SubtractOffset) && rm.IsSimpleReg() && rnMapped.IsSimpleReg() + && op2.Reg.Op == 0 && op2.Reg.Amount > 0 && op2.Reg.Amount <= 3) { - MOV(size, M(ptr), MapReg(rd)); + LEA(32, finalAddr, + MComplex(rnMapped.GetSimpleReg(), rm.GetSimpleReg(), 1 << op2.Reg.Amount, 0)); } else { - if (flags & memop_SignExtend) - MOVSX(32, size, rdMapped.GetSimpleReg(), M(ptr)); - else - MOVZX(32, size, rdMapped.GetSimpleReg(), M(ptr)); + bool throwAway; + OpArg offset = + Comp_RegShiftImm(op2.Reg.Op, op2.Reg.Amount, rm, false, throwAway); - if (size == 32 && addr & ~0x3) + if (flags & memop_SubtractOffset) { - ROR_(32, rdMapped, Imm8((addr & 0x3) << 3)); + if (R(finalAddr) != rnMapped) + MOV(32, R(finalAddr), rnMapped); + if (!offset.IsZero()) + SUB(32, R(finalAddr), offset); } + else + MOV_sum(32, finalAddr, rnMapped, offset); } - - return; } - void* specialFunc = GetFuncForAddr(CurCPU, addr, flags & memop_Store, size); - if (specialFunc) - { - memoryFunc = specialFunc; - inlinePreparation = true; - constLocalROR32 = addr & 0x3; - } + if ((flags & memop_Writeback) && !(flags & memop_Post)) + MOV(32, rnMapped, R(finalAddr)); } - X64Reg finalAddr = ABI_PARAM1; - if (flags & memop_Post) - { - MOV(32, R(ABI_PARAM1), rnMapped); + int expectedTarget = Num == 0 + ? ClassifyAddress9(addrIsStatic ? staticAddress : CurInstr.DataRegion) + : ClassifyAddress7(addrIsStatic ? staticAddress : CurInstr.DataRegion); + if (CurInstr.Cond() < 0xE) + expectedTarget = memregion_Other; + + bool compileFastPath = false, compileSlowPath = !addrIsStatic || (flags & memop_Store); - finalAddr = rnMapped.GetSimpleReg(); + switch (expectedTarget) + { + case memregion_MainRAM: + case memregion_DTCM: + case memregion_WRAM7: + case memregion_SWRAM9: + case memregion_SWRAM7: + case memregion_IO9: + case memregion_IO7: + case memregion_VWRAM: + compileFastPath = true; + break; + case memregion_Wifi: + compileFastPath = size >= 16; + break; + case memregion_VRAM: + compileFastPath = !(flags & memop_Store) || size >= 16; + case memregion_BIOS9: + compileFastPath = !(flags & memop_Store); + break; + default: break; } - if (op2.IsImm) + if (addrIsStatic && !compileFastPath) { - MOV_sum(32, finalAddr, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1))); + compileFastPath = false; + compileSlowPath = true; } - else + + if (addrIsStatic && compileSlowPath) + MOV(32, R(RSCRATCH3), Imm32(staticAddress)); + + if (compileFastPath) { - OpArg rm = MapReg(op2.Reg.Reg); + FixupBranch slowPath; + if (compileSlowPath) + { + MOV(32, R(RSCRATCH), R(RSCRATCH3)); + SHR(32, R(RSCRATCH), Imm8(9)); + if (flags & memop_Store) + { + CMP(8, MDisp(RSCRATCH, squeezePointer(Num == 0 ? MemoryStatus9 : MemoryStatus7)), Imm8(expectedTarget)); + } + else + { + MOVZX(32, 8, RSCRATCH, MDisp(RSCRATCH, squeezePointer(Num == 0 ? MemoryStatus9 : MemoryStatus7))); + AND(32, R(RSCRATCH), Imm8(~0x80)); + CMP(32, R(RSCRATCH), Imm8(expectedTarget)); + } + + slowPath = J_CC(CC_NE, true); + } - if (!(flags & memop_SubtractOffset) && rm.IsSimpleReg() && rnMapped.IsSimpleReg() - && op2.Reg.Op == 0 && op2.Reg.Amount > 0 && op2.Reg.Amount <= 3) + if (expectedTarget == memregion_MainRAM || expectedTarget == memregion_WRAM7 + || expectedTarget == memregion_BIOS9) { - LEA(32, finalAddr, - MComplex(rnMapped.GetSimpleReg(), rm.GetSimpleReg(), 1 << op2.Reg.Amount, 0)); + u8* data; + u32 mask; + if (expectedTarget == memregion_MainRAM) + { + data = NDS::MainRAM; + mask = MAIN_RAM_SIZE - 1; + } + else if (expectedTarget == memregion_BIOS9) + { + data = NDS::ARM9BIOS; + mask = 0xFFF; + } + else + { + data = NDS::ARM7WRAM; + mask = 0xFFFF; + } + OpArg memLoc; + if (addrIsStatic) + { + memLoc = M(data + ((staticAddress & mask & addressMask))); + } + else + { + MOV(32, R(RSCRATCH), R(RSCRATCH3)); + AND(32, R(RSCRATCH), Imm32(mask & addressMask)); + memLoc = MDisp(RSCRATCH, squeezePointer(data)); + } + if (flags & memop_Store) + MOV(size, memLoc, rdMapped); + else if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), memLoc); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), memLoc); } - else + else if (expectedTarget == memregion_DTCM) + { + if (addrIsStatic) + MOV(32, R(RSCRATCH), Imm32(staticAddress)); + else + MOV(32, R(RSCRATCH), R(RSCRATCH3)); + SUB(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); + AND(32, R(RSCRATCH), Imm32(0x3FFF & addressMask)); + OpArg memLoc = MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)); + if (flags & memop_Store) + MOV(size, memLoc, rdMapped); + else if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), memLoc); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), memLoc); + } + else if (expectedTarget == memregion_SWRAM9 || expectedTarget == memregion_SWRAM7) { - bool throwAway; - OpArg offset = - Comp_RegShiftImm(op2.Reg.Op, op2.Reg.Amount, rm, false, throwAway); - - if (flags & memop_SubtractOffset) + MOV(64, R(RSCRATCH2), M(expectedTarget == memregion_SWRAM9 ? &NDS::SWRAM_ARM9 : &NDS::SWRAM_ARM7)); + if (addrIsStatic) { - if (R(finalAddr) != rnMapped) - MOV(32, R(finalAddr), rnMapped); - if (!offset.IsZero()) - SUB(32, R(finalAddr), offset); + MOV(32, R(RSCRATCH), Imm32(staticAddress & addressMask)); } else - MOV_sum(32, finalAddr, rnMapped, offset); + { + MOV(32, R(RSCRATCH), R(RSCRATCH3)); + AND(32, R(RSCRATCH), Imm8(addressMask)); + } + AND(32, R(RSCRATCH), M(expectedTarget == memregion_SWRAM9 ? &NDS::SWRAM_ARM9Mask : &NDS::SWRAM_ARM7Mask)); + OpArg memLoc = MRegSum(RSCRATCH, RSCRATCH2); + if (flags & memop_Store) + MOV(size, memLoc, rdMapped); + else if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), memLoc); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), memLoc); } - } + else + { + u32 maskedDataRegion; - if ((flags & memop_Writeback) && !(flags & memop_Post)) - MOV(32, rnMapped, R(finalAddr)); + if (addrIsStatic) + { + maskedDataRegion = staticAddress; + MOV(32, R(ABI_PARAM1), Imm32(staticAddress)); + } + else + { + if (ABI_PARAM1 != RSCRATCH3) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); + AND(32, R(ABI_PARAM1), Imm8(addressMask)); - if (flags & memop_Store) - MOV(32, R(ABI_PARAM2), rdMapped); + maskedDataRegion = CurInstr.DataRegion; + if (Num == 0) + maskedDataRegion &= ~0xFFFFFF; + else + maskedDataRegion &= ~0x7FFFFF; + } - if (!(flags & memop_Store) && inlinePreparation && constLocalROR32 == 4 && size == 32) - MOV(32, rdMapped, R(ABI_PARAM1)); + void* func = GetFuncForAddr(CurCPU, maskedDataRegion, flags & memop_Store, size); - if (inlinePreparation && size > 8) - AND(32, R(ABI_PARAM1), Imm8(addressMask)); + if (flags & memop_Store) + { + MOV(32, R(ABI_PARAM2), rdMapped); - CALL(memoryFunc); + ABI_CallFunction((void(*)())func); + } + else + { + if (!addrIsStatic) + MOV(32, rdMapped, R(RSCRATCH3)); - /*if (Num == 0 && check) - { - CMP(32, R(EAX), rdMapped); - FixupBranch notEqual = J_CC(CC_E); - ABI_PushRegistersAndAdjustStack({RSCRATCH}, 0); - MOV(32, R(ABI_PARAM1), Imm32(R15 - (Thumb ? 4 : 8))); - MOV(32, R(ABI_PARAM2), R(EAX)); - MOV(32, R(ABI_PARAM3), rdMapped); - MOV(32, R(ABI_PARAM4), Imm32(CurInstr.Instr)); - CALL((u8*)fault); - ABI_PopRegistersAndAdjustStack({RSCRATCH}, 0); - SetJumpTarget(notEqual); - }*/ - - if (!(flags & memop_Store)) - { - if (inlinePreparation && size == 32) + ABI_CallFunction((void(*)())func); + + if (!addrIsStatic) + MOV(32, R(RSCRATCH3), rdMapped); + + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + } + } + + if ((size == 32 && !(flags & memop_Store))) { - if (constLocalROR32 == 4) + if (addrIsStatic) + { + if (staticAddress & 0x3) + ROR_(32, rdMapped, Imm8((staticAddress & 0x3) * 8)); + } + else { - static_assert(RSCRATCH3 == ECX); - MOV(32, R(ECX), rdMapped); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); + AND(32, R(RSCRATCH3), Imm8(0x3)); + SHL(32, R(RSCRATCH3), Imm8(3)); + ROR_(32, rdMapped, R(RSCRATCH3)); } - else if (constLocalROR32 != 0) - ROR_(32, R(RSCRATCH), Imm8(constLocalROR32 << 3)); } - if (flags & memop_SignExtend) - MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + if (compileSlowPath) + { + SwitchToFarCode(); + SetJumpTarget(slowPath); + } + } + + if (compileSlowPath) + { + if (Num == 0) + { + MOV(32, R(ABI_PARAM2), R(RSCRATCH3)); + MOV(64, R(ABI_PARAM1), R(RCPU)); + if (flags & memop_Store) + { + MOV(32, R(ABI_PARAM3), rdMapped); + + switch (size) + { + case 32: CALL((void*)&SlowWrite9); break; + case 16: CALL((void*)&SlowWrite9); break; + case 8: CALL((void*)&SlowWrite9); break; + } + } + else + { + switch (size) + { + case 32: CALL((void*)&SlowRead9); break; + case 16: CALL((void*)&SlowRead9); break; + case 8: CALL((void*)&SlowRead9); break; + } + } + } else - MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + { + if (ABI_PARAM1 != RSCRATCH3) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); + if (flags & memop_Store) + { + MOV(32, R(ABI_PARAM2), rdMapped); + + switch (size) + { + case 32: CALL((void*)&SlowWrite7); break; + case 16: CALL((void*)&SlowWrite7); break; + case 8: CALL((void*)&SlowWrite7); break; + } + } + else + { + switch (size) + { + case 32: CALL((void*)&SlowRead7); break; + case 16: CALL((void*)&SlowRead7); break; + case 8: CALL((void*)&SlowRead7); break; + } + } + } + if (!(flags & memop_Store)) + { + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + } + } + + if (compileFastPath && compileSlowPath) + { + FixupBranch ret = J(true); + SwitchToNearCode(); + SetJumpTarget(ret); } if (!(flags & memop_Store) && rd == 15) @@ -498,100 +433,160 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode) { - IrregularCycles = true; - int regsCount = regs.Count(); s32 offset = (regsCount * 4) * (decrement ? -1 : 1); // we need to make sure that the stack stays aligned to 16 bytes +#ifdef _WIN32 + // include shadow + u32 stackAlloc = ((regsCount + 4 + 1) & ~1) * 8; +#else u32 stackAlloc = ((regsCount + 1) & ~1) * 8; +#endif + u32 allocOffset = stackAlloc - regsCount * 8; - if (!store) + int expectedTarget = Num == 0 + ? ClassifyAddress9(CurInstr.DataRegion) + : ClassifyAddress7(CurInstr.DataRegion); + if (usermode || CurInstr.Cond() < 0xE) + expectedTarget = memregion_Other; + + bool compileFastPath = false; + + switch (expectedTarget) { + case memregion_DTCM: + case memregion_MainRAM: + case memregion_SWRAM9: + case memregion_SWRAM7: + case memregion_WRAM7: + compileFastPath = true; + break; + default: + break; + } + + if (!store) Comp_AddCycles_CDI(); + else + Comp_AddCycles_CD(); - if (decrement) + if (decrement) + { + MOV_sum(32, RSCRATCH4, MapReg(rn), Imm32(-regsCount * 4)); + preinc ^= true; + } + else + MOV(32, R(RSCRATCH4), MapReg(rn)); + + if (compileFastPath) + { + assert(!usermode); + + MOV(32, R(RSCRATCH), R(RSCRATCH4)); + SHR(32, R(RSCRATCH), Imm8(9)); + + if (store) { - MOV_sum(32, ABI_PARAM1, MapReg(rn), Imm32(-regsCount * 4)); - preinc ^= true; + CMP(8, MDisp(RSCRATCH, squeezePointer(Num == 0 ? MemoryStatus9 : MemoryStatus7)), Imm8(expectedTarget)); } else - MOV(32, R(ABI_PARAM1), MapReg(rn)); - - MOV(32, R(ABI_PARAM3), Imm32(regsCount)); - SUB(64, R(RSP), stackAlloc <= INT8_MAX ? Imm8(stackAlloc) : Imm32(stackAlloc)); - MOV(64, R(ABI_PARAM2), R(RSP)); - - CALL(Num == 0 - ? MemoryFuncsSeq9[0][preinc] - : MemoryFuncsSeq7[0][preinc][CodeRegion == 0x02]); + { + MOVZX(32, 8, RSCRATCH, MDisp(RSCRATCH, squeezePointer(Num == 0 ? MemoryStatus9 : MemoryStatus7))); + AND(32, R(RSCRATCH), Imm8(~0x80)); + CMP(32, R(RSCRATCH), Imm8(expectedTarget)); + } + FixupBranch slowPath = J_CC(CC_NE, true); - bool firstUserMode = true; - for (int reg = 15; reg >= 0; reg--) + if (expectedTarget == memregion_DTCM) { - if (regs[reg]) + SUB(32, R(RSCRATCH4), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); + AND(32, R(RSCRATCH4), Imm32(0x3FFF & ~3)); + LEA(64, RSCRATCH4, MComplex(RCPU, RSCRATCH4, 1, offsetof(ARMv5, DTCM))); + } + else if (expectedTarget == memregion_MainRAM) + { + AND(32, R(RSCRATCH4), Imm32((MAIN_RAM_SIZE - 1) & ~3)); + ADD(64, R(RSCRATCH4), Imm32(squeezePointer(NDS::MainRAM))); + } + else if (expectedTarget == memregion_WRAM7) + { + AND(32, R(RSCRATCH4), Imm32(0xFFFF & ~3)); + ADD(64, R(RSCRATCH4), Imm32(squeezePointer(NDS::ARM7WRAM))); + } + else // SWRAM + { + AND(32, R(RSCRATCH4), Imm8(~3)); + AND(32, R(RSCRATCH4), M(expectedTarget == memregion_SWRAM9 ? &NDS::SWRAM_ARM9Mask : &NDS::SWRAM_ARM7Mask)); + ADD(64, R(RSCRATCH4), M(expectedTarget == memregion_SWRAM9 ? &NDS::SWRAM_ARM9 : &NDS::SWRAM_ARM7)); + } + u32 offset = 0; + for (int reg : regs) + { + if (preinc) + offset += 4; + OpArg mem = MDisp(RSCRATCH4, offset); + if (store) { - if (usermode && !regs[15] && reg >= 8 && reg < 15) + if (RegCache.LoadedRegs & (1 << reg)) { - if (firstUserMode) - { - MOV(32, R(RSCRATCH), R(RCPSR)); - AND(32, R(RSCRATCH), Imm8(0x1F)); - firstUserMode = false; - } - MOV(32, R(RSCRATCH2), Imm32(reg - 8)); - POP(RSCRATCH3); - CALL(WriteBanked); - FixupBranch sucessfulWritten = J_CC(CC_NC); - if (RegCache.Mapping[reg] != INVALID_REG) - MOV(32, R(RegCache.Mapping[reg]), R(RSCRATCH3)); - else - SaveReg(reg, RSCRATCH3); - SetJumpTarget(sucessfulWritten); + MOV(32, mem, MapReg(reg)); } - else if (RegCache.Mapping[reg] == INVALID_REG) + else { - assert(reg != 15); - - POP(RSCRATCH); - SaveReg(reg, RSCRATCH); + LoadReg(reg, RSCRATCH); + MOV(32, mem, R(RSCRATCH)); + } + } + else + { + if (RegCache.LoadedRegs & (1 << reg)) + { + MOV(32, MapReg(reg), mem); } else { - if (reg != 15) - RegCache.DirtyRegs |= (1 << reg); - POP(MapReg(reg).GetSimpleReg()); + MOV(32, R(RSCRATCH), mem); + SaveReg(reg, RSCRATCH); } } + if (!preinc) + offset += 4; } - if (regsCount & 1) - POP(RSCRATCH); + SwitchToFarCode(); + SetJumpTarget(slowPath); + } + + if (!store) + { + MOV(32, R(ABI_PARAM1), R(RSCRATCH4)); + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); + SUB(64, R(RSP), stackAlloc <= INT8_MAX ? Imm8(stackAlloc) : Imm32(stackAlloc)); + if (allocOffset == 0) + MOV(64, R(ABI_PARAM2), R(RSP)); + else + LEA(64, ABI_PARAM2, MDisp(RSP, allocOffset)); + + if (Num == 0) + MOV(64, R(ABI_PARAM4), R(RCPU)); - if (regs[15]) + switch (Num * 2 | preinc) { - if (Num == 1) - { - if (Thumb) - OR(32, MapReg(15), Imm8(1)); - else - AND(32, MapReg(15), Imm8(0xFE)); - } - Comp_JumpTo(MapReg(15).GetSimpleReg(), usermode); + case 0: CALL((void*)&SlowBlockTransfer9); break; + case 1: CALL((void*)&SlowBlockTransfer9); break; + case 2: CALL((void*)&SlowBlockTransfer7); break; + case 3: CALL((void*)&SlowBlockTransfer7); break; } - } - else - { - Comp_AddCycles_CD(); - if (regsCount & 1) - PUSH(RSCRATCH); + if (allocOffset) + ADD(64, R(RSP), Imm8(allocOffset)); bool firstUserMode = true; for (int reg : regs) { - if (usermode && reg >= 8 && reg < 15) + if (usermode && !regs[15] && reg >= 8 && reg < 15) { if (firstUserMode) { @@ -599,43 +594,107 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc AND(32, R(RSCRATCH), Imm8(0x1F)); firstUserMode = false; } - if (RegCache.Mapping[reg] == INVALID_REG) - LoadReg(reg, RSCRATCH3); - else - MOV(32, R(RSCRATCH3), R(RegCache.Mapping[reg])); MOV(32, R(RSCRATCH2), Imm32(reg - 8)); - CALL(ReadBanked); - PUSH(RSCRATCH3); + POP(RSCRATCH3); + CALL(WriteBanked); + FixupBranch sucessfulWritten = J_CC(CC_NC); + if (RegCache.LoadedRegs & (1 << reg)) + MOV(32, R(RegCache.Mapping[reg]), R(RSCRATCH3)); + else + SaveReg(reg, RSCRATCH3); + SetJumpTarget(sucessfulWritten); } - else if (RegCache.Mapping[reg] == INVALID_REG) + else if (!(RegCache.LoadedRegs & (1 << reg))) { - LoadReg(reg, RSCRATCH); - PUSH(RSCRATCH); + assert(reg != 15); + + POP(RSCRATCH); + SaveReg(reg, RSCRATCH); } else { - PUSH(MapReg(reg).GetSimpleReg()); + POP(MapReg(reg).GetSimpleReg()); } } - - if (decrement) + } + else + { + bool firstUserMode = true; + for (int reg = 15; reg >= 0; reg--) { - MOV_sum(32, ABI_PARAM1, MapReg(rn), Imm32(-regsCount * 4)); - preinc ^= true; + if (regs[reg]) + { + if (usermode && reg >= 8 && reg < 15) + { + if (firstUserMode) + { + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + firstUserMode = false; + } + if (RegCache.Mapping[reg] == INVALID_REG) + LoadReg(reg, RSCRATCH3); + else + MOV(32, R(RSCRATCH3), R(RegCache.Mapping[reg])); + MOV(32, R(RSCRATCH2), Imm32(reg - 8)); + CALL(ReadBanked); + PUSH(RSCRATCH3); + } + else if (!(RegCache.LoadedRegs & (1 << reg))) + { + LoadReg(reg, RSCRATCH); + PUSH(RSCRATCH); + } + else + { + PUSH(MapReg(reg).GetSimpleReg()); + } + } } - else - MOV(32, R(ABI_PARAM1), MapReg(rn)); - MOV(64, R(ABI_PARAM2), R(RSP)); + if (allocOffset) + SUB(64, R(RSP), Imm8(allocOffset)); + + MOV(32, R(ABI_PARAM1), R(RSCRATCH4)); + if (allocOffset) + LEA(64, ABI_PARAM2, MDisp(RSP, allocOffset)); + else + MOV(64, R(ABI_PARAM2), R(RSP)); + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); + if (Num == 0) + MOV(64, R(ABI_PARAM4), R(RCPU)); - CALL(Num == 0 - ? MemoryFuncsSeq9[1][preinc] - : MemoryFuncsSeq7[1][preinc][CodeRegion == 0x02]); + switch (Num * 2 | preinc) + { + case 0: CALL((void*)&SlowBlockTransfer9); break; + case 1: CALL((void*)&SlowBlockTransfer9); break; + case 2: CALL((void*)&SlowBlockTransfer7); break; + case 3: CALL((void*)&SlowBlockTransfer7); break; + } ADD(64, R(RSP), stackAlloc <= INT8_MAX ? Imm8(stackAlloc) : Imm32(stackAlloc)); } + if (compileFastPath) + { + FixupBranch ret = J(true); + SwitchToNearCode(); + SetJumpTarget(ret); + } + + if (!store && regs[15]) + { + if (Num == 1) + { + if (Thumb) + OR(32, MapReg(15), Imm8(1)); + else + AND(32, MapReg(15), Imm8(0xFE)); + } + Comp_JumpTo(MapReg(15).GetSimpleReg(), usermode); + } + return offset; } @@ -786,9 +845,7 @@ void Compiler::T_Comp_LoadPCRel() { u32 offset = (CurInstr.Instr & 0xFF) << 2; u32 addr = (R15 & ~0x2) + offset; - if (Config::JIT_LiteralOptimisations) - Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr); - else + if (!Config::JIT_LiteralOptimisations || !Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr)) Comp_MemAccess(CurInstr.T_Reg(8), 15, ComplexOperand(offset), 32, 0); } diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 28362d9..b50e821 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -373,16 +373,16 @@ Info Decode(bool thumb, u32 num, u32 instr) if (res.Kind == tk_LDMIA || res.Kind == tk_POP) { - u32 set = (instr & 0xFF) & ~(res.DstRegs|res.SrcRegs); - res.NotStrictlyNeeded |= set; + u32 set = (instr & 0xFF); + res.NotStrictlyNeeded |= set & ~(res.DstRegs|res.SrcRegs); res.DstRegs |= set; } if (res.Kind == tk_STMIA || res.Kind == tk_PUSH) { - u32 set = (instr & 0xFF) & ~(res.DstRegs|res.SrcRegs); + u32 set = (instr & 0xFF); if (res.Kind == tk_PUSH && instr & (1 << 8)) set |= (1 << 14); - res.NotStrictlyNeeded |= set; + res.NotStrictlyNeeded |= set & ~(res.DstRegs|res.SrcRegs); res.SrcRegs |= set; } @@ -495,15 +495,15 @@ Info Decode(bool thumb, u32 num, u32 instr) if (res.Kind == ak_LDM) { - u16 set = (instr & 0xFFFF) & ~(res.SrcRegs|res.DstRegs|(1<<15)); + u16 set = (instr & 0xFFFF); + res.NotStrictlyNeeded |= set & ~(res.SrcRegs|res.DstRegs|(1<<15)); res.DstRegs |= set; - res.NotStrictlyNeeded |= set; } if (res.Kind == ak_STM) { - u16 set = (instr & 0xFFFF) & ~(res.SrcRegs|res.DstRegs|(1<<15)); + u16 set = (instr & 0xFFFF); + res.NotStrictlyNeeded |= set & ~(res.SrcRegs|res.DstRegs|(1<<15)); res.SrcRegs |= set; - res.NotStrictlyNeeded |= set; } if ((instr >> 28) < 0xE) diff --git a/src/CP15.cpp b/src/CP15.cpp index 62258e9..e665dbd 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -97,6 +97,10 @@ void ARMv5::CP15DoSavestate(Savestate* file) void ARMv5::UpdateDTCMSetting() { +#ifdef JIT_ENABLED + u32 oldDTCMBase = DTCMBase; + u32 oldDTCMSize = DTCMSize; +#endif if (CP15Control & (1<<16)) { DTCMBase = DTCMSetting & 0xFFFFF000; @@ -109,10 +113,20 @@ void ARMv5::UpdateDTCMSetting() DTCMSize = 0; //printf("DTCM disabled\n"); } +#ifdef JIT_ENABLED + if (oldDTCMBase != DTCMBase || oldDTCMSize != DTCMSize) + { + ARMJIT::UpdateMemoryStatus9(oldDTCMBase, oldDTCMBase + oldDTCMSize); + ARMJIT::UpdateMemoryStatus9(DTCMBase, DTCMBase + DTCMSize); + } +#endif } void ARMv5::UpdateITCMSetting() { +#ifdef JIT_ENABLED + u32 oldITCMSize = ITCMSize; +#endif if (CP15Control & (1<<18)) { ITCMSize = 0x200 << ((ITCMSetting >> 1) & 0x1F); @@ -123,6 +137,10 @@ void ARMv5::UpdateITCMSetting() ITCMSize = 0; //printf("ITCM disabled\n"); } +#ifdef JIT_ENABLED + if (oldITCMSize != ITCMSize) + ARMJIT::UpdateMemoryStatus9(0, std::max(oldITCMSize, ITCMSize)); +#endif } @@ -561,15 +579,9 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x750: -#ifdef JIT_ENABLED - ARMJIT::InvalidateAll(); -#endif ICacheInvalidateAll(); return; case 0x751: -#ifdef JIT_ENABLED - ARMJIT::InvalidateByAddr(ARMJIT::TranslateAddr<0>(val)); -#endif ICacheInvalidateByAddr(val); return; case 0x752: @@ -732,7 +744,7 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) void ARMv5::DataRead8(u32 addr, u32* val) { - DataRegion = addr >> 12; + DataRegion = addr; if (addr < ITCMSize) { @@ -753,7 +765,7 @@ void ARMv5::DataRead8(u32 addr, u32* val) void ARMv5::DataRead16(u32 addr, u32* val) { - DataRegion = addr >> 12; + DataRegion = addr; addr &= ~1; @@ -776,7 +788,7 @@ void ARMv5::DataRead16(u32 addr, u32* val) void ARMv5::DataRead32(u32 addr, u32* val) { - DataRegion = addr >> 12; + DataRegion = addr; addr &= ~3; @@ -820,14 +832,14 @@ void ARMv5::DataRead32S(u32 addr, u32* val) void ARMv5::DataWrite8(u32 addr, u8 val) { - DataRegion = addr >> 12; + DataRegion = addr; if (addr < ITCMSize) { DataCycles = 1; *(u8*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCM(addr & 0x7FFF); + ARMJIT::InvalidateITCMIfNecessary(addr); #endif return; } @@ -844,7 +856,7 @@ void ARMv5::DataWrite8(u32 addr, u8 val) void ARMv5::DataWrite16(u32 addr, u16 val) { - DataRegion = addr >> 12; + DataRegion = addr; addr &= ~1; @@ -853,7 +865,7 @@ void ARMv5::DataWrite16(u32 addr, u16 val) DataCycles = 1; *(u16*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCM(addr & 0x7FFF); + ARMJIT::InvalidateITCMIfNecessary(addr); #endif return; } @@ -870,7 +882,7 @@ void ARMv5::DataWrite16(u32 addr, u16 val) void ARMv5::DataWrite32(u32 addr, u32 val) { - DataRegion = addr >> 12; + DataRegion = addr; addr &= ~3; @@ -879,7 +891,7 @@ void ARMv5::DataWrite32(u32 addr, u32 val) DataCycles = 1; *(u32*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCM(addr & 0x7FFF); + ARMJIT::InvalidateITCMIfNecessary(addr); #endif return; } @@ -903,7 +915,7 @@ void ARMv5::DataWrite32S(u32 addr, u32 val) DataCycles += 1; *(u32*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCM(addr & 0x7FFF); + ARMJIT::InvalidateITCMIfNecessary(addr); #endif return; } diff --git a/src/NDS.cpp b/src/NDS.cpp index 141c565..6e989a8 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -535,10 +535,6 @@ void Reset() KeyCnt = 0; RCnt = 0; -#ifdef JIT_ENABLED - ARMJIT::ResetBlockCache(); -#endif - NDSCart::Reset(); GBACart::Reset(); GPU::Reset(); @@ -548,6 +544,10 @@ void Reset() Wifi::Reset(); AREngine::Reset(); + +#ifdef JIT_ENABLED + ARMJIT::Reset(); +#endif } void Stop() @@ -1058,6 +1058,9 @@ void Halt() void MapSharedWRAM(u8 val) { + if (val == WRAMCnt) + return; + WRAMCnt = val; switch (WRAMCnt & 0x3) @@ -1090,6 +1093,11 @@ void MapSharedWRAM(u8 val) SWRAM_ARM7Mask = 0x7FFF; break; } + +#ifdef JIT_ENABLED + ARMJIT::UpdateMemoryStatus9(0x3000000, 0x3000000 + 0x1000000); + ARMJIT::UpdateMemoryStatus7(0x3000000, 0x3000000 + 0x1000000); +#endif } @@ -1873,12 +1881,18 @@ void ARM9Write8(u32 addr, u8 val) switch (addr & 0xFF000000) { case 0x02000000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateMainRAMIfNecessary(addr); +#endif *(u8*)&MainRAM[addr & (MAIN_RAM_SIZE - 1)] = val; return; case 0x03000000: if (SWRAM_ARM9) { +#ifdef JIT_ENABLED + ARMJIT::InvalidateSWRAM9IfNecessary(addr); +#endif *(u8*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; } return; @@ -1923,12 +1937,18 @@ void ARM9Write16(u32 addr, u16 val) switch (addr & 0xFF000000) { case 0x02000000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateMainRAMIfNecessary(addr); +#endif *(u16*)&MainRAM[addr & (MAIN_RAM_SIZE - 1)] = val; return; case 0x03000000: if (SWRAM_ARM9) { +#ifdef JIT_ENABLED + ARMJIT::InvalidateSWRAM9IfNecessary(addr); +#endif *(u16*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; } return; @@ -1949,7 +1969,12 @@ void ARM9Write16(u32 addr, u16 val) case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; - default: GPU::WriteVRAM_LCDC(addr, val); return; + default: +#ifdef JIT_ENABLED + ARMJIT::InvalidateLCDCIfNecessary(addr); +#endif + GPU::WriteVRAM_LCDC(addr, val); + return; } case 0x07000000: @@ -1989,12 +2014,18 @@ void ARM9Write32(u32 addr, u32 val) switch (addr & 0xFF000000) { case 0x02000000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateMainRAMIfNecessary(addr); +#endif *(u32*)&MainRAM[addr & (MAIN_RAM_SIZE - 1)] = val; return ; case 0x03000000: if (SWRAM_ARM9) { +#ifdef JIT_ENABLED + ARMJIT::InvalidateSWRAM9IfNecessary(addr); +#endif *(u32*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; } return; @@ -2015,7 +2046,12 @@ void ARM9Write32(u32 addr, u32 val) case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; - default: GPU::WriteVRAM_LCDC(addr, val); return; + default: +#ifdef JIT_ENABLED + ARMJIT::InvalidateLCDCIfNecessary(addr); +#endif + GPU::WriteVRAM_LCDC(addr, val); + return; } case 0x07000000: @@ -2279,30 +2315,38 @@ u32 ARM7Read32(u32 addr) void ARM7Write8(u32 addr, u8 val) { -#ifdef JIT_ENABLED - ARMJIT::InvalidateByAddr7(addr); -#endif - switch (addr & 0xFF800000) { case 0x02000000: case 0x02800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateMainRAMIfNecessary(addr); +#endif *(u8*)&MainRAM[addr & (MAIN_RAM_SIZE - 1)] = val; return; case 0x03000000: if (SWRAM_ARM7) { +#ifdef JIT_ENABLED + ARMJIT::InvalidateSWRAM7IfNecessary(addr); +#endif *(u8*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; return; } else { +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WRAMIfNecessary(addr); +#endif *(u8*)&ARM7WRAM[addr & 0xFFFF] = val; return; } case 0x03800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WRAMIfNecessary(addr); +#endif *(u8*)&ARM7WRAM[addr & 0xFFFF] = val; return; @@ -2312,6 +2356,9 @@ void ARM7Write8(u32 addr, u8 val) case 0x06000000: case 0x06800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WVRAMIfNecessary(addr); +#endif GPU::WriteVRAM_ARM7(addr, val); return; @@ -2342,30 +2389,38 @@ void ARM7Write8(u32 addr, u8 val) void ARM7Write16(u32 addr, u16 val) { -#ifdef JIT_ENABLED - ARMJIT::InvalidateByAddr7(addr); -#endif - switch (addr & 0xFF800000) { case 0x02000000: case 0x02800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateMainRAMIfNecessary(addr); +#endif *(u16*)&MainRAM[addr & (MAIN_RAM_SIZE - 1)] = val; return; case 0x03000000: if (SWRAM_ARM7) { +#ifdef JIT_ENABLED + ARMJIT::InvalidateSWRAM7IfNecessary(addr); +#endif *(u16*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; return; } else { +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WRAMIfNecessary(addr); +#endif *(u16*)&ARM7WRAM[addr & 0xFFFF] = val; return; } case 0x03800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WRAMIfNecessary(addr); +#endif *(u16*)&ARM7WRAM[addr & 0xFFFF] = val; return; @@ -2383,6 +2438,9 @@ void ARM7Write16(u32 addr, u16 val) case 0x06000000: case 0x06800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WVRAMIfNecessary(addr); +#endif GPU::WriteVRAM_ARM7(addr, val); return; @@ -2415,30 +2473,38 @@ void ARM7Write16(u32 addr, u16 val) void ARM7Write32(u32 addr, u32 val) { -#ifdef JIT_ENABLED - ARMJIT::InvalidateByAddr7(addr); -#endif - switch (addr & 0xFF800000) { case 0x02000000: case 0x02800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateMainRAMIfNecessary(addr); +#endif *(u32*)&MainRAM[addr & (MAIN_RAM_SIZE - 1)] = val; return; case 0x03000000: if (SWRAM_ARM7) { +#ifdef JIT_ENABLED + ARMJIT::InvalidateSWRAM7IfNecessary(addr); +#endif *(u32*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; return; } else { +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WRAMIfNecessary(addr); +#endif *(u32*)&ARM7WRAM[addr & 0xFFFF] = val; return; } case 0x03800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WRAMIfNecessary(addr); +#endif *(u32*)&ARM7WRAM[addr & 0xFFFF] = val; return; @@ -2457,6 +2523,9 @@ void ARM7Write32(u32 addr, u32 val) case 0x06000000: case 0x06800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WVRAMIfNecessary(addr); +#endif GPU::WriteVRAM_ARM7(addr, val); return; diff --git a/src/NDS.h b/src/NDS.h index c7b455e..163260b 100644 --- a/src/NDS.h +++ b/src/NDS.h @@ -120,6 +120,14 @@ extern u8 ROMSeed1[2*8]; extern u8 ARM9BIOS[0x1000]; extern u8 ARM7BIOS[0x4000]; +extern u8 SharedWRAM[0x8000]; +extern u8* SWRAM_ARM9; +extern u8* SWRAM_ARM7; +extern u32 SWRAM_ARM9Mask; +extern u32 SWRAM_ARM7Mask; + +extern u8 ARM7WRAM[0x10000]; + #define MAIN_RAM_SIZE 0x400000 extern u8 MainRAM[MAIN_RAM_SIZE]; -- cgit v1.2.3