diff options
-rw-r--r-- | src/ARM.cpp | 10 | ||||
-rw-r--r-- | src/ARM.h | 24 | ||||
-rw-r--r-- | src/ARMJIT.cpp | 905 | ||||
-rw-r--r-- | src/ARMJIT.h | 65 | ||||
-rw-r--r-- | src/ARMJIT_A64/ARMJIT_Compiler.cpp | 4 | ||||
-rw-r--r-- | src/ARMJIT_Internal.h | 68 | ||||
-rw-r--r-- | src/ARMJIT_RegisterCache.h | 18 | ||||
-rw-r--r-- | src/ARMJIT_x64/ARMJIT_Compiler.cpp | 43 | ||||
-rw-r--r-- | src/ARMJIT_x64/ARMJIT_Compiler.h | 34 | ||||
-rw-r--r-- | src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 935 | ||||
-rw-r--r-- | src/ARM_InstrInfo.cpp | 16 | ||||
-rw-r--r-- | src/CP15.cpp | 44 | ||||
-rw-r--r-- | src/NDS.cpp | 105 | ||||
-rw-r--r-- | src/NDS.h | 8 |
14 files changed, 1465 insertions, 814 deletions
diff --git a/src/ARM.cpp b/src/ARM.cpp index 95d2b8b..205332d 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -579,7 +579,8 @@ void ARMv5::ExecuteJIT() while (NDS::ARM9Timestamp < NDS::ARM9Target) { u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); - if (!ARMJIT::IsMapped<0>(instrAddr)) + u32 translatedAddr = ARMJIT::TranslateAddr9(instrAddr); + if (!translatedAddr) { NDS::ARM9Timestamp = NDS::ARM9Target; printf("ARMv5 PC in non executable region %08X\n", R[15]); @@ -589,7 +590,7 @@ void ARMv5::ExecuteJIT() // hack so Cycles <= 0 becomes Cycles < 0 Cycles = NDS::ARM9Target - NDS::ARM9Timestamp - 1; - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry(ARMJIT::TranslateAddr<0>(instrAddr)); + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry<0>(translatedAddr); if (block) ARM_Dispatch(this, block); else @@ -722,7 +723,8 @@ void ARMv4::ExecuteJIT() while (NDS::ARM7Timestamp < NDS::ARM7Target) { u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); - if (!ARMJIT::IsMapped<1>(instrAddr)) + u32 translatedAddr = ARMJIT::TranslateAddr7(instrAddr); + if (!translatedAddr) { NDS::ARM7Timestamp = NDS::ARM7Target; printf("ARMv4 PC in non executable region %08X\n", R[15]); @@ -731,7 +733,7 @@ void ARMv4::ExecuteJIT() Cycles = NDS::ARM7Target - NDS::ARM7Timestamp - 1; - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry(ARMJIT::TranslateAddr<1>(instrAddr)); + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry<1>(translatedAddr); if (block) ARM_Dispatch(this, block); else @@ -308,7 +308,7 @@ public: void DataRead8(u32 addr, u32* val) { *val = NDS::ARM7Read8(addr); - DataRegion = addr >> 20; + DataRegion = addr; DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } @@ -317,7 +317,7 @@ public: addr &= ~1; *val = NDS::ARM7Read16(addr); - DataRegion = addr >> 20; + DataRegion = addr; DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } @@ -326,7 +326,7 @@ public: addr &= ~3; *val = NDS::ARM7Read32(addr); - DataRegion = addr >> 20; + DataRegion = addr; DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } @@ -341,7 +341,7 @@ public: void DataWrite8(u32 addr, u8 val) { NDS::ARM7Write8(addr, val); - DataRegion = addr >> 20; + DataRegion = addr; DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } @@ -350,7 +350,7 @@ public: addr &= ~1; NDS::ARM7Write16(addr, val); - DataRegion = addr >> 20; + DataRegion = addr; DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } @@ -359,7 +359,7 @@ public: addr &= ~3; NDS::ARM7Write32(addr, val); - DataRegion = addr >> 20; + DataRegion = addr; DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } @@ -390,7 +390,7 @@ public: s32 numC = NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2]; s32 numD = DataCycles; - if ((DataRegion >> 4) == 0x02) // mainRAM + if ((DataRegion >> 24) == 0x02) // mainRAM { if (CodeRegion == 0x02) Cycles -= numC + numD; @@ -417,7 +417,7 @@ public: s32 numC = NDS::ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2]; s32 numD = DataCycles; - if ((DataRegion >> 4) == 0x02) + if ((DataRegion >> 24) == 0x02) { if (CodeRegion == 0x02) Cycles -= numC + numD; @@ -443,4 +443,12 @@ void T_UNK(ARM* cpu); } +namespace NDS +{ + +extern ARMv5* ARM9; +extern ARMv4* ARM7; + +} + #endif // ARM_H diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 46f71f1..9602aed 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -23,6 +23,7 @@ #include "ARMInterpreter_Branch.h" #include "ARMInterpreter.h" +#include "GPU.h" #include "GPU3D.h" #include "SPU.h" #include "Wifi.h" @@ -34,9 +35,10 @@ namespace ARMJIT #define JIT_DEBUGPRINT(msg, ...) //#define JIT_DEBUGPRINT(msg, ...) printf(msg, ## __VA_ARGS__) -Compiler* compiler; +Compiler* JITCompiler; -const u32 ExeMemRegionSizes[] = { +const u32 ExeMemRegionSizes[] = +{ 0x8000, // Unmapped Region (dummy) 0x8000, // ITCM 4*1024*1024, // Main RAM @@ -48,7 +50,8 @@ const u32 ExeMemRegionSizes[] = { 0x40000 // ARM7 WVRAM }; -const u32 ExeMemRegionOffsets[] = { +const u32 ExeMemRegionOffsets[] = +{ 0, 0x8000, 0x10000, @@ -61,65 +64,391 @@ const u32 ExeMemRegionOffsets[] = { 0x518000, }; -#define DUP2(x) x, x - -const static ExeMemKind JIT_MEM[2][32] = { - //arm9 - { - /* 0X*/ DUP2(exeMem_ITCM), - /* 1X*/ DUP2(exeMem_ITCM), // mirror - /* 2X*/ DUP2(exeMem_MainRAM), - /* 3X*/ DUP2(exeMem_SWRAM), - /* 4X*/ DUP2(exeMem_Unmapped), - /* 5X*/ DUP2(exeMem_Unmapped), - /* 6X*/ exeMem_Unmapped, - exeMem_LCDC, // Plain ARM9-CPU Access (LCDC mode) (max 656KB) - /* 7X*/ DUP2(exeMem_Unmapped), - /* 8X*/ DUP2(exeMem_Unmapped), - /* 9X*/ DUP2(exeMem_Unmapped), - /* AX*/ DUP2(exeMem_Unmapped), - /* BX*/ DUP2(exeMem_Unmapped), - /* CX*/ DUP2(exeMem_Unmapped), - /* DX*/ DUP2(exeMem_Unmapped), - /* EX*/ DUP2(exeMem_Unmapped), - /* FX*/ DUP2(exeMem_ARM9_BIOS) - }, - //arm7 - { - /* 0X*/ DUP2(exeMem_ARM7_BIOS), - /* 1X*/ DUP2(exeMem_Unmapped), - /* 2X*/ DUP2(exeMem_MainRAM), - /* 3X*/ exeMem_SWRAM, - exeMem_ARM7_WRAM, - /* 4X*/ DUP2(exeMem_Unmapped), - /* 5X*/ DUP2(exeMem_Unmapped), - /* 6X*/ DUP2(exeMem_ARM7_WVRAM), /* contrary to Gbatek, melonDS and itself, - DeSmuME doesn't mirror the 64 MB region at 0x6800000 */ - /* 7X*/ DUP2(exeMem_Unmapped), - /* 8X*/ DUP2(exeMem_Unmapped), - /* 9X*/ DUP2(exeMem_Unmapped), - /* AX*/ DUP2(exeMem_Unmapped), - /* BX*/ DUP2(exeMem_Unmapped), - /* CX*/ DUP2(exeMem_Unmapped), - /* DX*/ DUP2(exeMem_Unmapped), - /* EX*/ DUP2(exeMem_Unmapped), - /* FX*/ DUP2(exeMem_Unmapped) - } -}; - -#undef DUP2 - /* translates address to pseudo physical address - more compact, eliminates mirroring, everything comes in a row - we only need one translation table */ -u32 AddrTranslate9[0x2000]; -u32 AddrTranslate7[0x4000]; + +u32 TranslateAddr9(u32 addr) +{ + switch (ClassifyAddress9(addr)) + { + case memregion_MainRAM: return ExeMemRegionOffsets[exeMem_MainRAM] + (addr & (MAIN_RAM_SIZE - 1)); + case memregion_SWRAM9: + if (NDS::SWRAM_ARM9) + return ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM9 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM9Mask); + else + return 0; + case memregion_ITCM: return ExeMemRegionOffsets[exeMem_ITCM] + (addr & 0x7FFF); + case memregion_VRAM: return (addr >= 0x6800000 && addr < 0x68A4000) ? ExeMemRegionOffsets[exeMem_LCDC] + (addr - 0x6800000) : 0; + case memregion_BIOS9: return ExeMemRegionOffsets[exeMem_ARM9_BIOS] + (addr & 0xFFF); + default: return 0; + } +} + +u32 TranslateAddr7(u32 addr) +{ + switch (ClassifyAddress7(addr)) + { + case memregion_MainRAM: return ExeMemRegionOffsets[exeMem_MainRAM] + (addr & (MAIN_RAM_SIZE - 1)); + case memregion_SWRAM7: + if (NDS::SWRAM_ARM7) + return ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM7 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM7Mask); + else + return 0; + case memregion_BIOS7: return ExeMemRegionOffsets[exeMem_ARM7_BIOS] + addr; + case memregion_WRAM7: return ExeMemRegionOffsets[exeMem_ARM7_WRAM] + (addr & 0xFFFF); + case memregion_VWRAM: return ExeMemRegionOffsets[exeMem_ARM7_WVRAM] + (addr & 0x1FFFF); + default: return 0; + } +} AddressRange CodeRanges[ExeMemSpaceSize / 512]; -std::unordered_map<u32, JitBlock*> JitBlocks; +TinyVector<u32> InvalidLiterals; + +std::unordered_map<u32, JitBlock*> JitBlocks9; +std::unordered_map<u32, JitBlock*> JitBlocks7; + +u8 MemoryStatus9[0x800000]; +u8 MemoryStatus7[0x800000]; + +int ClassifyAddress9(u32 addr) +{ + if (addr < NDS::ARM9->ITCMSize) + return memregion_ITCM; + else if (addr >= NDS::ARM9->DTCMBase && addr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) + return memregion_DTCM; + else if ((addr & 0xFFFFF000) == 0xFFFF0000) + return memregion_BIOS9; + else + { + switch (addr & 0xFF000000) + { + case 0x02000000: + return memregion_MainRAM; + case 0x03000000: + return memregion_SWRAM9; + case 0x04000000: + return memregion_IO9; + case 0x06000000: + return memregion_VRAM; + } + } + return memregion_Other; +} + +int ClassifyAddress7(u32 addr) +{ + if (addr < 0x00004000) + return memregion_BIOS7; + else + { + switch (addr & 0xFF800000) + { + case 0x02000000: + case 0x02800000: + return memregion_MainRAM; + case 0x03000000: + if (NDS::SWRAM_ARM7) + return memregion_SWRAM7; + else + return memregion_WRAM7; + case 0x03800000: + return memregion_WRAM7; + case 0x04000000: + return memregion_IO7; + case 0x04800000: + return memregion_Wifi; + case 0x06000000: + case 0x06800000: + return memregion_VWRAM; + } + } + return memregion_Other; +} + +void UpdateMemoryStatus9(u32 start, u32 end) +{ + start >>= 12; + end >>= 12; + + if (end == 0xFFFFF) + end++; + + for (u32 i = start; i < end; i++) + { + u32 addr = i << 12; + + int region = ClassifyAddress9(addr); + u32 pseudoPhyisical = TranslateAddr9(addr); + + for (u32 j = 0; j < 8; j++) + { + u8 val = region; + if (CodeRanges[(pseudoPhyisical + (j << 12)) / 512].Blocks.Length) + val |= 0x80; + MemoryStatus9[i * 8 + j] = val; + } + } +} + +void UpdateMemoryStatus7(u32 start, u32 end) +{ + start >>= 12; + end >>= 12; + + if (end == 0xFFFFF) + end++; + + for (u32 i = start; i < end; i++) + { + u32 addr = i << 12; + + int region = ClassifyAddress7(addr); + u32 pseudoPhyisical = TranslateAddr7(addr); + + for (u32 j = 0; j < 8; j++) + { + u8 val = region; + if (CodeRanges[(pseudoPhyisical + (j << 12)) / 512].Blocks.Length) + val |= 0x80; + MemoryStatus7[i * 8 + j] = val; + } + } +} + +void UpdateRegionByPseudoPhyiscal(u32 addr, bool invalidate) +{ + for (u32 i = 1; i < exeMem_Count; i++) + { + if (addr >= ExeMemRegionOffsets[i] && addr < ExeMemRegionOffsets[i] + ExeMemRegionSizes[i]) + { + for (u32 num = 0; num < 2; num++) + { + u32 physSize = ExeMemRegionSizes[i]; + u32 mapSize = 0; + u32 mapStart = 0; + switch (i) + { + case exeMem_ITCM: + if (num == 0) + mapStart = 0; mapSize = NDS::ARM9->ITCMSize; + break; + case exeMem_MainRAM: mapStart = 0x2000000; mapSize = 0x1000000; break; + case exeMem_SWRAM: + if (num == 0) + { + if (NDS::SWRAM_ARM9) + mapStart = 0x3000000, mapSize = 0x1000000; + else + mapStart = mapSize = 0; + } + else + { + if (NDS::SWRAM_ARM7) + mapStart = 0x3000000, mapSize = 0x800000; + else + mapStart = mapSize = 0; + } + break; + case exeMem_LCDC: + if (num == 0) + mapStart = 0x6800000, mapSize = 0xA4000; + break; + case exeMem_ARM9_BIOS: + if (num == 0) + mapStart = 0xFFFF0000, mapSize = 0x10000; + break; + case exeMem_ARM7_BIOS: + if (num == 1) + mapStart = 0; mapSize = 0x4000; + break; + case exeMem_ARM7_WRAM: + if (num == 1) + { + if (NDS::SWRAM_ARM7) + mapStart = 0x3800000, mapSize = 0x800000; + else + mapStart = 0x3000000, mapSize = 0x1000000; + } + break; + case exeMem_ARM7_WVRAM: + if (num == 1) + mapStart = 0x6000000, mapSize = 0x1000000; + break; + } + + for (u32 j = 0; j < mapSize / physSize; j++) + { + u32 virtAddr = mapStart + physSize * j + (addr - ExeMemRegionOffsets[i]); + if (num == 0 + && virtAddr >= NDS::ARM9->DTCMBase && virtAddr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) + continue; + if (invalidate) + { + if (num == 0) + MemoryStatus9[virtAddr / 512] |= 0x80; + else + MemoryStatus7[virtAddr / 512] |= 0x80; + } + else + { + if (num == 0) + MemoryStatus9[virtAddr / 512] &= ~0x80; + else + MemoryStatus7[virtAddr / 512] &= ~0x80; + } + } + + } + return; + } + } + + assert(false); +} + +template <typename T> +T SlowRead9(ARMv5* cpu, u32 addr) +{ + u32 offset = addr & 0x3; + addr &= ~(sizeof(T) - 1); + + T val; + if (addr < cpu->ITCMSize) + val = *(T*)&cpu->ITCM[addr & 0x7FFF]; + else if (addr >= cpu->DTCMBase && addr < (cpu->DTCMBase + cpu->DTCMSize)) + val = *(T*)&cpu->DTCM[(addr - cpu->DTCMBase) & 0x3FFF]; + else if (std::is_same<T, u32>::value) + val = NDS::ARM9Read32(addr); + else if (std::is_same<T, u16>::value) + val = NDS::ARM9Read16(addr); + else + val = NDS::ARM9Read8(addr); + + if (std::is_same<T, u32>::value) + return ROR(val, offset << 3); + else + return val; +} + +template <typename T> +void SlowWrite9(ARMv5* cpu, u32 addr, T val) +{ + addr &= ~(sizeof(T) - 1); + + if (addr < cpu->ITCMSize) + { + InvalidateITCMIfNecessary(addr); + *(T*)&cpu->ITCM[addr & 0x7FFF] = val; + } + else if (addr >= cpu->DTCMBase && addr < (cpu->DTCMBase + cpu->DTCMSize)) + { + *(T*)&cpu->DTCM[(addr - cpu->DTCMBase) & 0x3FFF] = val; + } + else if (std::is_same<T, u32>::value) + { + NDS::ARM9Write32(addr, val); + } + else if (std::is_same<T, u16>::value) + { + NDS::ARM9Write16(addr, val); + } + else + { + NDS::ARM9Write8(addr, val); + } +} + +template void SlowWrite9<u32>(ARMv5*, u32, u32); +template void SlowWrite9<u16>(ARMv5*, u32, u16); +template void SlowWrite9<u8>(ARMv5*, u32, u8); + +template u32 SlowRead9<u32>(ARMv5*, u32); +template u16 SlowRead9<u16>(ARMv5*, u32); +template u8 SlowRead9<u8>(ARMv5*, u32); + +template <typename T> +T SlowRead7(u32 addr) +{ + u32 offset = addr & 0x3; + addr &= ~(sizeof(T) - 1); + + T val; + if (std::is_same<T, u32>::value) + val = NDS::ARM7Read32(addr); + else if (std::is_same<T, u16>::value) + val = NDS::ARM7Read16(addr); + else + val = NDS::ARM7Read8(addr); + + if (std::is_same<T, u32>::value) + return ROR(val, offset << 3); + else + return val; +} + +template <typename T> +void SlowWrite7(u32 addr, T val) +{ + addr &= ~(sizeof(T) - 1); + + if (std::is_same<T, u32>::value) + NDS::ARM7Write32(addr, val); + else if (std::is_same<T, u16>::value) + NDS::ARM7Write16(addr, val); + else + NDS::ARM7Write8(addr, val); +} + +template <bool PreInc, bool Write> +void SlowBlockTransfer9(u32 addr, u64* data, u32 num, ARMv5* cpu) +{ + addr &= ~0x3; + for (int i = 0; i < num; i++) + { + addr += PreInc * 4; + if (Write) + SlowWrite9<u32>(cpu, addr, data[i]); + else + data[i] = SlowRead9<u32>(cpu, addr); + addr += !PreInc * 4; + } +} + +template <bool PreInc, bool Write> +void SlowBlockTransfer7(u32 addr, u64* data, u32 num) +{ + addr &= ~0x3; + for (int i = 0; i < num; i++) + { + addr += PreInc * 4; + if (Write) + SlowWrite7<u32>(addr, data[i]); + else + data[i] = SlowRead7<u32>(addr); + addr += !PreInc * 4; + } +} + +template void SlowWrite7<u32>(u32, u32); +template void SlowWrite7<u16>(u32, u16); +template void SlowWrite7<u8>(u32, u8); + +template u32 SlowRead7<u32>(u32); +template u16 SlowRead7<u16>(u32); +template u8 SlowRead7<u8>(u32); + +template void SlowBlockTransfer9<false, false>(u32, u64*, u32, ARMv5*); +template void SlowBlockTransfer9<false, true>(u32, u64*, u32, ARMv5*); +template void SlowBlockTransfer9<true, false>(u32, u64*, u32, ARMv5*); +template void SlowBlockTransfer9<true, true>(u32, u64*, u32, ARMv5*); +template void SlowBlockTransfer7<false, false>(u32 addr, u64* data, u32 num); +template void SlowBlockTransfer7<false, true>(u32 addr, u64* data, u32 num); +template void SlowBlockTransfer7<true, false>(u32 addr, u64* data, u32 num); +template void SlowBlockTransfer7<true, true>(u32 addr, u64* data, u32 num); template <typename K, typename V, int Size, V InvalidValue> struct UnreliableHashTable @@ -211,31 +540,25 @@ struct UnreliableHashTable }; UnreliableHashTable<u32, JitBlock*, 0x800, nullptr> RestoreCandidates; -UnreliableHashTable<u32, u32, 0x1000, UINT32_MAX> FastBlockLookUp; +UnreliableHashTable<u32, u32, 0x800, UINT32_MAX> FastBlockLookUp9; +UnreliableHashTable<u32, u32, 0x800, UINT32_MAX> FastBlockLookUp7; void Init() { - for (int i = 0; i < 0x2000; i++) - { - ExeMemKind kind = JIT_MEM[0][i >> 8]; - u32 size = ExeMemRegionSizes[kind]; - - AddrTranslate9[i] = ExeMemRegionOffsets[kind] + ((i << 15) & (size - 1)); - } - for (int i = 0; i < 0x4000; i++) - { - ExeMemKind kind = JIT_MEM[1][i >> 9]; - u32 size = ExeMemRegionSizes[kind]; - - AddrTranslate7[i] = ExeMemRegionOffsets[kind] + ((i << 14) & (size - 1)); - } - - compiler = new Compiler(); + JITCompiler = new Compiler(); } void DeInit() { - delete compiler; + delete JITCompiler; +} + +void Reset() +{ + ResetBlockCache(); + + UpdateMemoryStatus9(0, 0xFFFFFFFF); + UpdateMemoryStatus7(0, 0xFFFFFFFF); } void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) @@ -256,25 +579,31 @@ void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) } } -bool DecodeLiteral(const FetchedInstr& instr, u32& addr) +bool DecodeLiteral(bool thumb, const FetchedInstr& instr, u32& addr) { - switch (instr.Info.Kind) + if (!thumb) { - case ARMInstrInfo::ak_STR_IMM: - case ARMInstrInfo::ak_STRB_IMM: - addr = (instr.Addr + 8) + ((instr.Instr & 0xFFF) * (instr.Instr & (1 << 23) ? 1 : -1)); - return true; - case ARMInstrInfo::ak_STRD_IMM: - case ARMInstrInfo::ak_STRH_IMM: - addr = (instr.Addr + 8) + (((instr.Instr & 0xF00) >> 4 | (instr.Instr & 0xF)) * (instr.Instr & (1 << 23) ? 1 : -1)); - return true; - case ARMInstrInfo::ak_STM: // I honestly hope noone was ever crazy enough to do stm pc, {whatever} - addr = instr.Addr + 8; + switch (instr.Info.Kind) + { + case ARMInstrInfo::ak_LDR_IMM: + case ARMInstrInfo::ak_LDRB_IMM: + addr = (instr.Addr + 8) + ((instr.Instr & 0xFFF) * (instr.Instr & (1 << 23) ? 1 : -1)); + return true; + case ARMInstrInfo::ak_LDRH_IMM: + addr = (instr.Addr + 8) + (((instr.Instr & 0xF00) >> 4 | (instr.Instr & 0xF)) * (instr.Instr & (1 << 23) ? 1 : -1)); + return true; + default: + break; + } + } + else if (instr.Info.Kind == ARMInstrInfo::tk_LDR_PCREL) + { + addr = ((instr.Addr + 4) & ~0x2) + ((instr.Instr & 0xFF) << 2); return true; - default: - JIT_DEBUGPRINT("Literal %08x %x not recognised\n", instr.Instr, instr.Addr); - return false; } + + JIT_DEBUGPRINT("Literal %08x %x not recognised %d\n", instr.Instr, instr.Addr, instr.Info.Kind); + return false; } bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, bool hasLink, u32 lr, bool& link, @@ -453,6 +782,8 @@ InterpreterFunc InterpretTHUMB[ARMInstrInfo::tk_Count] = }; #undef F + +extern u32 literalsPerBlock; void CompileBlock(ARM* cpu) { bool thumb = cpu->CPSR & 0x20; @@ -463,31 +794,33 @@ void CompileBlock(ARM* cpu) Config::JIT_MaxBlockSize = 32; u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4); - if (!(cpu->Num == 0 - ? IsMapped<0>(blockAddr) - : IsMapped<1>(blockAddr))) + u32 pseudoPhysicalAddr = cpu->Num == 0 + ? TranslateAddr9(blockAddr) + : TranslateAddr7(blockAddr); + if (pseudoPhysicalAddr < ExeMemRegionSizes[exeMem_Unmapped]) { printf("Trying to compile a block in unmapped memory: %x\n", blockAddr); } - u32 pseudoPhysicalAddr = cpu->Num == 0 - ? TranslateAddr<0>(blockAddr) - : TranslateAddr<1>(blockAddr); - FetchedInstr instrs[Config::JIT_MaxBlockSize]; int i = 0; u32 r15 = cpu->R[15]; - u32 addresseRanges[32] = {}; + u32 addressRanges[Config::JIT_MaxBlockSize]; + u32 addressMasks[Config::JIT_MaxBlockSize] = {0}; u32 numAddressRanges = 0; + u32 numLiterals = 0; + u32 literalLoadAddrs[Config::JIT_MaxBlockSize]; + // they are going to be hashed + u32 literalValues[Config::JIT_MaxBlockSize]; + u32 instrValues[Config::JIT_MaxBlockSize]; + cpu->FillPipeline(); u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; u32 nextInstrAddr[2] = {blockAddr, r15}; - JIT_DEBUGPRINT("start block %x %08x (%x) (region invalidates %dx)\n", - blockAddr, cpu->CPSR, pseudoPhysicalAddr, - CodeRanges[pseudoPhysicalAddr / 512].TimesInvalidated); + JIT_DEBUGPRINT("start block %x %08x (%x)\n", blockAddr, cpu->CPSR, pseudoPhysicalAddr); u32 lastSegmentStart = blockAddr; u32 lr; @@ -507,23 +840,29 @@ void CompileBlock(ARM* cpu) nextInstrAddr[1] = r15; JIT_DEBUGPRINT("instr %08x %x\n", instrs[i].Instr & (thumb ? 0xFFFF : ~0), instrs[i].Addr); - u32 translatedAddr = (cpu->Num == 0 - ? TranslateAddr<0>(instrs[i].Addr) - : TranslateAddr<1>(instrs[i].Addr)) & ~0x1FF; - if (i == 0 || translatedAddr != addresseRanges[numAddressRanges - 1]) + instrValues[i] = instrs[i].Instr; + + u32 translatedAddr = cpu->Num == 0 + ? TranslateAddr9(instrs[i].Addr) + : TranslateAddr7(instrs[i].Addr); + u32 translatedAddrRounded = translatedAddr & ~0x1FF; + if (i == 0 || translatedAddrRounded != addressRanges[numAddressRanges - 1]) { bool returning = false; for (int j = 0; j < numAddressRanges; j++) { - if (addresseRanges[j] == translatedAddr) + if (addressRanges[j] == translatedAddrRounded) { + std::swap(addressRanges[j], addressRanges[numAddressRanges - 1]); + std::swap(addressMasks[j], addressMasks[numAddressRanges - 1]); returning = true; break; } } if (!returning) - addresseRanges[numAddressRanges++] = translatedAddr; + addressRanges[numAddressRanges++] = translatedAddrRounded; } + addressMasks[numAddressRanges - 1] |= 1 << ((translatedAddr & 0x1FF) / 16); if (cpu->Num == 0) { @@ -572,7 +911,8 @@ void CompileBlock(ARM* cpu) u32 icode = ((instrs[i].Instr >> 4) & 0xF) | ((instrs[i].Instr >> 16) & 0xFF0); assert(InterpretARM[instrs[i].Info.Kind] == ARMInterpreter::ARMInstrTable[icode] || instrs[i].Info.Kind == ARMInstrInfo::ak_MOV_REG_LSL_IMM - || instrs[i].Info.Kind == ARMInstrInfo::ak_Nop); + || instrs[i].Info.Kind == ARMInstrInfo::ak_Nop + || instrs[i].Info.Kind == ARMInstrInfo::ak_UNK); if (cpu->CheckCondition(instrs[i].Cond())) InterpretARM[instrs[i].Info.Kind](cpu); else @@ -583,21 +923,26 @@ void CompileBlock(ARM* cpu) instrs[i].DataCycles = cpu->DataCycles; instrs[i].DataRegion = cpu->DataRegion; - if (instrs[i].Info.SpecialKind == ARMInstrInfo::special_WriteMem - && instrs[i].Info.SrcRegs == (1 << 15) - && instrs[i].Info.DstRegs == 0) + u32 literalAddr; + if (Config::JIT_LiteralOptimisations + && instrs[i].Info.SpecialKind == ARMInstrInfo::special_LoadLiteral + && DecodeLiteral(thumb, instrs[i], literalAddr)) { - assert (!thumb); - - u32 addr; - if (DecodeLiteral(instrs[i], addr)) - { - JIT_DEBUGPRINT("pc relative write detected\n"); - u32 translatedAddr = cpu->Num == 0 ? TranslateAddr<0>(addr) : TranslateAddr<1>(addr); - - ARMJIT::InvalidateByAddr(translatedAddr, false); - CodeRanges[translatedAddr / 512].InvalidLiterals |= (1 << ((translatedAddr & 0x1FF) / 16)); - } + u32 translatedAddr = cpu->Num == 0 + ? TranslateAddr9(literalAddr) + : TranslateAddr7(literalAddr); + u32 translatedAddrRounded = translatedAddr & ~0x1FF; + + u32 j = 0; + for (; j < numAddressRanges; j++) + if (addressRanges[j] == translatedAddrRounded) + break; + if (j == numAddressRanges) + addressRanges[numAddressRanges++] = translatedAddrRounded; + addressMasks[j] |= 1 << ((translatedAddr & 0x1FF) / 16); + JIT_DEBUGPRINT("literal loading %08x %08x %08x %08x\n", literalAddr, translatedAddr, addressMasks[j], addressRanges[j]); + cpu->DataRead32(literalAddr, &literalValues[numLiterals]); + literalLoadAddrs[numLiterals++] = translatedAddr; } if (thumb && instrs[i].Info.Kind == ARMInstrInfo::tk_BL_LONG_2 && i > 0 @@ -650,8 +995,8 @@ void CompileBlock(ARM* cpu) else if (hasBranched && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) { u32 targetPseudoPhysical = cpu->Num == 0 - ? TranslateAddr<0>(target) - : TranslateAddr<1>(target); + ? TranslateAddr9(target) + : TranslateAddr7(target); if (link) { @@ -688,36 +1033,29 @@ void CompileBlock(ARM* cpu) i++; - bool canCompile = compiler->CanCompile(thumb, instrs[i - 1].Info.Kind); + bool canCompile = JITCompiler->CanCompile(thumb, instrs[i - 1].Info.Kind); bool secondaryFlagReadCond = !canCompile || (instrs[i - 1].BranchFlags & (branch_FollowCondTaken | branch_FollowCondNotTaken)); if (instrs[i - 1].Info.ReadFlags != 0 || secondaryFlagReadCond) FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? instrs[i - 1].Info.ReadFlags : 0xF); } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted && (!cpu->IRQ || (cpu->CPSR & 0x80))); + u32 literalHash = (u32)XXH3_64bits(literalValues, numLiterals * 4); + u32 instrHash = (u32)XXH3_64bits(instrValues, i * 4); + JitBlock* prevBlock = RestoreCandidates.LookUp(pseudoPhysicalAddr); bool mayRestore = true; if (prevBlock) { RestoreCandidates.Remove(pseudoPhysicalAddr); - if (prevBlock->NumInstrs == i) - { - for (int j = 0; j < i; j++) - { - if (prevBlock->Instrs()[j] != instrs[j].Instr) - { - mayRestore = false; - break; - } - } - } - else - mayRestore = false; - if (prevBlock->NumAddresses == numAddressRanges) + mayRestore = prevBlock->LiteralHash == literalHash && prevBlock->InstrHash == instrHash; + + if (mayRestore && prevBlock->NumAddresses == numAddressRanges) { for (int j = 0; j < numAddressRanges; j++) { - if (prevBlock->AddressRanges()[j] != addresseRanges[j]) + if (prevBlock->AddressRanges()[j] != addressRanges[j] + || prevBlock->AddressMasks()[j] != addressMasks[j]) { mayRestore = false; break; @@ -739,18 +1077,21 @@ void CompileBlock(ARM* cpu) if (prevBlock) delete prevBlock; - block = new JitBlock(i, numAddressRanges); - for (int j = 0; j < i; j++) - block->Instrs()[j] = instrs[j].Instr; + block = new JitBlock(cpu->Num, i, numAddressRanges, numLiterals); + block->LiteralHash = literalHash; + block->InstrHash = instrHash; + for (int j = 0; j < numAddressRanges; j++) + block->AddressRanges()[j] = addressRanges[j]; for (int j = 0; j < numAddressRanges; j++) - block->AddressRanges()[j] = addresseRanges[j]; + block->AddressMasks()[j] = addressMasks[j]; + for (int j = 0; j < numLiterals; j++) + block->Literals()[j] = literalLoadAddrs[j]; - block->StartAddr = blockAddr; block->PseudoPhysicalAddr = pseudoPhysicalAddr; FloodFillSetFlags(instrs, i - 1, 0xF); - block->EntryPoint = compiler->CompileBlock(pseudoPhysicalAddr, cpu, thumb, instrs, i); + block->EntryPoint = JITCompiler->CompileBlock(pseudoPhysicalAddr, cpu, thumb, instrs, i); } else { @@ -760,23 +1101,73 @@ void CompileBlock(ARM* cpu) for (int j = 0; j < numAddressRanges; j++) { - assert(addresseRanges[j] == block->AddressRanges()[j]); - CodeRanges[addresseRanges[j] / 512].Blocks.Add(block); + assert(addressRanges[j] == block->AddressRanges()[j]); + assert(addressMasks[j] == block->AddressMasks()[j]); + assert(addressMasks[j] != 0); + CodeRanges[addressRanges[j] / 512].Code |= addressMasks[j]; + CodeRanges[addressRanges[j] / 512].Blocks.Add(block); + + UpdateRegionByPseudoPhyiscal(addressRanges[j], true); } - JitBlocks[pseudoPhysicalAddr] = block; - FastBlockLookUp.Insert(pseudoPhysicalAddr, compiler->SubEntryOffset(block->EntryPoint)); + if (cpu->Num == 0) + { + JitBlocks9[pseudoPhysicalAddr] = block; + FastBlockLookUp9.Insert(pseudoPhysicalAddr, JITCompiler->SubEntryOffset(block->EntryPoint)); + } + else + { + JitBlocks7[pseudoPhysicalAddr] = block; + FastBlockLookUp7.Insert(pseudoPhysicalAddr, JITCompiler->SubEntryOffset(block->EntryPoint)); + } } -void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore) +void InvalidateByAddr(u32 pseudoPhysical) { JIT_DEBUGPRINT("invalidating by addr %x\n", pseudoPhysical); AddressRange* range = &CodeRanges[pseudoPhysical / 512]; - int startLength = range->Blocks.Length; - for (int i = 0; i < range->Blocks.Length; i++) + u32 mask = 1 << ((pseudoPhysical & 0x1FF) / 16); + + range->Code = 0; + for (int i = 0; i < range->Blocks.Length;) { - assert(range->Blocks.Length == startLength); JitBlock* block = range->Blocks[i]; + + bool invalidated = false; + u32 mask = 0; + for (int j = 0; j < block->NumAddresses; j++) + { + if (block->AddressRanges()[j] == (pseudoPhysical & ~0x1FF)) + { + mask = block->AddressMasks()[j]; + invalidated = block->AddressMasks()[j] & mask; + break; + } + } + assert(mask); + if (!invalidated) + { + range->Code |= mask; + i++; + continue; + } + range->Blocks.Remove(i); + + bool literalInvalidation = false; + for (int j = 0; j < block->NumLiterals; j++) + { + u32 addr = block->Literals()[j]; + if (addr == pseudoPhysical) + { + if (InvalidLiterals.Find(pseudoPhysical) != -1) + { + InvalidLiterals.Add(pseudoPhysical); + JIT_DEBUGPRINT("found invalid literal %d\n", InvalidLiterals.Length); + } + literalInvalidation = true; + break; + } + } for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; @@ -786,76 +1177,59 @@ void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore) assert(otherRange != range); bool removed = otherRange->Blocks.RemoveByValue(block); assert(removed); + + if (otherRange->Blocks.Length == 0) + { + otherRange->Code = 0; + UpdateRegionByPseudoPhyiscal(addr, false); + } } } for (int j = 0; j < block->NumLinks(); j++) - compiler->UnlinkBlock(block->Links()[j]); + JITCompiler->UnlinkBlock(block->Links()[j]); + block->ResetLinks(); - JitBlocks.erase(block->PseudoPhysicalAddr); - FastBlockLookUp.Remove(block->PseudoPhysicalAddr); + if (block->Num == 0) + { + JitBlocks9.erase(block->PseudoPhysicalAddr); + FastBlockLookUp9.Remove(block->PseudoPhysicalAddr); + } + else + { + JitBlocks7.erase(block->PseudoPhysicalAddr); + FastBlockLookUp7.Remove(block->PseudoPhysicalAddr); + } - if (mayRestore) + if (!literalInvalidation) { JitBlock* prevBlock = RestoreCandidates.Insert(block->PseudoPhysicalAddr, block); if (prevBlock) delete prevBlock; } + else + { + delete block; + } } - if ((range->TimesInvalidated + 1) > range->TimesInvalidated) - range->TimesInvalidated++; - - range->Blocks.Clear(); -} -void InvalidateByAddr7(u32 addr) -{ - u32 pseudoPhysical = TranslateAddr<1>(addr); - if (__builtin_expect(CodeRanges[pseudoPhysical / 512].Blocks.Length > 0, false)) - InvalidateByAddr(pseudoPhysical); + if (range->Blocks.Length == 0) + UpdateRegionByPseudoPhyiscal(pseudoPhysical, false); } -void InvalidateITCM(u32 addr) +void InvalidateRegionIfNecessary(u32 pseudoPhyisical) { - u32 pseudoPhysical = addr + ExeMemRegionOffsets[exeMem_ITCM]; - if (CodeRanges[pseudoPhysical / 512].Blocks.Length > 0) - InvalidateByAddr(pseudoPhysical); -} - -void InvalidateAll() -{ - JIT_DEBUGPRINT("invalidating all %x\n", JitBlocks.size()); - for (auto it : JitBlocks) - { - JitBlock* block = it.second; - - FastBlockLookUp.Remove(block->PseudoPhysicalAddr); - - for (int i = 0; i < block->NumAddresses; i++) - { - u32 addr = block->AddressRanges()[i]; - AddressRange* range = &CodeRanges[addr / 512]; - range->Blocks.Clear(); - if (range->TimesInvalidated + 1 > range->TimesInvalidated) - range->TimesInvalidated++; - } - for (int i = 0; i < block->NumLinks(); i++) - compiler->UnlinkBlock(block->Links()[i]); - block->ResetLinks(); - - JitBlock* prevBlock = RestoreCandidates.Insert(block->PseudoPhysicalAddr, block); - if (prevBlock) - delete prevBlock; - } - - JitBlocks.clear(); + if (CodeRanges[pseudoPhyisical / 512].Code & (1 << ((pseudoPhyisical & 0x1FF) / 16))) + InvalidateByAddr(pseudoPhyisical); } void ResetBlockCache() { printf("Resetting JIT block cache...\n"); - FastBlockLookUp.Reset(); + InvalidLiterals.Clear(); + FastBlockLookUp9.Reset(); + FastBlockLookUp7.Reset(); RestoreCandidates.Reset(); for (int i = 0; i < sizeof(RestoreCandidates.Table)/sizeof(RestoreCandidates.Table[0]); i++) { @@ -870,61 +1244,119 @@ void ResetBlockCache() RestoreCandidates.Table[i].ValB = NULL; } } - for (auto it : JitBlocks) + for (auto it : JitBlocks9) { JitBlock* block = it.second; for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; CodeRanges[addr / 512].Blocks.Clear(); - CodeRanges[addr / 512].TimesInvalidated = 0; - CodeRanges[addr / 512].InvalidLiterals = 0; + CodeRanges[addr / 512].Code = 0; } delete block; } - JitBlocks.clear(); + for (auto it : JitBlocks7) + { + JitBlock* block = it.second; + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + CodeRanges[addr / 512].Blocks.Clear(); + CodeRanges[addr / 512].Code = 0; + } + } + JitBlocks9.clear(); + JitBlocks7.clear(); - compiler->Reset(); + JITCompiler->Reset(); } +template <u32 Num> JitBlockEntry LookUpBlockEntry(u32 addr) { - u32 entryOffset = FastBlockLookUp.LookUp(addr); + auto& fastMap = Num == 0 ? FastBlockLookUp9 : FastBlockLookUp7; + u32 entryOffset = fastMap.LookUp(addr); if (entryOffset != UINT32_MAX) - return compiler->AddEntryOffset(entryOffset); + return JITCompiler->AddEntryOffset(entryOffset); - auto block = JitBlocks.find(addr); - if (block != JitBlocks.end()) + auto& slowMap = Num == 0 ? JitBlocks9 : JitBlocks7; + auto block = slowMap.find(addr); + if (block != slowMap.end()) { - FastBlockLookUp.Insert(addr, compiler->SubEntryOffset(block->second->EntryPoint)); + fastMap.Insert(addr, JITCompiler->SubEntryOffset(block->second->EntryPoint)); return block->second->EntryPoint; } return NULL; } +template JitBlockEntry LookUpBlockEntry<0>(u32); +template JitBlockEntry LookUpBlockEntry<1>(u32); + template <u32 Num> void LinkBlock(ARM* cpu, u32 codeOffset) { - u32 targetPseudoPhys = TranslateAddr<Num>(cpu->R[15] - ((cpu->CPSR&0x20)?2:4)); - auto block = JitBlocks.find(targetPseudoPhys); - if (block == JitBlocks.end()) + auto& blockMap = Num == 0 ? JitBlocks9 : JitBlocks7; + u32 instrAddr = cpu->R[15] - ((cpu->CPSR&0x20)?2:4); + u32 targetPseudoPhys = Num == 0 ? TranslateAddr9(instrAddr) : TranslateAddr7(instrAddr); + auto block = blockMap.find(targetPseudoPhys); + if (block == blockMap.end()) { CompileBlock(cpu); - block = JitBlocks.find(targetPseudoPhys); + block = blockMap.find(targetPseudoPhys); } JIT_DEBUGPRINT("linking to block %08x\n", targetPseudoPhys); block->second->AddLink(codeOffset); - compiler->LinkBlock(codeOffset, block->second->EntryPoint); + JITCompiler->LinkBlock(codeOffset, block->second->EntryPoint); +} + +template void LinkBlock<0>(ARM*, u32); +template void LinkBlock<1>(ARM*, u32); + +void WifiWrite32(u32 addr, u32 val) +{ + Wifi::Write(addr, val & 0xFFFF); + Wifi::Write(addr + 2, val >> 16); +} + +u32 WifiRead32(u32 addr) +{ + return Wifi::Read(addr) | (Wifi::Read(addr + 2) << 16); +} + +template <typename T> +void VRAMWrite(u32 addr, T val) +{ + switch (addr & 0x00E00000) + { + case 0x00000000: GPU::WriteVRAM_ABG<T>(addr, val); return; + case 0x00200000: GPU::WriteVRAM_BBG<T>(addr, val); return; + case 0x00400000: GPU::WriteVRAM_AOBJ<T>(addr, val); return; + case 0x00600000: GPU::WriteVRAM_BOBJ<T>(addr, val); return; + default: GPU::WriteVRAM_LCDC<T>(addr, val); return; + } +} +template <typename T> +T VRAMRead(u32 addr) +{ + switch (addr & 0x00E00000) + { + case 0x00000000: return GPU::ReadVRAM_ABG<T>(addr); + case 0x00200000: return GPU::ReadVRAM_BBG<T>(addr); + case 0x00400000: return GPU::ReadVRAM_AOBJ<T>(addr); + case 0x00600000: return GPU::ReadVRAM_BOBJ<T>(addr); + default: return GPU::ReadVRAM_LCDC<T>(addr); + } } void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) { if (cpu->Num == 0) { - if ((addr & 0xFF000000) == 0x04000000) + switch (addr & 0xFF000000) { + case 0x04000000: if (!store && size == 32 && addr == 0x04100010 && NDS::ExMemCnt[0] & (1<<11)) return (void*)NDSCart::ReadROMData; @@ -949,13 +1381,25 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) switch (size | store) { - case 8: return (void*)NDS::ARM9IORead8; - case 9: return (void*)NDS::ARM9IOWrite8; + case 8: return (void*)NDS::ARM9IORead8; + case 9: return (void*)NDS::ARM9IOWrite8; case 16: return (void*)NDS::ARM9IORead16; case 17: return (void*)NDS::ARM9IOWrite16; case 32: return (void*)NDS::ARM9IORead32; case 33: return (void*)NDS::ARM9IOWrite32; } + break; + case 0x06000000: + switch (size | store) + { + case 8: return (void*)VRAMRead<u8>; + case 9: return NULL; + case 16: return (void*)VRAMRead<u16>; + case 17: return (void*)VRAMWrite<u16>; + case 32: return (void*)VRAMRead<u32>; + case 33: return (void*)VRAMWrite<u32>; + } + break; } } else @@ -987,20 +1431,31 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) } break; case 0x04800000: - if (addr < 0x04810000 && size == 16) + if (addr < 0x04810000 && size >= 16) { - if (store) - return (void*)Wifi::Write; - else - return (void*)Wifi::Read; + switch (size | store) + { + case 16: return (void*)Wifi::Read; + case 17: return (void*)Wifi::Write; + case 32: return (void*)WifiRead32; + case 33: return (void*)WifiWrite32; + } } break; + case 0x06000000: + case 0x06800000: + switch (size | store) + { + case 8: return (void*)GPU::ReadVRAM_ARM7<u8>; + case 9: return (void*)GPU::WriteVRAM_ARM7<u8>; + case 16: return (void*)GPU::ReadVRAM_ARM7<u16>; + case 17: return (void*)GPU::WriteVRAM_ARM7<u16>; + case 32: return (void*)GPU::ReadVRAM_ARM7<u32>; + case 33: return (void*)GPU::WriteVRAM_ARM7<u32>; + } } } return NULL; } } - -template void ARMJIT::LinkBlock<0>(ARM*, u32); -template void ARMJIT::LinkBlock<1>(ARM*, u32); diff --git a/src/ARMJIT.h b/src/ARMJIT.h index cab385f..44a6140 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -28,45 +28,60 @@ extern const u32 ExeMemRegionSizes[]; typedef u32 (*JitBlockEntry)(); -extern u32 AddrTranslate9[0x2000]; -extern u32 AddrTranslate7[0x4000]; - const u32 ExeMemSpaceSize = 0x518000; // I hate you C++, sometimes I really hate you... -template <u32 num> -inline bool IsMapped(u32 addr) -{ - if (num == 0) - return AddrTranslate9[(addr & 0xFFFFFFF) >> 15] >= ExeMemRegionSizes[exeMem_Unmapped]; - else - return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] >= ExeMemRegionSizes[exeMem_Unmapped]; -} - -template <u32 num> -inline u32 TranslateAddr(u32 addr) -{ - if (num == 0) - return AddrTranslate9[(addr & 0xFFFFFFF) >> 15] + (addr & 0x7FFF); - else - return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] + (addr & 0x3FFF); -} +u32 TranslateAddr9(u32 addr); +u32 TranslateAddr7(u32 addr); +template <u32 Num> JitBlockEntry LookUpBlockEntry(u32 addr); - void Init(); void DeInit(); -void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore = true); -void InvalidateAll(); +void Reset(); + +void InvalidateByAddr(u32 pseudoPhysical); + +void InvalidateRegionIfNecessary(u32 addr); -void InvalidateITCM(u32 addr); -void InvalidateByAddr7(u32 addr); +inline void InvalidateMainRAMIfNecessary(u32 addr) +{ + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_MainRAM] + (addr & (MAIN_RAM_SIZE - 1))); +} +inline void InvalidateITCMIfNecessary(u32 addr) +{ + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_ITCM] + (addr & 0x7FFF)); +} +inline void InvalidateLCDCIfNecessary(u32 addr) +{ + if (addr < 0x68A3FFF) + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_LCDC] + (addr - 0x6800000)); +} +inline void InvalidateSWRAM7IfNecessary(u32 addr) +{ + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM7 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM7Mask)); +} +inline void InvalidateSWRAM9IfNecessary(u32 addr) +{ + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM9 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM9Mask)); +} +inline void InvalidateARM7WRAMIfNecessary(u32 addr) +{ + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_ARM7_WRAM] + (addr & 0xFFFF)); +} +inline void InvalidateARM7WVRAMIfNecessary(u32 addr) +{ + InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_ARM7_WVRAM] + (addr & 0x1FFFF)); +} void CompileBlock(ARM* cpu); void ResetBlockCache(); +void UpdateMemoryStatus9(u32 start, u32 end); +void UpdateMemoryStatus7(u32 start, u32 end); + } extern "C" void ARM_Dispatch(ARM* cpu, ARMJIT::JitBlockEntry entry); diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.cpp b/src/ARMJIT_A64/ARMJIT_Compiler.cpp index 00fa436..a67f357 100644 --- a/src/ARMJIT_A64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_A64/ARMJIT_Compiler.cpp @@ -650,7 +650,7 @@ void Compiler::Comp_AddCycles_CDI() s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; s32 numD = CurInstr.DataCycles; - if ((CurInstr.DataRegion >> 4) == 0x02) // mainRAM + if ((CurInstr.DataRegion >> 24) == 0x02) // mainRAM { if (CodeRegion == 0x02) cycles = numC + numD; @@ -695,7 +695,7 @@ void Compiler::Comp_AddCycles_CD() s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; s32 numD = CurInstr.DataCycles; - if ((CurInstr.DataRegion >> 4) == 0x02) + if ((CurInstr.DataRegion >> 24) == 0x02) { if (CodeRegion == 0x02) cycles += numC + numD; diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h index 66d1808..4e45760 100644 --- a/src/ARMJIT_Internal.h +++ b/src/ARMJIT_Internal.h @@ -152,30 +152,34 @@ struct __attribute__((packed)) TinyVector class JitBlock { public: - JitBlock(u32 numInstrs, u32 numAddresses) + JitBlock(u32 num, u32 literalHash, u32 numAddresses, u32 numLiterals) { - NumInstrs = numInstrs; + Num = num; NumAddresses = numAddresses; - Data.SetLength(numInstrs + numAddresses); + NumLiterals = numLiterals; + Data.SetLength(numAddresses * 2 + numLiterals); } - u32 StartAddr; u32 PseudoPhysicalAddr; - - u32 NumInstrs; - u32 NumAddresses; + + u32 InstrHash, LiteralHash; + u8 Num; + u16 NumAddresses; + u16 NumLiterals; JitBlockEntry EntryPoint; - u32* Instrs() - { return &Data[0]; } u32* AddressRanges() - { return &Data[NumInstrs]; } + { return &Data[0]; } + u32* AddressMasks() + { return &Data[NumAddresses]; } + u32* Literals() + { return &Data[NumAddresses * 2]; } u32* Links() - { return &Data[NumInstrs + NumAddresses]; } + { return &Data[NumAddresses * 2 + NumLiterals]; } u32 NumLinks() - { return Data.Length - NumInstrs - NumAddresses; } + { return Data.Length - NumAddresses * 2 - NumLiterals; } void AddLink(u32 link) { @@ -184,7 +188,7 @@ public: void ResetLinks() { - Data.SetLength(NumInstrs + NumAddresses); + Data.SetLength(NumAddresses * 2 + NumLiterals); } private: @@ -200,8 +204,7 @@ private: struct __attribute__((packed)) AddressRange { TinyVector<JitBlock*> Blocks; - u16 InvalidLiterals; - u16 TimesInvalidated; + u32 Code; }; extern AddressRange CodeRanges[ExeMemSpaceSize / 512]; @@ -210,14 +213,45 @@ typedef void (*InterpreterFunc)(ARM* cpu); extern InterpreterFunc InterpretARM[]; extern InterpreterFunc InterpretTHUMB[]; -extern u8 MemRegion9[0x80000]; -extern u8 MemRegion7[0x80000]; +extern u8 MemoryStatus9[0x800000]; +extern u8 MemoryStatus7[0x800000]; + +extern TinyVector<u32> InvalidLiterals; void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size); template <u32 Num> void LinkBlock(ARM* cpu, u32 codeOffset); +enum +{ + memregion_Other = 0, + memregion_ITCM, + memregion_DTCM, + memregion_BIOS9, + memregion_MainRAM, + memregion_SWRAM9, + memregion_SWRAM7, + memregion_IO9, + memregion_VRAM, + memregion_BIOS7, + memregion_WRAM7, + memregion_IO7, + memregion_Wifi, + memregion_VWRAM, +}; + +int ClassifyAddress9(u32 addr); +int ClassifyAddress7(u32 addr); + +template <typename T> T SlowRead9(ARMv5* cpu, u32 addr); +template <typename T> void SlowWrite9(ARMv5* cpu, u32 addr, T val); +template <typename T> T SlowRead7(u32 addr); +template <typename T> void SlowWrite7(u32 addr, T val); + +template <bool PreInc, bool Write> void SlowBlockTransfer9(u32 addr, u64* data, u32 num, ARMv5* cpu); +template <bool PreInc, bool Write> void SlowBlockTransfer7(u32 addr, u64* data, u32 num); + } #endif
\ No newline at end of file diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index 5e18e84..0547c84 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -95,20 +95,6 @@ public: LiteralsLoaded = 0; } - BitSet32 GetPushRegs() - { - BitSet16 used; - for (int i = 0; i < InstrsCount; i++) - used |= BitSet16(Instrs[i].Info.SrcRegs | Instrs[i].Info.DstRegs); - - BitSet32 res; - u32 registersMax = std::min((int)used.Count(), NativeRegsAvailable); - for (int i = 0; i < registersMax; i++) - res |= BitSet32(1 << (int)NativeRegAllocOrder[i]); - - return res; - } - void Prepare(bool thumb, int i) { FetchedInstr instr = Instrs[i]; @@ -139,7 +125,6 @@ public: UnloadRegister(reg); u16 necessaryRegs = ((instr.Info.SrcRegs & PCAllocatableAsSrc) | instr.Info.DstRegs) & ~instr.Info.NotStrictlyNeeded; - u16 writeRegs = instr.Info.DstRegs & ~instr.Info.NotStrictlyNeeded; BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); if (needToBeLoaded != BitSet16(0)) { @@ -182,13 +167,12 @@ public: if (left-- == 0) break; - writeRegs |= (1 << reg) & instr.Info.DstRegs; LoadRegister(reg, !(thumb || instr.Cond() >= 0xE) || (1 << reg) & instr.Info.SrcRegs); } } } - DirtyRegs |= writeRegs & ~(1 << 15); + DirtyRegs |= (LoadedRegs & instr.Info.DstRegs) & ~(1 << 15); } static const Reg NativeRegAllocOrder[]; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index dd20e3c..eee2e0f 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -195,26 +195,6 @@ Compiler::Compiler() Reset(); - for (int i = 0; i < 3; i++) - { - for (int j = 0; j < 2; j++) - MemoryFuncs9[i][j] = Gen_MemoryRoutine9(j, 8 << i); - } - MemoryFuncs7[0][0] = (void*)NDS::ARM7Read8; - MemoryFuncs7[0][1] = (void*)NDS::ARM7Write8; - MemoryFuncs7[1][0] = (void*)NDS::ARM7Read16; - MemoryFuncs7[1][1] = (void*)NDS::ARM7Write16; - MemoryFuncs7[2][0] = (void*)NDS::ARM7Read32; - MemoryFuncs7[2][1] = (void*)NDS::ARM7Write32; - - for (int i = 0; i < 2; i++) - for (int j = 0; j < 2; j++) - { - MemoryFuncsSeq9[i][j] = Gen_MemoryRoutineSeq9(i, j); - MemoryFuncsSeq7[i][j][0] = Gen_MemoryRoutineSeq7(i, j, false); - MemoryFuncsSeq7[i][j][1] = Gen_MemoryRoutineSeq7(i, j, true); - } - { // RSCRATCH mode // RSCRATCH2 reg number @@ -317,6 +297,12 @@ Compiler::Compiler() // move the region forward to prevent overwriting the generated functions CodeMemSize -= GetWritableCodePtr() - ResetStart; ResetStart = GetWritableCodePtr(); + + NearStart = ResetStart; + FarStart = ResetStart + 1024*1024*24; + + NearSize = FarStart - ResetStart; + FarSize = (ResetStart + CodeMemSize) - FarStart; } void Compiler::LoadCPSR() @@ -504,6 +490,9 @@ void Compiler::Reset() { memset(ResetStart, 0xcc, CodeMemSize); SetCodePtr(ResetStart); + + NearCode = NearStart; + FarCode = FarStart; } void Compiler::Comp_SpecialBranchBehaviour(bool taken) @@ -544,8 +533,16 @@ void Compiler::Comp_SpecialBranchBehaviour(bool taken) JitBlockEntry Compiler::CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) { - if (CodeMemSize - (GetWritableCodePtr() - ResetStart) < 1024 * 32) // guess... + if (NearSize - (NearCode - NearStart) < 1024 * 32) // guess... + { + printf("near reset\n"); + ResetBlockCache(); + } + if (FarSize - (FarCode - FarStart) < 1024 * 32) // guess... + { + printf("far reset\n"); ResetBlockCache(); + } ConstantCycles = 0; Thumb = thumb; @@ -762,12 +759,14 @@ void Compiler::Comp_AddCycles_CDI() Comp_AddCycles_CD(); else { + IrregularCycles = true; + s32 cycles; s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; s32 numD = CurInstr.DataCycles; - if ((CurInstr.DataRegion >> 4) == 0x02) // mainRAM + if ((CurInstr.DataRegion >> 24) == 0x02) // mainRAM { if (CodeRegion == 0x02) cycles = numC + numD; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index e0a4978..9df218b 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -140,7 +140,7 @@ public: }; void Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags); s32 Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); - void Comp_MemLoadLiteral(int size, int rd, u32 addr); + bool Comp_MemLoadLiteral(int size, int rd, u32 addr); void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); @@ -154,12 +154,6 @@ public: void Comp_SpecialBranchBehaviour(bool taken); - void* Gen_MemoryRoutine9(bool store, int size); - - void* Gen_MemoryRoutineSeq9(bool store, bool preinc); - void* Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM); - - void* Gen_ChangeCPSRRoutine(); Gen::OpArg Comp_RegShiftImm(int op, int amount, Gen::OpArg rm, bool S, bool& carryUsed); Gen::OpArg Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed); @@ -193,6 +187,26 @@ public: return (u8*)entry - ResetStart; } + void SwitchToNearCode() + { + FarCode = GetWritableCodePtr(); + SetCodePtr(NearCode); + } + + void SwitchToFarCode() + { + NearCode = GetWritableCodePtr(); + SetCodePtr(FarCode); + } + + u8* FarCode; + u8* NearCode; + u32 FarSize; + u32 NearSize; + + u8* NearStart; + u8* FarStart; + u8* ResetStart; u32 CodeMemSize; @@ -201,12 +215,6 @@ public: void* BranchStub[2]; - void* MemoryFuncs9[3][2]; - void* MemoryFuncs7[3][2]; - - void* MemoryFuncsSeq9[2][2]; - void* MemoryFuncsSeq7[2][2][2]; - void* ReadBanked; void* WriteBanked; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index b595e32..c13b779 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -25,236 +25,17 @@ int squeezePointer(T* ptr) improvement. */ -/* - address - ABI_PARAM1 (a.k.a. ECX = RSCRATCH3 on Windows) - store value - ABI_PARAM2 (a.k.a. RDX = RSCRATCH2 on Windows) -*/ -void* Compiler::Gen_MemoryRoutine9(bool store, int size) +bool Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) { - u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 1 : 0)); - AlignCode4(); - void* res = GetWritableCodePtr(); - - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SUB(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); - CMP(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMSize))); - FixupBranch insideDTCM = J_CC(CC_B); - - CMP(32, R(ABI_PARAM1), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); - FixupBranch insideITCM = J_CC(CC_B); - - if (store) - { - if (size > 8) - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - switch (size) - { - case 32: JMP((u8*)NDS::ARM9Write32, true); break; - case 16: JMP((u8*)NDS::ARM9Write16, true); break; - case 8: JMP((u8*)NDS::ARM9Write8, true); break; - } - } - else - { - if (size == 32) - { - ABI_PushRegistersAndAdjustStack({ABI_PARAM1}, 8); - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - // everything's already in the appropriate register - ABI_CallFunction(NDS::ARM9Read32); - ABI_PopRegistersAndAdjustStack({ECX}, 8); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); - RET(); - } - else if (size == 16) - { - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - JMP((u8*)NDS::ARM9Read16, true); - } - else - JMP((u8*)NDS::ARM9Read8, true); - } - - SetJumpTarget(insideDTCM); - AND(32, R(RSCRATCH), Imm32(0x3FFF & addressMask)); - if (store) - MOV(size, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)), R(ABI_PARAM2)); - else - { - MOVZX(32, size, RSCRATCH, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM))); - if (size == 32) - { - if (ABI_PARAM1 != ECX) - MOV(32, R(ECX), R(ABI_PARAM1)); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); - } - } - RET(); + u32 translatedAddr = Num == 0 ? TranslateAddr9(addr) : TranslateAddr7(addr); - SetJumpTarget(insideITCM); - MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); // free up ECX - AND(32, R(ABI_PARAM3), Imm32(0x7FFF & addressMask)); - if (store) - { - MOV(size, MComplex(RCPU, ABI_PARAM3, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM2)); - - // if CodeRanges[pseudoPhysical/256].Blocks.Length > 0 we're writing into code! - static_assert(sizeof(AddressRange) == 16); - LEA(32, ABI_PARAM1, MDisp(ABI_PARAM3, ExeMemRegionOffsets[exeMem_ITCM])); - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(9)); - SHL(32, R(RSCRATCH), Imm8(4)); - CMP(16, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); - FixupBranch noCode = J_CC(CC_Z); - JMP((u8*)InvalidateByAddr, true); - SetJumpTarget(noCode); - } - else + int invalidLiteralIdx = InvalidLiterals.Find(translatedAddr); + if (invalidLiteralIdx != -1) { - MOVZX(32, size, RSCRATCH, MComplex(RCPU, ABI_PARAM3, SCALE_1, offsetof(ARMv5, ITCM))); - if (size == 32) - { - if (ABI_PARAM1 != ECX) - MOV(32, R(ECX), R(ABI_PARAM1)); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); - } + InvalidLiterals.Remove(invalidLiteralIdx); + return false; } - RET(); - - static_assert(RSCRATCH == EAX, "Someone changed RSCRATCH!"); - - return res; -} - -#define MEMORY_SEQ_WHILE_COND \ - if (!store) \ - MOV(32, currentElement, R(EAX));\ - if (!preinc) \ - ADD(32, R(ABI_PARAM1), Imm8(4)); \ - \ - SUB(32, R(ABI_PARAM3), Imm8(1)); \ - J_CC(CC_NZ, repeat); - -/* - ABI_PARAM1 address - ABI_PARAM2 address where registers are stored - ABI_PARAM3 how many values to read/write - - Dolphin x64CodeEmitter is my favourite assembler - */ -void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) -{ - void* res = (void*)GetWritableCodePtr(); - - const u8* repeat = GetCodePtr(); - - if (preinc) - ADD(32, R(ABI_PARAM1), Imm8(4)); - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SUB(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); - CMP(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMSize))); - FixupBranch insideDTCM = J_CC(CC_B); - - CMP(32, R(ABI_PARAM1), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); - FixupBranch insideITCM = J_CC(CC_B); - - OpArg currentElement = MComplex(ABI_PARAM2, ABI_PARAM3, SCALE_8, -8); // wasting stack space like a gangster - - ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); - AND(32, R(ABI_PARAM1), Imm8(~3)); - if (store) - { - MOV(32, R(ABI_PARAM2), currentElement); - CALL((void*)NDS::ARM9Write32); - } - else - CALL((void*)NDS::ARM9Read32); - ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); - - MEMORY_SEQ_WHILE_COND - RET(); - - SetJumpTarget(insideDTCM); - AND(32, R(RSCRATCH), Imm32(0x3FFF & ~3)); - if (store) - { - MOV(32, R(ABI_PARAM4), currentElement); - MOV(32, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)), R(ABI_PARAM4)); - } - else - MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM))); - - MEMORY_SEQ_WHILE_COND - RET(); - - SetJumpTarget(insideITCM); - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - AND(32, R(RSCRATCH), Imm32(0x7FFF & ~3)); - if (store) - { - MOV(32, R(ABI_PARAM4), currentElement); - MOV(32, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM4)); - - ADD(32, R(RSCRATCH), Imm32(ExeMemRegionOffsets[exeMem_ITCM])); - MOV(32, R(ABI_PARAM4), R(RSCRATCH)); - SHR(32, R(RSCRATCH), Imm8(9)); - SHL(32, R(RSCRATCH), Imm8(4)); - CMP(16, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); - FixupBranch noCode = J_CC(CC_Z); - ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); - MOV(32, R(ABI_PARAM1), R(ABI_PARAM4)); - CALL((u8*)InvalidateByAddr); - ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); - SetJumpTarget(noCode); - } - else - MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM))); - - MEMORY_SEQ_WHILE_COND - RET(); - - return res; -} - -void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM) -{ - void* res = (void*)GetWritableCodePtr(); - - const u8* repeat = GetCodePtr(); - - if (preinc) - ADD(32, R(ABI_PARAM1), Imm8(4)); - - OpArg currentElement = MComplex(ABI_PARAM2, ABI_PARAM3, SCALE_8, -8); - - ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); - AND(32, R(ABI_PARAM1), Imm8(~3)); - if (store) - { - MOV(32, R(ABI_PARAM2), currentElement); - CALL((void*)NDS::ARM7Write32); - } - else - CALL((void*)NDS::ARM7Read32); - ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); - - MEMORY_SEQ_WHILE_COND - RET(); - - return res; -} - -#undef MEMORY_SEQ_WHILE_COND - -void Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) -{ u32 val; // make sure arm7 bios is accessible u32 tmpR15 = CurCPU->R[15]; @@ -276,12 +57,10 @@ void Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) RegCache.PutLiteral(rd, val); Comp_AddCycles_CDI(); + + return true; } -/*void fault(u32 a, u32 b, u32 c, u32 d) -{ - printf("actually not static! %x %x %x %x\n", a, b, c, d); -}*/ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags) { @@ -291,17 +70,12 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz if (size == 16) addressMask = ~1; - //bool check = false; if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_SignExtend|memop_Post|memop_Store|memop_Writeback))) { u32 addr = R15 + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); - u32 translatedAddr = Num == 0 ? TranslateAddr<0>(addr) : TranslateAddr<1>(addr); - - if (!(CodeRanges[translatedAddr / 512].InvalidLiterals & (1 << ((translatedAddr & 0x1FF) / 16)))) - { - Comp_MemLoadLiteral(size, rd, addr); + + if (Comp_MemLoadLiteral(size, rd, addr)) return; - } } { @@ -314,173 +88,334 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz Comp_AddCycles_CDI(); } + bool addrIsStatic = Config::JIT_LiteralOptimisations + && RegCache.IsLiteral(rn) && op2.IsImm && !(flags & (memop_Writeback|memop_Post)); + u32 staticAddress; + if (addrIsStatic) + staticAddress = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); OpArg rdMapped = MapReg(rd); - OpArg rnMapped = MapReg(rn); - if (Thumb && rn == 15) - rnMapped = Imm32(R15 & ~0x2); - - bool inlinePreparation = Num == 1; - u32 constLocalROR32 = 4; - - void* memoryFunc = Num == 0 - ? MemoryFuncs9[size >> 4][!!(flags & memop_Store)] - : MemoryFuncs7[size >> 4][!!((flags & memop_Store))]; - if (Config::JIT_LiteralOptimisations && (rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn)) + if (!addrIsStatic) { - u32 addr = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); - - /*MOV(32, R(ABI_PARAM1), Imm32(CurInstr.Instr)); - MOV(32, R(ABI_PARAM1), Imm32(R15)); - MOV_sum(32, RSCRATCH, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1))); - CMP(32, R(RSCRATCH), Imm32(addr)); - FixupBranch eq = J_CC(CC_E); - CALL((void*)fault); - SetJumpTarget(eq);*/ - - NDS::MemRegion region; - region.Mem = NULL; - if (Num == 0) + OpArg rnMapped = MapReg(rn); + if (Thumb && rn == 15) + rnMapped = Imm32(R15 & ~0x2); + + X64Reg finalAddr = RSCRATCH3; + if (flags & memop_Post) { - ARMv5* cpu5 = (ARMv5*)CurCPU; + MOV(32, R(RSCRATCH3), rnMapped); - // stupid dtcm... - if (addr >= cpu5->DTCMBase && addr < (cpu5->DTCMBase + cpu5->DTCMSize)) - { - // disable this for now as DTCM is located in heap - // which might excced the RIP-addressable range - //region.Mem = cpu5->DTCM; - //region.Mask = 0x3FFF; - } - else - { - NDS::ARM9GetMemRegion(addr, flags & memop_Store, ®ion); - } + finalAddr = rnMapped.GetSimpleReg(); } - else - NDS::ARM7GetMemRegion(addr, flags & memop_Store, ®ion); - if (region.Mem != NULL) + if (op2.IsImm) + { + MOV_sum(32, finalAddr, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1))); + } + else { - void* ptr = ®ion.Mem[addr & addressMask & region.Mask]; + OpArg rm = MapReg(op2.Reg.Reg); - if (flags & memop_Store) + if (!(flags & memop_SubtractOffset) && rm.IsSimpleReg() && rnMapped.IsSimpleReg() + && op2.Reg.Op == 0 && op2.Reg.Amount > 0 && op2.Reg.Amount <= 3) { - MOV(size, M(ptr), MapReg(rd)); + LEA(32, finalAddr, + MComplex(rnMapped.GetSimpleReg(), rm.GetSimpleReg(), 1 << op2.Reg.Amount, 0)); } else { - if (flags & memop_SignExtend) - MOVSX(32, size, rdMapped.GetSimpleReg(), M(ptr)); - else - MOVZX(32, size, rdMapped.GetSimpleReg(), M(ptr)); + bool throwAway; + OpArg offset = + Comp_RegShiftImm(op2.Reg.Op, op2.Reg.Amount, rm, false, throwAway); - if (size == 32 && addr & ~0x3) + if (flags & memop_SubtractOffset) { - ROR_(32, rdMapped, Imm8((addr & 0x3) << 3)); + if (R(finalAddr) != rnMapped) + MOV(32, R(finalAddr), rnMapped); + if (!offset.IsZero()) + SUB(32, R(finalAddr), offset); } + else + MOV_sum(32, finalAddr, rnMapped, offset); } - - return; } - void* specialFunc = GetFuncForAddr(CurCPU, addr, flags & memop_Store, size); - if (specialFunc) - { - memoryFunc = specialFunc; - inlinePreparation = true; - constLocalROR32 = addr & 0x3; - } + if ((flags & memop_Writeback) && !(flags & memop_Post)) + MOV(32, rnMapped, R(finalAddr)); } - X64Reg finalAddr = ABI_PARAM1; - if (flags & memop_Post) - { - MOV(32, R(ABI_PARAM1), rnMapped); + int expectedTarget = Num == 0 + ? ClassifyAddress9(addrIsStatic ? staticAddress : CurInstr.DataRegion) + : ClassifyAddress7(addrIsStatic ? staticAddress : CurInstr.DataRegion); + if (CurInstr.Cond() < 0xE) + expectedTarget = memregion_Other; + + bool compileFastPath = false, compileSlowPath = !addrIsStatic || (flags & memop_Store); - finalAddr = rnMapped.GetSimpleReg(); + switch (expectedTarget) + { + case memregion_MainRAM: + case memregion_DTCM: + case memregion_WRAM7: + case memregion_SWRAM9: + case memregion_SWRAM7: + case memregion_IO9: + case memregion_IO7: + case memregion_VWRAM: + compileFastPath = true; + break; + case memregion_Wifi: + compileFastPath = size >= 16; + break; + case memregion_VRAM: + compileFastPath = !(flags & memop_Store) || size >= 16; + case memregion_BIOS9: + compileFastPath = !(flags & memop_Store); + break; + default: break; } - if (op2.IsImm) + if (addrIsStatic && !compileFastPath) { - MOV_sum(32, finalAddr, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1))); + compileFastPath = false; + compileSlowPath = true; } - else + + if (addrIsStatic && compileSlowPath) + MOV(32, R(RSCRATCH3), Imm32(staticAddress)); + + if (compileFastPath) { - OpArg rm = MapReg(op2.Reg.Reg); + FixupBranch slowPath; + if (compileSlowPath) + { + MOV(32, R(RSCRATCH), R(RSCRATCH3)); + SHR(32, R(RSCRATCH), Imm8(9)); + if (flags & memop_Store) + { + CMP(8, MDisp(RSCRATCH, squeezePointer(Num == 0 ? MemoryStatus9 : MemoryStatus7)), Imm8(expectedTarget)); + } + else + { + MOVZX(32, 8, RSCRATCH, MDisp(RSCRATCH, squeezePointer(Num == 0 ? MemoryStatus9 : MemoryStatus7))); + AND(32, R(RSCRATCH), Imm8(~0x80)); + CMP(32, R(RSCRATCH), Imm8(expectedTarget)); + } + + slowPath = J_CC(CC_NE, true); + } - if (!(flags & memop_SubtractOffset) && rm.IsSimpleReg() && rnMapped.IsSimpleReg() - && op2.Reg.Op == 0 && op2.Reg.Amount > 0 && op2.Reg.Amount <= 3) + if (expectedTarget == memregion_MainRAM || expectedTarget == memregion_WRAM7 + || expectedTarget == memregion_BIOS9) { - LEA(32, finalAddr, - MComplex(rnMapped.GetSimpleReg(), rm.GetSimpleReg(), 1 << op2.Reg.Amount, 0)); + u8* data; + u32 mask; + if (expectedTarget == memregion_MainRAM) + { + data = NDS::MainRAM; + mask = MAIN_RAM_SIZE - 1; + } + else if (expectedTarget == memregion_BIOS9) + { + data = NDS::ARM9BIOS; + mask = 0xFFF; + } + else + { + data = NDS::ARM7WRAM; + mask = 0xFFFF; + } + OpArg memLoc; + if (addrIsStatic) + { + memLoc = M(data + ((staticAddress & mask & addressMask))); + } + else + { + MOV(32, R(RSCRATCH), R(RSCRATCH3)); + AND(32, R(RSCRATCH), Imm32(mask & addressMask)); + memLoc = MDisp(RSCRATCH, squeezePointer(data)); + } + if (flags & memop_Store) + MOV(size, memLoc, rdMapped); + else if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), memLoc); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), memLoc); } - else + else if (expectedTarget == memregion_DTCM) + { + if (addrIsStatic) + MOV(32, R(RSCRATCH), Imm32(staticAddress)); + else + MOV(32, R(RSCRATCH), R(RSCRATCH3)); + SUB(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); + AND(32, R(RSCRATCH), Imm32(0x3FFF & addressMask)); + OpArg memLoc = MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)); + if (flags & memop_Store) + MOV(size, memLoc, rdMapped); + else if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), memLoc); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), memLoc); + } + else if (expectedTarget == memregion_SWRAM9 || expectedTarget == memregion_SWRAM7) { - bool throwAway; - OpArg offset = - Comp_RegShiftImm(op2.Reg.Op, op2.Reg.Amount, rm, false, throwAway); - - if (flags & memop_SubtractOffset) + MOV(64, R(RSCRATCH2), M(expectedTarget == memregion_SWRAM9 ? &NDS::SWRAM_ARM9 : &NDS::SWRAM_ARM7)); + if (addrIsStatic) { - if (R(finalAddr) != rnMapped) - MOV(32, R(finalAddr), rnMapped); - if (!offset.IsZero()) - SUB(32, R(finalAddr), offset); + MOV(32, R(RSCRATCH), Imm32(staticAddress & addressMask)); } else - MOV_sum(32, finalAddr, rnMapped, offset); + { + MOV(32, R(RSCRATCH), R(RSCRATCH3)); + AND(32, R(RSCRATCH), Imm8(addressMask)); + } + AND(32, R(RSCRATCH), M(expectedTarget == memregion_SWRAM9 ? &NDS::SWRAM_ARM9Mask : &NDS::SWRAM_ARM7Mask)); + OpArg memLoc = MRegSum(RSCRATCH, RSCRATCH2); + if (flags & memop_Store) + MOV(size, memLoc, rdMapped); + else if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), memLoc); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), memLoc); } - } + else + { + u32 maskedDataRegion; - if ((flags & memop_Writeback) && !(flags & memop_Post)) - MOV(32, rnMapped, R(finalAddr)); + if (addrIsStatic) + { + maskedDataRegion = staticAddress; + MOV(32, R(ABI_PARAM1), Imm32(staticAddress)); + } + else + { + if (ABI_PARAM1 != RSCRATCH3) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); + AND(32, R(ABI_PARAM1), Imm8(addressMask)); - if (flags & memop_Store) - MOV(32, R(ABI_PARAM2), rdMapped); + maskedDataRegion = CurInstr.DataRegion; + if (Num == 0) + maskedDataRegion &= ~0xFFFFFF; + else + maskedDataRegion &= ~0x7FFFFF; + } - if (!(flags & memop_Store) && inlinePreparation && constLocalROR32 == 4 && size == 32) - MOV(32, rdMapped, R(ABI_PARAM1)); + void* func = GetFuncForAddr(CurCPU, maskedDataRegion, flags & memop_Store, size); - if (inlinePreparation && size > 8) - AND(32, R(ABI_PARAM1), Imm8(addressMask)); + if (flags & memop_Store) + { + MOV(32, R(ABI_PARAM2), rdMapped); - CALL(memoryFunc); + ABI_CallFunction((void(*)())func); + } + else + { + if (!addrIsStatic) + MOV(32, rdMapped, R(RSCRATCH3)); - /*if (Num == 0 && check) - { - CMP(32, R(EAX), rdMapped); - FixupBranch notEqual = J_CC(CC_E); - ABI_PushRegistersAndAdjustStack({RSCRATCH}, 0); - MOV(32, R(ABI_PARAM1), Imm32(R15 - (Thumb ? 4 : 8))); - MOV(32, R(ABI_PARAM2), R(EAX)); - MOV(32, R(ABI_PARAM3), rdMapped); - MOV(32, R(ABI_PARAM4), Imm32(CurInstr.Instr)); - CALL((u8*)fault); - ABI_PopRegistersAndAdjustStack({RSCRATCH}, 0); - SetJumpTarget(notEqual); - }*/ - - if (!(flags & memop_Store)) - { - if (inlinePreparation && size == 32) + ABI_CallFunction((void(*)())func); + + if (!addrIsStatic) + MOV(32, R(RSCRATCH3), rdMapped); + + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + } + } + + if ((size == 32 && !(flags & memop_Store))) { - if (constLocalROR32 == 4) + if (addrIsStatic) + { + if (staticAddress & 0x3) + ROR_(32, rdMapped, Imm8((staticAddress & 0x3) * 8)); + } + else { - static_assert(RSCRATCH3 == ECX); - MOV(32, R(ECX), rdMapped); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); + AND(32, R(RSCRATCH3), Imm8(0x3)); + SHL(32, R(RSCRATCH3), Imm8(3)); + ROR_(32, rdMapped, R(RSCRATCH3)); } - else if (constLocalROR32 != 0) - ROR_(32, R(RSCRATCH), Imm8(constLocalROR32 << 3)); } - if (flags & memop_SignExtend) - MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + if (compileSlowPath) + { + SwitchToFarCode(); + SetJumpTarget(slowPath); + } + } + + if (compileSlowPath) + { + if (Num == 0) + { + MOV(32, R(ABI_PARAM2), R(RSCRATCH3)); + MOV(64, R(ABI_PARAM1), R(RCPU)); + if (flags & memop_Store) + { + MOV(32, R(ABI_PARAM3), rdMapped); + + switch (size) + { + case 32: CALL((void*)&SlowWrite9<u32>); break; + case 16: CALL((void*)&SlowWrite9<u16>); break; + case 8: CALL((void*)&SlowWrite9<u8>); break; + } + } + else + { + switch (size) + { + case 32: CALL((void*)&SlowRead9<u32>); break; + case 16: CALL((void*)&SlowRead9<u16>); break; + case 8: CALL((void*)&SlowRead9<u8>); break; + } + } + } else - MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + { + if (ABI_PARAM1 != RSCRATCH3) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); + if (flags & memop_Store) + { + MOV(32, R(ABI_PARAM2), rdMapped); + + switch (size) + { + case 32: CALL((void*)&SlowWrite7<u32>); break; + case 16: CALL((void*)&SlowWrite7<u16>); break; + case 8: CALL((void*)&SlowWrite7<u8>); break; + } + } + else + { + switch (size) + { + case 32: CALL((void*)&SlowRead7<u32>); break; + case 16: CALL((void*)&SlowRead7<u16>); break; + case 8: CALL((void*)&SlowRead7<u8>); break; + } + } + } + if (!(flags & memop_Store)) + { + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + } + } + + if (compileFastPath && compileSlowPath) + { + FixupBranch ret = J(true); + SwitchToNearCode(); + SetJumpTarget(ret); } if (!(flags & memop_Store) && rd == 15) @@ -498,100 +433,160 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode) { - IrregularCycles = true; - int regsCount = regs.Count(); s32 offset = (regsCount * 4) * (decrement ? -1 : 1); // we need to make sure that the stack stays aligned to 16 bytes +#ifdef _WIN32 + // include shadow + u32 stackAlloc = ((regsCount + 4 + 1) & ~1) * 8; +#else u32 stackAlloc = ((regsCount + 1) & ~1) * 8; +#endif + u32 allocOffset = stackAlloc - regsCount * 8; - if (!store) + int expectedTarget = Num == 0 + ? ClassifyAddress9(CurInstr.DataRegion) + : ClassifyAddress7(CurInstr.DataRegion); + if (usermode || CurInstr.Cond() < 0xE) + expectedTarget = memregion_Other; + + bool compileFastPath = false; + + switch (expectedTarget) { + case memregion_DTCM: + case memregion_MainRAM: + case memregion_SWRAM9: + case memregion_SWRAM7: + case memregion_WRAM7: + compileFastPath = true; + break; + default: + break; + } + + if (!store) Comp_AddCycles_CDI(); + else + Comp_AddCycles_CD(); - if (decrement) + if (decrement) + { + MOV_sum(32, RSCRATCH4, MapReg(rn), Imm32(-regsCount * 4)); + preinc ^= true; + } + else + MOV(32, R(RSCRATCH4), MapReg(rn)); + + if (compileFastPath) + { + assert(!usermode); + + MOV(32, R(RSCRATCH), R(RSCRATCH4)); + SHR(32, R(RSCRATCH), Imm8(9)); + + if (store) { - MOV_sum(32, ABI_PARAM1, MapReg(rn), Imm32(-regsCount * 4)); - preinc ^= true; + CMP(8, MDisp(RSCRATCH, squeezePointer(Num == 0 ? MemoryStatus9 : MemoryStatus7)), Imm8(expectedTarget)); } else - MOV(32, R(ABI_PARAM1), MapReg(rn)); - - MOV(32, R(ABI_PARAM3), Imm32(regsCount)); - SUB(64, R(RSP), stackAlloc <= INT8_MAX ? Imm8(stackAlloc) : Imm32(stackAlloc)); - MOV(64, R(ABI_PARAM2), R(RSP)); - - CALL(Num == 0 - ? MemoryFuncsSeq9[0][preinc] - : MemoryFuncsSeq7[0][preinc][CodeRegion == 0x02]); + { + MOVZX(32, 8, RSCRATCH, MDisp(RSCRATCH, squeezePointer(Num == 0 ? MemoryStatus9 : MemoryStatus7))); + AND(32, R(RSCRATCH), Imm8(~0x80)); + CMP(32, R(RSCRATCH), Imm8(expectedTarget)); + } + FixupBranch slowPath = J_CC(CC_NE, true); - bool firstUserMode = true; - for (int reg = 15; reg >= 0; reg--) + if (expectedTarget == memregion_DTCM) { - if (regs[reg]) + SUB(32, R(RSCRATCH4), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); + AND(32, R(RSCRATCH4), Imm32(0x3FFF & ~3)); + LEA(64, RSCRATCH4, MComplex(RCPU, RSCRATCH4, 1, offsetof(ARMv5, DTCM))); + } + else if (expectedTarget == memregion_MainRAM) + { + AND(32, R(RSCRATCH4), Imm32((MAIN_RAM_SIZE - 1) & ~3)); + ADD(64, R(RSCRATCH4), Imm32(squeezePointer(NDS::MainRAM))); + } + else if (expectedTarget == memregion_WRAM7) + { + AND(32, R(RSCRATCH4), Imm32(0xFFFF & ~3)); + ADD(64, R(RSCRATCH4), Imm32(squeezePointer(NDS::ARM7WRAM))); + } + else // SWRAM + { + AND(32, R(RSCRATCH4), Imm8(~3)); + AND(32, R(RSCRATCH4), M(expectedTarget == memregion_SWRAM9 ? &NDS::SWRAM_ARM9Mask : &NDS::SWRAM_ARM7Mask)); + ADD(64, R(RSCRATCH4), M(expectedTarget == memregion_SWRAM9 ? &NDS::SWRAM_ARM9 : &NDS::SWRAM_ARM7)); + } + u32 offset = 0; + for (int reg : regs) + { + if (preinc) + offset += 4; + OpArg mem = MDisp(RSCRATCH4, offset); + if (store) { - if (usermode && !regs[15] && reg >= 8 && reg < 15) + if (RegCache.LoadedRegs & (1 << reg)) { - if (firstUserMode) - { - MOV(32, R(RSCRATCH), R(RCPSR)); - AND(32, R(RSCRATCH), Imm8(0x1F)); - firstUserMode = false; - } - MOV(32, R(RSCRATCH2), Imm32(reg - 8)); - POP(RSCRATCH3); - CALL(WriteBanked); - FixupBranch sucessfulWritten = J_CC(CC_NC); - if (RegCache.Mapping[reg] != INVALID_REG) - MOV(32, R(RegCache.Mapping[reg]), R(RSCRATCH3)); - else - SaveReg(reg, RSCRATCH3); - SetJumpTarget(sucessfulWritten); + MOV(32, mem, MapReg(reg)); } - else if (RegCache.Mapping[reg] == INVALID_REG) + else { - assert(reg != 15); - - POP(RSCRATCH); - SaveReg(reg, RSCRATCH); + LoadReg(reg, RSCRATCH); + MOV(32, mem, R(RSCRATCH)); + } + } + else + { + if (RegCache.LoadedRegs & (1 << reg)) + { + MOV(32, MapReg(reg), mem); } else { - if (reg != 15) - RegCache.DirtyRegs |= (1 << reg); - POP(MapReg(reg).GetSimpleReg()); + MOV(32, R(RSCRATCH), mem); + SaveReg(reg, RSCRATCH); } } + if (!preinc) + offset += 4; } - if (regsCount & 1) - POP(RSCRATCH); + SwitchToFarCode(); + SetJumpTarget(slowPath); + } + + if (!store) + { + MOV(32, R(ABI_PARAM1), R(RSCRATCH4)); + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); + SUB(64, R(RSP), stackAlloc <= INT8_MAX ? Imm8(stackAlloc) : Imm32(stackAlloc)); + if (allocOffset == 0) + MOV(64, R(ABI_PARAM2), R(RSP)); + else + LEA(64, ABI_PARAM2, MDisp(RSP, allocOffset)); + + if (Num == 0) + MOV(64, R(ABI_PARAM4), R(RCPU)); - if (regs[15]) + switch (Num * 2 | preinc) { - if (Num == 1) - { - if (Thumb) - OR(32, MapReg(15), Imm8(1)); - else - AND(32, MapReg(15), Imm8(0xFE)); - } - Comp_JumpTo(MapReg(15).GetSimpleReg(), usermode); + case 0: CALL((void*)&SlowBlockTransfer9<false, false>); break; + case 1: CALL((void*)&SlowBlockTransfer9<true, false>); break; + case 2: CALL((void*)&SlowBlockTransfer7<false, false>); break; + case 3: CALL((void*)&SlowBlockTransfer7<true, false>); break; } - } - else - { - Comp_AddCycles_CD(); - if (regsCount & 1) - PUSH(RSCRATCH); + if (allocOffset) + ADD(64, R(RSP), Imm8(allocOffset)); bool firstUserMode = true; for (int reg : regs) { - if (usermode && reg >= 8 && reg < 15) + if (usermode && !regs[15] && reg >= 8 && reg < 15) { if (firstUserMode) { @@ -599,43 +594,107 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc AND(32, R(RSCRATCH), Imm8(0x1F)); firstUserMode = false; } - if (RegCache.Mapping[reg] == INVALID_REG) - LoadReg(reg, RSCRATCH3); - else - MOV(32, R(RSCRATCH3), R(RegCache.Mapping[reg])); MOV(32, R(RSCRATCH2), Imm32(reg - 8)); - CALL(ReadBanked); - PUSH(RSCRATCH3); + POP(RSCRATCH3); + CALL(WriteBanked); + FixupBranch sucessfulWritten = J_CC(CC_NC); + if (RegCache.LoadedRegs & (1 << reg)) + MOV(32, R(RegCache.Mapping[reg]), R(RSCRATCH3)); + else + SaveReg(reg, RSCRATCH3); + SetJumpTarget(sucessfulWritten); } - else if (RegCache.Mapping[reg] == INVALID_REG) + else if (!(RegCache.LoadedRegs & (1 << reg))) { - LoadReg(reg, RSCRATCH); - PUSH(RSCRATCH); + assert(reg != 15); + + POP(RSCRATCH); + SaveReg(reg, RSCRATCH); } else { - PUSH(MapReg(reg).GetSimpleReg()); + POP(MapReg(reg).GetSimpleReg()); } } - - if (decrement) + } + else + { + bool firstUserMode = true; + for (int reg = 15; reg >= 0; reg--) { - MOV_sum(32, ABI_PARAM1, MapReg(rn), Imm32(-regsCount * 4)); - preinc ^= true; + if (regs[reg]) + { + if (usermode && reg >= 8 && reg < 15) + { + if (firstUserMode) + { + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + firstUserMode = false; + } + if (RegCache.Mapping[reg] == INVALID_REG) + LoadReg(reg, RSCRATCH3); + else + MOV(32, R(RSCRATCH3), R(RegCache.Mapping[reg])); + MOV(32, R(RSCRATCH2), Imm32(reg - 8)); + CALL(ReadBanked); + PUSH(RSCRATCH3); + } + else if (!(RegCache.LoadedRegs & (1 << reg))) + { + LoadReg(reg, RSCRATCH); + PUSH(RSCRATCH); + } + else + { + PUSH(MapReg(reg).GetSimpleReg()); + } + } } - else - MOV(32, R(ABI_PARAM1), MapReg(rn)); - MOV(64, R(ABI_PARAM2), R(RSP)); + if (allocOffset) + SUB(64, R(RSP), Imm8(allocOffset)); + + MOV(32, R(ABI_PARAM1), R(RSCRATCH4)); + if (allocOffset) + LEA(64, ABI_PARAM2, MDisp(RSP, allocOffset)); + else + MOV(64, R(ABI_PARAM2), R(RSP)); + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); + if (Num == 0) + MOV(64, R(ABI_PARAM4), R(RCPU)); - CALL(Num == 0 - ? MemoryFuncsSeq9[1][preinc] - : MemoryFuncsSeq7[1][preinc][CodeRegion == 0x02]); + switch (Num * 2 | preinc) + { + case 0: CALL((void*)&SlowBlockTransfer9<false, true>); break; + case 1: CALL((void*)&SlowBlockTransfer9<true, true>); break; + case 2: CALL((void*)&SlowBlockTransfer7<false, true>); break; + case 3: CALL((void*)&SlowBlockTransfer7<true, true>); break; + } ADD(64, R(RSP), stackAlloc <= INT8_MAX ? Imm8(stackAlloc) : Imm32(stackAlloc)); } + if (compileFastPath) + { + FixupBranch ret = J(true); + SwitchToNearCode(); + SetJumpTarget(ret); + } + + if (!store && regs[15]) + { + if (Num == 1) + { + if (Thumb) + OR(32, MapReg(15), Imm8(1)); + else + AND(32, MapReg(15), Imm8(0xFE)); + } + Comp_JumpTo(MapReg(15).GetSimpleReg(), usermode); + } + return offset; } @@ -786,9 +845,7 @@ void Compiler::T_Comp_LoadPCRel() { u32 offset = (CurInstr.Instr & 0xFF) << 2; u32 addr = (R15 & ~0x2) + offset; - if (Config::JIT_LiteralOptimisations) - Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr); - else + if (!Config::JIT_LiteralOptimisations || !Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr)) Comp_MemAccess(CurInstr.T_Reg(8), 15, ComplexOperand(offset), 32, 0); } diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 28362d9..b50e821 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -373,16 +373,16 @@ Info Decode(bool thumb, u32 num, u32 instr) if (res.Kind == tk_LDMIA || res.Kind == tk_POP) { - u32 set = (instr & 0xFF) & ~(res.DstRegs|res.SrcRegs); - res.NotStrictlyNeeded |= set; + u32 set = (instr & 0xFF); + res.NotStrictlyNeeded |= set & ~(res.DstRegs|res.SrcRegs); res.DstRegs |= set; } if (res.Kind == tk_STMIA || res.Kind == tk_PUSH) { - u32 set = (instr & 0xFF) & ~(res.DstRegs|res.SrcRegs); + u32 set = (instr & 0xFF); if (res.Kind == tk_PUSH && instr & (1 << 8)) set |= (1 << 14); - res.NotStrictlyNeeded |= set; + res.NotStrictlyNeeded |= set & ~(res.DstRegs|res.SrcRegs); res.SrcRegs |= set; } @@ -495,15 +495,15 @@ Info Decode(bool thumb, u32 num, u32 instr) if (res.Kind == ak_LDM) { - u16 set = (instr & 0xFFFF) & ~(res.SrcRegs|res.DstRegs|(1<<15)); + u16 set = (instr & 0xFFFF); + res.NotStrictlyNeeded |= set & ~(res.SrcRegs|res.DstRegs|(1<<15)); res.DstRegs |= set; - res.NotStrictlyNeeded |= set; } if (res.Kind == ak_STM) { - u16 set = (instr & 0xFFFF) & ~(res.SrcRegs|res.DstRegs|(1<<15)); + u16 set = (instr & 0xFFFF); + res.NotStrictlyNeeded |= set & ~(res.SrcRegs|res.DstRegs|(1<<15)); res.SrcRegs |= set; - res.NotStrictlyNeeded |= set; } if ((instr >> 28) < 0xE) diff --git a/src/CP15.cpp b/src/CP15.cpp index 62258e9..e665dbd 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -97,6 +97,10 @@ void ARMv5::CP15DoSavestate(Savestate* file) void ARMv5::UpdateDTCMSetting() { +#ifdef JIT_ENABLED + u32 oldDTCMBase = DTCMBase; + u32 oldDTCMSize = DTCMSize; +#endif if (CP15Control & (1<<16)) { DTCMBase = DTCMSetting & 0xFFFFF000; @@ -109,10 +113,20 @@ void ARMv5::UpdateDTCMSetting() DTCMSize = 0; //printf("DTCM disabled\n"); } +#ifdef JIT_ENABLED + if (oldDTCMBase != DTCMBase || oldDTCMSize != DTCMSize) + { + ARMJIT::UpdateMemoryStatus9(oldDTCMBase, oldDTCMBase + oldDTCMSize); + ARMJIT::UpdateMemoryStatus9(DTCMBase, DTCMBase + DTCMSize); + } +#endif } void ARMv5::UpdateITCMSetting() { +#ifdef JIT_ENABLED + u32 oldITCMSize = ITCMSize; +#endif if (CP15Control & (1<<18)) { ITCMSize = 0x200 << ((ITCMSetting >> 1) & 0x1F); @@ -123,6 +137,10 @@ void ARMv5::UpdateITCMSetting() ITCMSize = 0; //printf("ITCM disabled\n"); } +#ifdef JIT_ENABLED + if (oldITCMSize != ITCMSize) + ARMJIT::UpdateMemoryStatus9(0, std::max(oldITCMSize, ITCMSize)); +#endif } @@ -561,15 +579,9 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x750: -#ifdef JIT_ENABLED - ARMJIT::InvalidateAll(); -#endif ICacheInvalidateAll(); return; case 0x751: -#ifdef JIT_ENABLED - ARMJIT::InvalidateByAddr(ARMJIT::TranslateAddr<0>(val)); -#endif ICacheInvalidateByAddr(val); return; case 0x752: @@ -732,7 +744,7 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) void ARMv5::DataRead8(u32 addr, u32* val) { - DataRegion = addr >> 12; + DataRegion = addr; if (addr < ITCMSize) { @@ -753,7 +765,7 @@ void ARMv5::DataRead8(u32 addr, u32* val) void ARMv5::DataRead16(u32 addr, u32* val) { - DataRegion = addr >> 12; + DataRegion = addr; addr &= ~1; @@ -776,7 +788,7 @@ void ARMv5::DataRead16(u32 addr, u32* val) void ARMv5::DataRead32(u32 addr, u32* val) { - DataRegion = addr >> 12; + DataRegion = addr; addr &= ~3; @@ -820,14 +832,14 @@ void ARMv5::DataRead32S(u32 addr, u32* val) void ARMv5::DataWrite8(u32 addr, u8 val) { - DataRegion = addr >> 12; + DataRegion = addr; if (addr < ITCMSize) { DataCycles = 1; *(u8*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCM(addr & 0x7FFF); + ARMJIT::InvalidateITCMIfNecessary(addr); #endif return; } @@ -844,7 +856,7 @@ void ARMv5::DataWrite8(u32 addr, u8 val) void ARMv5::DataWrite16(u32 addr, u16 val) { - DataRegion = addr >> 12; + DataRegion = addr; addr &= ~1; @@ -853,7 +865,7 @@ void ARMv5::DataWrite16(u32 addr, u16 val) DataCycles = 1; *(u16*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCM(addr & 0x7FFF); + ARMJIT::InvalidateITCMIfNecessary(addr); #endif return; } @@ -870,7 +882,7 @@ void ARMv5::DataWrite16(u32 addr, u16 val) void ARMv5::DataWrite32(u32 addr, u32 val) { - DataRegion = addr >> 12; + DataRegion = addr; addr &= ~3; @@ -879,7 +891,7 @@ void ARMv5::DataWrite32(u32 addr, u32 val) DataCycles = 1; *(u32*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCM(addr & 0x7FFF); + ARMJIT::InvalidateITCMIfNecessary(addr); #endif return; } @@ -903,7 +915,7 @@ void ARMv5::DataWrite32S(u32 addr, u32 val) DataCycles += 1; *(u32*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCM(addr & 0x7FFF); + ARMJIT::InvalidateITCMIfNecessary(addr); #endif return; } diff --git a/src/NDS.cpp b/src/NDS.cpp index 141c565..6e989a8 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -535,10 +535,6 @@ void Reset() KeyCnt = 0; RCnt = 0; -#ifdef JIT_ENABLED - ARMJIT::ResetBlockCache(); -#endif - NDSCart::Reset(); GBACart::Reset(); GPU::Reset(); @@ -548,6 +544,10 @@ void Reset() Wifi::Reset(); AREngine::Reset(); + +#ifdef JIT_ENABLED + ARMJIT::Reset(); +#endif } void Stop() @@ -1058,6 +1058,9 @@ void Halt() void MapSharedWRAM(u8 val) { + if (val == WRAMCnt) + return; + WRAMCnt = val; switch (WRAMCnt & 0x3) @@ -1090,6 +1093,11 @@ void MapSharedWRAM(u8 val) SWRAM_ARM7Mask = 0x7FFF; break; } + +#ifdef JIT_ENABLED + ARMJIT::UpdateMemoryStatus9(0x3000000, 0x3000000 + 0x1000000); + ARMJIT::UpdateMemoryStatus7(0x3000000, 0x3000000 + 0x1000000); +#endif } @@ -1873,12 +1881,18 @@ void ARM9Write8(u32 addr, u8 val) switch (addr & 0xFF000000) { case 0x02000000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateMainRAMIfNecessary(addr); +#endif *(u8*)&MainRAM[addr & (MAIN_RAM_SIZE - 1)] = val; return; case 0x03000000: if (SWRAM_ARM9) { +#ifdef JIT_ENABLED + ARMJIT::InvalidateSWRAM9IfNecessary(addr); +#endif *(u8*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; } return; @@ -1923,12 +1937,18 @@ void ARM9Write16(u32 addr, u16 val) switch (addr & 0xFF000000) { case 0x02000000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateMainRAMIfNecessary(addr); +#endif *(u16*)&MainRAM[addr & (MAIN_RAM_SIZE - 1)] = val; return; case 0x03000000: if (SWRAM_ARM9) { +#ifdef JIT_ENABLED + ARMJIT::InvalidateSWRAM9IfNecessary(addr); +#endif *(u16*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; } return; @@ -1949,7 +1969,12 @@ void ARM9Write16(u32 addr, u16 val) case 0x00200000: GPU::WriteVRAM_BBG<u16>(addr, val); return; case 0x00400000: GPU::WriteVRAM_AOBJ<u16>(addr, val); return; case 0x00600000: GPU::WriteVRAM_BOBJ<u16>(addr, val); return; - default: GPU::WriteVRAM_LCDC<u16>(addr, val); return; + default: +#ifdef JIT_ENABLED + ARMJIT::InvalidateLCDCIfNecessary(addr); +#endif + GPU::WriteVRAM_LCDC<u16>(addr, val); + return; } case 0x07000000: @@ -1989,12 +2014,18 @@ void ARM9Write32(u32 addr, u32 val) switch (addr & 0xFF000000) { case 0x02000000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateMainRAMIfNecessary(addr); +#endif *(u32*)&MainRAM[addr & (MAIN_RAM_SIZE - 1)] = val; return ; case 0x03000000: if (SWRAM_ARM9) { +#ifdef JIT_ENABLED + ARMJIT::InvalidateSWRAM9IfNecessary(addr); +#endif *(u32*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; } return; @@ -2015,7 +2046,12 @@ void ARM9Write32(u32 addr, u32 val) case 0x00200000: GPU::WriteVRAM_BBG<u32>(addr, val); return; case 0x00400000: GPU::WriteVRAM_AOBJ<u32>(addr, val); return; case 0x00600000: GPU::WriteVRAM_BOBJ<u32>(addr, val); return; - default: GPU::WriteVRAM_LCDC<u32>(addr, val); return; + default: +#ifdef JIT_ENABLED + ARMJIT::InvalidateLCDCIfNecessary(addr); +#endif + GPU::WriteVRAM_LCDC<u32>(addr, val); + return; } case 0x07000000: @@ -2279,30 +2315,38 @@ u32 ARM7Read32(u32 addr) void ARM7Write8(u32 addr, u8 val) { -#ifdef JIT_ENABLED - ARMJIT::InvalidateByAddr7(addr); -#endif - switch (addr & 0xFF800000) { case 0x02000000: case 0x02800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateMainRAMIfNecessary(addr); +#endif *(u8*)&MainRAM[addr & (MAIN_RAM_SIZE - 1)] = val; return; case 0x03000000: if (SWRAM_ARM7) { +#ifdef JIT_ENABLED + ARMJIT::InvalidateSWRAM7IfNecessary(addr); +#endif *(u8*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; return; } else { +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WRAMIfNecessary(addr); +#endif *(u8*)&ARM7WRAM[addr & 0xFFFF] = val; return; } case 0x03800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WRAMIfNecessary(addr); +#endif *(u8*)&ARM7WRAM[addr & 0xFFFF] = val; return; @@ -2312,6 +2356,9 @@ void ARM7Write8(u32 addr, u8 val) case 0x06000000: case 0x06800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WVRAMIfNecessary(addr); +#endif GPU::WriteVRAM_ARM7<u8>(addr, val); return; @@ -2342,30 +2389,38 @@ void ARM7Write8(u32 addr, u8 val) void ARM7Write16(u32 addr, u16 val) { -#ifdef JIT_ENABLED - ARMJIT::InvalidateByAddr7(addr); -#endif - switch (addr & 0xFF800000) { case 0x02000000: case 0x02800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateMainRAMIfNecessary(addr); +#endif *(u16*)&MainRAM[addr & (MAIN_RAM_SIZE - 1)] = val; return; case 0x03000000: if (SWRAM_ARM7) { +#ifdef JIT_ENABLED + ARMJIT::InvalidateSWRAM7IfNecessary(addr); +#endif *(u16*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; return; } else { +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WRAMIfNecessary(addr); +#endif *(u16*)&ARM7WRAM[addr & 0xFFFF] = val; return; } case 0x03800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WRAMIfNecessary(addr); +#endif *(u16*)&ARM7WRAM[addr & 0xFFFF] = val; return; @@ -2383,6 +2438,9 @@ void ARM7Write16(u32 addr, u16 val) case 0x06000000: case 0x06800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WVRAMIfNecessary(addr); +#endif GPU::WriteVRAM_ARM7<u16>(addr, val); return; @@ -2415,30 +2473,38 @@ void ARM7Write16(u32 addr, u16 val) void ARM7Write32(u32 addr, u32 val) { -#ifdef JIT_ENABLED - ARMJIT::InvalidateByAddr7(addr); -#endif - switch (addr & 0xFF800000) { case 0x02000000: case 0x02800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateMainRAMIfNecessary(addr); +#endif *(u32*)&MainRAM[addr & (MAIN_RAM_SIZE - 1)] = val; return; case 0x03000000: if (SWRAM_ARM7) { +#ifdef JIT_ENABLED + ARMJIT::InvalidateSWRAM7IfNecessary(addr); +#endif *(u32*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; return; } else { +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WRAMIfNecessary(addr); +#endif *(u32*)&ARM7WRAM[addr & 0xFFFF] = val; return; } case 0x03800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WRAMIfNecessary(addr); +#endif *(u32*)&ARM7WRAM[addr & 0xFFFF] = val; return; @@ -2457,6 +2523,9 @@ void ARM7Write32(u32 addr, u32 val) case 0x06000000: case 0x06800000: +#ifdef JIT_ENABLED + ARMJIT::InvalidateARM7WVRAMIfNecessary(addr); +#endif GPU::WriteVRAM_ARM7<u32>(addr, val); return; @@ -120,6 +120,14 @@ extern u8 ROMSeed1[2*8]; extern u8 ARM9BIOS[0x1000]; extern u8 ARM7BIOS[0x4000]; +extern u8 SharedWRAM[0x8000]; +extern u8* SWRAM_ARM9; +extern u8* SWRAM_ARM7; +extern u32 SWRAM_ARM9Mask; +extern u32 SWRAM_ARM7Mask; + +extern u8 ARM7WRAM[0x10000]; + #define MAIN_RAM_SIZE 0x400000 extern u8 MainRAM[MAIN_RAM_SIZE]; |