diff options
Diffstat (limited to 'src/ARMJIT.cpp')
-rw-r--r-- | src/ARMJIT.cpp | 905 |
1 files changed, 680 insertions, 225 deletions
diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 46f71f1..9602aed 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -23,6 +23,7 @@ #include "ARMInterpreter_Branch.h" #include "ARMInterpreter.h" +#include "GPU.h" #include "GPU3D.h" #include "SPU.h" #include "Wifi.h" @@ -34,9 +35,10 @@ namespace ARMJIT #define JIT_DEBUGPRINT(msg, ...) //#define JIT_DEBUGPRINT(msg, ...) printf(msg, ## __VA_ARGS__) -Compiler* compiler; +Compiler* JITCompiler; -const u32 ExeMemRegionSizes[] = { +const u32 ExeMemRegionSizes[] = +{ 0x8000, // Unmapped Region (dummy) 0x8000, // ITCM 4*1024*1024, // Main RAM @@ -48,7 +50,8 @@ const u32 ExeMemRegionSizes[] = { 0x40000 // ARM7 WVRAM }; -const u32 ExeMemRegionOffsets[] = { +const u32 ExeMemRegionOffsets[] = +{ 0, 0x8000, 0x10000, @@ -61,65 +64,391 @@ const u32 ExeMemRegionOffsets[] = { 0x518000, }; -#define DUP2(x) x, x - -const static ExeMemKind JIT_MEM[2][32] = { - //arm9 - { - /* 0X*/ DUP2(exeMem_ITCM), - /* 1X*/ DUP2(exeMem_ITCM), // mirror - /* 2X*/ DUP2(exeMem_MainRAM), - /* 3X*/ DUP2(exeMem_SWRAM), - /* 4X*/ DUP2(exeMem_Unmapped), - /* 5X*/ DUP2(exeMem_Unmapped), - /* 6X*/ exeMem_Unmapped, - exeMem_LCDC, // Plain ARM9-CPU Access (LCDC mode) (max 656KB) - /* 7X*/ DUP2(exeMem_Unmapped), - /* 8X*/ DUP2(exeMem_Unmapped), - /* 9X*/ DUP2(exeMem_Unmapped), - /* AX*/ DUP2(exeMem_Unmapped), - /* BX*/ DUP2(exeMem_Unmapped), - /* CX*/ DUP2(exeMem_Unmapped), - /* DX*/ DUP2(exeMem_Unmapped), - /* EX*/ DUP2(exeMem_Unmapped), - /* FX*/ DUP2(exeMem_ARM9_BIOS) - }, - //arm7 - { - /* 0X*/ DUP2(exeMem_ARM7_BIOS), - /* 1X*/ DUP2(exeMem_Unmapped), - /* 2X*/ DUP2(exeMem_MainRAM), - /* 3X*/ exeMem_SWRAM, - exeMem_ARM7_WRAM, - /* 4X*/ DUP2(exeMem_Unmapped), - /* 5X*/ DUP2(exeMem_Unmapped), - /* 6X*/ DUP2(exeMem_ARM7_WVRAM), /* contrary to Gbatek, melonDS and itself, - DeSmuME doesn't mirror the 64 MB region at 0x6800000 */ - /* 7X*/ DUP2(exeMem_Unmapped), - /* 8X*/ DUP2(exeMem_Unmapped), - /* 9X*/ DUP2(exeMem_Unmapped), - /* AX*/ DUP2(exeMem_Unmapped), - /* BX*/ DUP2(exeMem_Unmapped), - /* CX*/ DUP2(exeMem_Unmapped), - /* DX*/ DUP2(exeMem_Unmapped), - /* EX*/ DUP2(exeMem_Unmapped), - /* FX*/ DUP2(exeMem_Unmapped) - } -}; - -#undef DUP2 - /* translates address to pseudo physical address - more compact, eliminates mirroring, everything comes in a row - we only need one translation table */ -u32 AddrTranslate9[0x2000]; -u32 AddrTranslate7[0x4000]; + +u32 TranslateAddr9(u32 addr) +{ + switch (ClassifyAddress9(addr)) + { + case memregion_MainRAM: return ExeMemRegionOffsets[exeMem_MainRAM] + (addr & (MAIN_RAM_SIZE - 1)); + case memregion_SWRAM9: + if (NDS::SWRAM_ARM9) + return ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM9 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM9Mask); + else + return 0; + case memregion_ITCM: return ExeMemRegionOffsets[exeMem_ITCM] + (addr & 0x7FFF); + case memregion_VRAM: return (addr >= 0x6800000 && addr < 0x68A4000) ? ExeMemRegionOffsets[exeMem_LCDC] + (addr - 0x6800000) : 0; + case memregion_BIOS9: return ExeMemRegionOffsets[exeMem_ARM9_BIOS] + (addr & 0xFFF); + default: return 0; + } +} + +u32 TranslateAddr7(u32 addr) +{ + switch (ClassifyAddress7(addr)) + { + case memregion_MainRAM: return ExeMemRegionOffsets[exeMem_MainRAM] + (addr & (MAIN_RAM_SIZE - 1)); + case memregion_SWRAM7: + if (NDS::SWRAM_ARM7) + return ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM7 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM7Mask); + else + return 0; + case memregion_BIOS7: return ExeMemRegionOffsets[exeMem_ARM7_BIOS] + addr; + case memregion_WRAM7: return ExeMemRegionOffsets[exeMem_ARM7_WRAM] + (addr & 0xFFFF); + case memregion_VWRAM: return ExeMemRegionOffsets[exeMem_ARM7_WVRAM] + (addr & 0x1FFFF); + default: return 0; + } +} AddressRange CodeRanges[ExeMemSpaceSize / 512]; -std::unordered_map<u32, JitBlock*> JitBlocks; +TinyVector<u32> InvalidLiterals; + +std::unordered_map<u32, JitBlock*> JitBlocks9; +std::unordered_map<u32, JitBlock*> JitBlocks7; + +u8 MemoryStatus9[0x800000]; +u8 MemoryStatus7[0x800000]; + +int ClassifyAddress9(u32 addr) +{ + if (addr < NDS::ARM9->ITCMSize) + return memregion_ITCM; + else if (addr >= NDS::ARM9->DTCMBase && addr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) + return memregion_DTCM; + else if ((addr & 0xFFFFF000) == 0xFFFF0000) + return memregion_BIOS9; + else + { + switch (addr & 0xFF000000) + { + case 0x02000000: + return memregion_MainRAM; + case 0x03000000: + return memregion_SWRAM9; + case 0x04000000: + return memregion_IO9; + case 0x06000000: + return memregion_VRAM; + } + } + return memregion_Other; +} + +int ClassifyAddress7(u32 addr) +{ + if (addr < 0x00004000) + return memregion_BIOS7; + else + { + switch (addr & 0xFF800000) + { + case 0x02000000: + case 0x02800000: + return memregion_MainRAM; + case 0x03000000: + if (NDS::SWRAM_ARM7) + return memregion_SWRAM7; + else + return memregion_WRAM7; + case 0x03800000: + return memregion_WRAM7; + case 0x04000000: + return memregion_IO7; + case 0x04800000: + return memregion_Wifi; + case 0x06000000: + case 0x06800000: + return memregion_VWRAM; + } + } + return memregion_Other; +} + +void UpdateMemoryStatus9(u32 start, u32 end) +{ + start >>= 12; + end >>= 12; + + if (end == 0xFFFFF) + end++; + + for (u32 i = start; i < end; i++) + { + u32 addr = i << 12; + + int region = ClassifyAddress9(addr); + u32 pseudoPhyisical = TranslateAddr9(addr); + + for (u32 j = 0; j < 8; j++) + { + u8 val = region; + if (CodeRanges[(pseudoPhyisical + (j << 12)) / 512].Blocks.Length) + val |= 0x80; + MemoryStatus9[i * 8 + j] = val; + } + } +} + +void UpdateMemoryStatus7(u32 start, u32 end) +{ + start >>= 12; + end >>= 12; + + if (end == 0xFFFFF) + end++; + + for (u32 i = start; i < end; i++) + { + u32 addr = i << 12; + + int region = ClassifyAddress7(addr); + u32 pseudoPhyisical = TranslateAddr7(addr); + + for (u32 j = 0; j < 8; j++) + { + u8 val = region; + if (CodeRanges[(pseudoPhyisical + (j << 12)) / 512].Blocks.Length) + val |= 0x80; + MemoryStatus7[i * 8 + j] = val; + } + } +} + +void UpdateRegionByPseudoPhyiscal(u32 addr, bool invalidate) +{ + for (u32 i = 1; i < exeMem_Count; i++) + { + if (addr >= ExeMemRegionOffsets[i] && addr < ExeMemRegionOffsets[i] + ExeMemRegionSizes[i]) + { + for (u32 num = 0; num < 2; num++) + { + u32 physSize = ExeMemRegionSizes[i]; + u32 mapSize = 0; + u32 mapStart = 0; + switch (i) + { + case exeMem_ITCM: + if (num == 0) + mapStart = 0; mapSize = NDS::ARM9->ITCMSize; + break; + case exeMem_MainRAM: mapStart = 0x2000000; mapSize = 0x1000000; break; + case exeMem_SWRAM: + if (num == 0) + { + if (NDS::SWRAM_ARM9) + mapStart = 0x3000000, mapSize = 0x1000000; + else + mapStart = mapSize = 0; + } + else + { + if (NDS::SWRAM_ARM7) + mapStart = 0x3000000, mapSize = 0x800000; + else + mapStart = mapSize = 0; + } + break; + case exeMem_LCDC: + if (num == 0) + mapStart = 0x6800000, mapSize = 0xA4000; + break; + case exeMem_ARM9_BIOS: + if (num == 0) + mapStart = 0xFFFF0000, mapSize = 0x10000; + break; + case exeMem_ARM7_BIOS: + if (num == 1) + mapStart = 0; mapSize = 0x4000; + break; + case exeMem_ARM7_WRAM: + if (num == 1) + { + if (NDS::SWRAM_ARM7) + mapStart = 0x3800000, mapSize = 0x800000; + else + mapStart = 0x3000000, mapSize = 0x1000000; + } + break; + case exeMem_ARM7_WVRAM: + if (num == 1) + mapStart = 0x6000000, mapSize = 0x1000000; + break; + } + + for (u32 j = 0; j < mapSize / physSize; j++) + { + u32 virtAddr = mapStart + physSize * j + (addr - ExeMemRegionOffsets[i]); + if (num == 0 + && virtAddr >= NDS::ARM9->DTCMBase && virtAddr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) + continue; + if (invalidate) + { + if (num == 0) + MemoryStatus9[virtAddr / 512] |= 0x80; + else + MemoryStatus7[virtAddr / 512] |= 0x80; + } + else + { + if (num == 0) + MemoryStatus9[virtAddr / 512] &= ~0x80; + else + MemoryStatus7[virtAddr / 512] &= ~0x80; + } + } + + } + return; + } + } + + assert(false); +} + +template <typename T> +T SlowRead9(ARMv5* cpu, u32 addr) +{ + u32 offset = addr & 0x3; + addr &= ~(sizeof(T) - 1); + + T val; + if (addr < cpu->ITCMSize) + val = *(T*)&cpu->ITCM[addr & 0x7FFF]; + else if (addr >= cpu->DTCMBase && addr < (cpu->DTCMBase + cpu->DTCMSize)) + val = *(T*)&cpu->DTCM[(addr - cpu->DTCMBase) & 0x3FFF]; + else if (std::is_same<T, u32>::value) + val = NDS::ARM9Read32(addr); + else if (std::is_same<T, u16>::value) + val = NDS::ARM9Read16(addr); + else + val = NDS::ARM9Read8(addr); + + if (std::is_same<T, u32>::value) + return ROR(val, offset << 3); + else + return val; +} + +template <typename T> +void SlowWrite9(ARMv5* cpu, u32 addr, T val) +{ + addr &= ~(sizeof(T) - 1); + + if (addr < cpu->ITCMSize) + { + InvalidateITCMIfNecessary(addr); + *(T*)&cpu->ITCM[addr & 0x7FFF] = val; + } + else if (addr >= cpu->DTCMBase && addr < (cpu->DTCMBase + cpu->DTCMSize)) + { + *(T*)&cpu->DTCM[(addr - cpu->DTCMBase) & 0x3FFF] = val; + } + else if (std::is_same<T, u32>::value) + { + NDS::ARM9Write32(addr, val); + } + else if (std::is_same<T, u16>::value) + { + NDS::ARM9Write16(addr, val); + } + else + { + NDS::ARM9Write8(addr, val); + } +} + +template void SlowWrite9<u32>(ARMv5*, u32, u32); +template void SlowWrite9<u16>(ARMv5*, u32, u16); +template void SlowWrite9<u8>(ARMv5*, u32, u8); + +template u32 SlowRead9<u32>(ARMv5*, u32); +template u16 SlowRead9<u16>(ARMv5*, u32); +template u8 SlowRead9<u8>(ARMv5*, u32); + +template <typename T> +T SlowRead7(u32 addr) +{ + u32 offset = addr & 0x3; + addr &= ~(sizeof(T) - 1); + + T val; + if (std::is_same<T, u32>::value) + val = NDS::ARM7Read32(addr); + else if (std::is_same<T, u16>::value) + val = NDS::ARM7Read16(addr); + else + val = NDS::ARM7Read8(addr); + + if (std::is_same<T, u32>::value) + return ROR(val, offset << 3); + else + return val; +} + +template <typename T> +void SlowWrite7(u32 addr, T val) +{ + addr &= ~(sizeof(T) - 1); + + if (std::is_same<T, u32>::value) + NDS::ARM7Write32(addr, val); + else if (std::is_same<T, u16>::value) + NDS::ARM7Write16(addr, val); + else + NDS::ARM7Write8(addr, val); +} + +template <bool PreInc, bool Write> +void SlowBlockTransfer9(u32 addr, u64* data, u32 num, ARMv5* cpu) +{ + addr &= ~0x3; + for (int i = 0; i < num; i++) + { + addr += PreInc * 4; + if (Write) + SlowWrite9<u32>(cpu, addr, data[i]); + else + data[i] = SlowRead9<u32>(cpu, addr); + addr += !PreInc * 4; + } +} + +template <bool PreInc, bool Write> +void SlowBlockTransfer7(u32 addr, u64* data, u32 num) +{ + addr &= ~0x3; + for (int i = 0; i < num; i++) + { + addr += PreInc * 4; + if (Write) + SlowWrite7<u32>(addr, data[i]); + else + data[i] = SlowRead7<u32>(addr); + addr += !PreInc * 4; + } +} + +template void SlowWrite7<u32>(u32, u32); +template void SlowWrite7<u16>(u32, u16); +template void SlowWrite7<u8>(u32, u8); + +template u32 SlowRead7<u32>(u32); +template u16 SlowRead7<u16>(u32); +template u8 SlowRead7<u8>(u32); + +template void SlowBlockTransfer9<false, false>(u32, u64*, u32, ARMv5*); +template void SlowBlockTransfer9<false, true>(u32, u64*, u32, ARMv5*); +template void SlowBlockTransfer9<true, false>(u32, u64*, u32, ARMv5*); +template void SlowBlockTransfer9<true, true>(u32, u64*, u32, ARMv5*); +template void SlowBlockTransfer7<false, false>(u32 addr, u64* data, u32 num); +template void SlowBlockTransfer7<false, true>(u32 addr, u64* data, u32 num); +template void SlowBlockTransfer7<true, false>(u32 addr, u64* data, u32 num); +template void SlowBlockTransfer7<true, true>(u32 addr, u64* data, u32 num); template <typename K, typename V, int Size, V InvalidValue> struct UnreliableHashTable @@ -211,31 +540,25 @@ struct UnreliableHashTable }; UnreliableHashTable<u32, JitBlock*, 0x800, nullptr> RestoreCandidates; -UnreliableHashTable<u32, u32, 0x1000, UINT32_MAX> FastBlockLookUp; +UnreliableHashTable<u32, u32, 0x800, UINT32_MAX> FastBlockLookUp9; +UnreliableHashTable<u32, u32, 0x800, UINT32_MAX> FastBlockLookUp7; void Init() { - for (int i = 0; i < 0x2000; i++) - { - ExeMemKind kind = JIT_MEM[0][i >> 8]; - u32 size = ExeMemRegionSizes[kind]; - - AddrTranslate9[i] = ExeMemRegionOffsets[kind] + ((i << 15) & (size - 1)); - } - for (int i = 0; i < 0x4000; i++) - { - ExeMemKind kind = JIT_MEM[1][i >> 9]; - u32 size = ExeMemRegionSizes[kind]; - - AddrTranslate7[i] = ExeMemRegionOffsets[kind] + ((i << 14) & (size - 1)); - } - - compiler = new Compiler(); + JITCompiler = new Compiler(); } void DeInit() { - delete compiler; + delete JITCompiler; +} + +void Reset() +{ + ResetBlockCache(); + + UpdateMemoryStatus9(0, 0xFFFFFFFF); + UpdateMemoryStatus7(0, 0xFFFFFFFF); } void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) @@ -256,25 +579,31 @@ void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) } } -bool DecodeLiteral(const FetchedInstr& instr, u32& addr) +bool DecodeLiteral(bool thumb, const FetchedInstr& instr, u32& addr) { - switch (instr.Info.Kind) + if (!thumb) { - case ARMInstrInfo::ak_STR_IMM: - case ARMInstrInfo::ak_STRB_IMM: - addr = (instr.Addr + 8) + ((instr.Instr & 0xFFF) * (instr.Instr & (1 << 23) ? 1 : -1)); - return true; - case ARMInstrInfo::ak_STRD_IMM: - case ARMInstrInfo::ak_STRH_IMM: - addr = (instr.Addr + 8) + (((instr.Instr & 0xF00) >> 4 | (instr.Instr & 0xF)) * (instr.Instr & (1 << 23) ? 1 : -1)); - return true; - case ARMInstrInfo::ak_STM: // I honestly hope noone was ever crazy enough to do stm pc, {whatever} - addr = instr.Addr + 8; + switch (instr.Info.Kind) + { + case ARMInstrInfo::ak_LDR_IMM: + case ARMInstrInfo::ak_LDRB_IMM: + addr = (instr.Addr + 8) + ((instr.Instr & 0xFFF) * (instr.Instr & (1 << 23) ? 1 : -1)); + return true; + case ARMInstrInfo::ak_LDRH_IMM: + addr = (instr.Addr + 8) + (((instr.Instr & 0xF00) >> 4 | (instr.Instr & 0xF)) * (instr.Instr & (1 << 23) ? 1 : -1)); + return true; + default: + break; + } + } + else if (instr.Info.Kind == ARMInstrInfo::tk_LDR_PCREL) + { + addr = ((instr.Addr + 4) & ~0x2) + ((instr.Instr & 0xFF) << 2); return true; - default: - JIT_DEBUGPRINT("Literal %08x %x not recognised\n", instr.Instr, instr.Addr); - return false; } + + JIT_DEBUGPRINT("Literal %08x %x not recognised %d\n", instr.Instr, instr.Addr, instr.Info.Kind); + return false; } bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, bool hasLink, u32 lr, bool& link, @@ -453,6 +782,8 @@ InterpreterFunc InterpretTHUMB[ARMInstrInfo::tk_Count] = }; #undef F + +extern u32 literalsPerBlock; void CompileBlock(ARM* cpu) { bool thumb = cpu->CPSR & 0x20; @@ -463,31 +794,33 @@ void CompileBlock(ARM* cpu) Config::JIT_MaxBlockSize = 32; u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4); - if (!(cpu->Num == 0 - ? IsMapped<0>(blockAddr) - : IsMapped<1>(blockAddr))) + u32 pseudoPhysicalAddr = cpu->Num == 0 + ? TranslateAddr9(blockAddr) + : TranslateAddr7(blockAddr); + if (pseudoPhysicalAddr < ExeMemRegionSizes[exeMem_Unmapped]) { printf("Trying to compile a block in unmapped memory: %x\n", blockAddr); } - u32 pseudoPhysicalAddr = cpu->Num == 0 - ? TranslateAddr<0>(blockAddr) - : TranslateAddr<1>(blockAddr); - FetchedInstr instrs[Config::JIT_MaxBlockSize]; int i = 0; u32 r15 = cpu->R[15]; - u32 addresseRanges[32] = {}; + u32 addressRanges[Config::JIT_MaxBlockSize]; + u32 addressMasks[Config::JIT_MaxBlockSize] = {0}; u32 numAddressRanges = 0; + u32 numLiterals = 0; + u32 literalLoadAddrs[Config::JIT_MaxBlockSize]; + // they are going to be hashed + u32 literalValues[Config::JIT_MaxBlockSize]; + u32 instrValues[Config::JIT_MaxBlockSize]; + cpu->FillPipeline(); u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; u32 nextInstrAddr[2] = {blockAddr, r15}; - JIT_DEBUGPRINT("start block %x %08x (%x) (region invalidates %dx)\n", - blockAddr, cpu->CPSR, pseudoPhysicalAddr, - CodeRanges[pseudoPhysicalAddr / 512].TimesInvalidated); + JIT_DEBUGPRINT("start block %x %08x (%x)\n", blockAddr, cpu->CPSR, pseudoPhysicalAddr); u32 lastSegmentStart = blockAddr; u32 lr; @@ -507,23 +840,29 @@ void CompileBlock(ARM* cpu) nextInstrAddr[1] = r15; JIT_DEBUGPRINT("instr %08x %x\n", instrs[i].Instr & (thumb ? 0xFFFF : ~0), instrs[i].Addr); - u32 translatedAddr = (cpu->Num == 0 - ? TranslateAddr<0>(instrs[i].Addr) - : TranslateAddr<1>(instrs[i].Addr)) & ~0x1FF; - if (i == 0 || translatedAddr != addresseRanges[numAddressRanges - 1]) + instrValues[i] = instrs[i].Instr; + + u32 translatedAddr = cpu->Num == 0 + ? TranslateAddr9(instrs[i].Addr) + : TranslateAddr7(instrs[i].Addr); + u32 translatedAddrRounded = translatedAddr & ~0x1FF; + if (i == 0 || translatedAddrRounded != addressRanges[numAddressRanges - 1]) { bool returning = false; for (int j = 0; j < numAddressRanges; j++) { - if (addresseRanges[j] == translatedAddr) + if (addressRanges[j] == translatedAddrRounded) { + std::swap(addressRanges[j], addressRanges[numAddressRanges - 1]); + std::swap(addressMasks[j], addressMasks[numAddressRanges - 1]); returning = true; break; } } if (!returning) - addresseRanges[numAddressRanges++] = translatedAddr; + addressRanges[numAddressRanges++] = translatedAddrRounded; } + addressMasks[numAddressRanges - 1] |= 1 << ((translatedAddr & 0x1FF) / 16); if (cpu->Num == 0) { @@ -572,7 +911,8 @@ void CompileBlock(ARM* cpu) u32 icode = ((instrs[i].Instr >> 4) & 0xF) | ((instrs[i].Instr >> 16) & 0xFF0); assert(InterpretARM[instrs[i].Info.Kind] == ARMInterpreter::ARMInstrTable[icode] || instrs[i].Info.Kind == ARMInstrInfo::ak_MOV_REG_LSL_IMM - || instrs[i].Info.Kind == ARMInstrInfo::ak_Nop); + || instrs[i].Info.Kind == ARMInstrInfo::ak_Nop + || instrs[i].Info.Kind == ARMInstrInfo::ak_UNK); if (cpu->CheckCondition(instrs[i].Cond())) InterpretARM[instrs[i].Info.Kind](cpu); else @@ -583,21 +923,26 @@ void CompileBlock(ARM* cpu) instrs[i].DataCycles = cpu->DataCycles; instrs[i].DataRegion = cpu->DataRegion; - if (instrs[i].Info.SpecialKind == ARMInstrInfo::special_WriteMem - && instrs[i].Info.SrcRegs == (1 << 15) - && instrs[i].Info.DstRegs == 0) + u32 literalAddr; + if (Config::JIT_LiteralOptimisations + && instrs[i].Info.SpecialKind == ARMInstrInfo::special_LoadLiteral + && DecodeLiteral(thumb, instrs[i], literalAddr)) { - assert (!thumb); - - u32 addr; - if (DecodeLiteral(instrs[i], addr)) - { - JIT_DEBUGPRINT("pc relative write detected\n"); - u32 translatedAddr = cpu->Num == 0 ? TranslateAddr<0>(addr) : TranslateAddr<1>(addr); - - ARMJIT::InvalidateByAddr(translatedAddr, false); - CodeRanges[translatedAddr / 512].InvalidLiterals |= (1 << ((translatedAddr & 0x1FF) / 16)); - } + u32 translatedAddr = cpu->Num == 0 + ? TranslateAddr9(literalAddr) + : TranslateAddr7(literalAddr); + u32 translatedAddrRounded = translatedAddr & ~0x1FF; + + u32 j = 0; + for (; j < numAddressRanges; j++) + if (addressRanges[j] == translatedAddrRounded) + break; + if (j == numAddressRanges) + addressRanges[numAddressRanges++] = translatedAddrRounded; + addressMasks[j] |= 1 << ((translatedAddr & 0x1FF) / 16); + JIT_DEBUGPRINT("literal loading %08x %08x %08x %08x\n", literalAddr, translatedAddr, addressMasks[j], addressRanges[j]); + cpu->DataRead32(literalAddr, &literalValues[numLiterals]); + literalLoadAddrs[numLiterals++] = translatedAddr; } if (thumb && instrs[i].Info.Kind == ARMInstrInfo::tk_BL_LONG_2 && i > 0 @@ -650,8 +995,8 @@ void CompileBlock(ARM* cpu) else if (hasBranched && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) { u32 targetPseudoPhysical = cpu->Num == 0 - ? TranslateAddr<0>(target) - : TranslateAddr<1>(target); + ? TranslateAddr9(target) + : TranslateAddr7(target); if (link) { @@ -688,36 +1033,29 @@ void CompileBlock(ARM* cpu) i++; - bool canCompile = compiler->CanCompile(thumb, instrs[i - 1].Info.Kind); + bool canCompile = JITCompiler->CanCompile(thumb, instrs[i - 1].Info.Kind); bool secondaryFlagReadCond = !canCompile || (instrs[i - 1].BranchFlags & (branch_FollowCondTaken | branch_FollowCondNotTaken)); if (instrs[i - 1].Info.ReadFlags != 0 || secondaryFlagReadCond) FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? instrs[i - 1].Info.ReadFlags : 0xF); } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted && (!cpu->IRQ || (cpu->CPSR & 0x80))); + u32 literalHash = (u32)XXH3_64bits(literalValues, numLiterals * 4); + u32 instrHash = (u32)XXH3_64bits(instrValues, i * 4); + JitBlock* prevBlock = RestoreCandidates.LookUp(pseudoPhysicalAddr); bool mayRestore = true; if (prevBlock) { RestoreCandidates.Remove(pseudoPhysicalAddr); - if (prevBlock->NumInstrs == i) - { - for (int j = 0; j < i; j++) - { - if (prevBlock->Instrs()[j] != instrs[j].Instr) - { - mayRestore = false; - break; - } - } - } - else - mayRestore = false; - if (prevBlock->NumAddresses == numAddressRanges) + mayRestore = prevBlock->LiteralHash == literalHash && prevBlock->InstrHash == instrHash; + + if (mayRestore && prevBlock->NumAddresses == numAddressRanges) { for (int j = 0; j < numAddressRanges; j++) { - if (prevBlock->AddressRanges()[j] != addresseRanges[j]) + if (prevBlock->AddressRanges()[j] != addressRanges[j] + || prevBlock->AddressMasks()[j] != addressMasks[j]) { mayRestore = false; break; @@ -739,18 +1077,21 @@ void CompileBlock(ARM* cpu) if (prevBlock) delete prevBlock; - block = new JitBlock(i, numAddressRanges); - for (int j = 0; j < i; j++) - block->Instrs()[j] = instrs[j].Instr; + block = new JitBlock(cpu->Num, i, numAddressRanges, numLiterals); + block->LiteralHash = literalHash; + block->InstrHash = instrHash; + for (int j = 0; j < numAddressRanges; j++) + block->AddressRanges()[j] = addressRanges[j]; for (int j = 0; j < numAddressRanges; j++) - block->AddressRanges()[j] = addresseRanges[j]; + block->AddressMasks()[j] = addressMasks[j]; + for (int j = 0; j < numLiterals; j++) + block->Literals()[j] = literalLoadAddrs[j]; - block->StartAddr = blockAddr; block->PseudoPhysicalAddr = pseudoPhysicalAddr; FloodFillSetFlags(instrs, i - 1, 0xF); - block->EntryPoint = compiler->CompileBlock(pseudoPhysicalAddr, cpu, thumb, instrs, i); + block->EntryPoint = JITCompiler->CompileBlock(pseudoPhysicalAddr, cpu, thumb, instrs, i); } else { @@ -760,23 +1101,73 @@ void CompileBlock(ARM* cpu) for (int j = 0; j < numAddressRanges; j++) { - assert(addresseRanges[j] == block->AddressRanges()[j]); - CodeRanges[addresseRanges[j] / 512].Blocks.Add(block); + assert(addressRanges[j] == block->AddressRanges()[j]); + assert(addressMasks[j] == block->AddressMasks()[j]); + assert(addressMasks[j] != 0); + CodeRanges[addressRanges[j] / 512].Code |= addressMasks[j]; + CodeRanges[addressRanges[j] / 512].Blocks.Add(block); + + UpdateRegionByPseudoPhyiscal(addressRanges[j], true); } - JitBlocks[pseudoPhysicalAddr] = block; - FastBlockLookUp.Insert(pseudoPhysicalAddr, compiler->SubEntryOffset(block->EntryPoint)); + if (cpu->Num == 0) + { + JitBlocks9[pseudoPhysicalAddr] = block; + FastBlockLookUp9.Insert(pseudoPhysicalAddr, JITCompiler->SubEntryOffset(block->EntryPoint)); + } + else + { + JitBlocks7[pseudoPhysicalAddr] = block; + FastBlockLookUp7.Insert(pseudoPhysicalAddr, JITCompiler->SubEntryOffset(block->EntryPoint)); + } } -void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore) +void InvalidateByAddr(u32 pseudoPhysical) { JIT_DEBUGPRINT("invalidating by addr %x\n", pseudoPhysical); AddressRange* range = &CodeRanges[pseudoPhysical / 512]; - int startLength = range->Blocks.Length; - for (int i = 0; i < range->Blocks.Length; i++) + u32 mask = 1 << ((pseudoPhysical & 0x1FF) / 16); + + range->Code = 0; + for (int i = 0; i < range->Blocks.Length;) { - assert(range->Blocks.Length == startLength); JitBlock* block = range->Blocks[i]; + + bool invalidated = false; + u32 mask = 0; + for (int j = 0; j < block->NumAddresses; j++) + { + if (block->AddressRanges()[j] == (pseudoPhysical & ~0x1FF)) + { + mask = block->AddressMasks()[j]; + invalidated = block->AddressMasks()[j] & mask; + break; + } + } + assert(mask); + if (!invalidated) + { + range->Code |= mask; + i++; + continue; + } + range->Blocks.Remove(i); + + bool literalInvalidation = false; + for (int j = 0; j < block->NumLiterals; j++) + { + u32 addr = block->Literals()[j]; + if (addr == pseudoPhysical) + { + if (InvalidLiterals.Find(pseudoPhysical) != -1) + { + InvalidLiterals.Add(pseudoPhysical); + JIT_DEBUGPRINT("found invalid literal %d\n", InvalidLiterals.Length); + } + literalInvalidation = true; + break; + } + } for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; @@ -786,76 +1177,59 @@ void InvalidateByAddr(u32 pseudoPhysical, bool mayRestore) assert(otherRange != range); bool removed = otherRange->Blocks.RemoveByValue(block); assert(removed); + + if (otherRange->Blocks.Length == 0) + { + otherRange->Code = 0; + UpdateRegionByPseudoPhyiscal(addr, false); + } } } for (int j = 0; j < block->NumLinks(); j++) - compiler->UnlinkBlock(block->Links()[j]); + JITCompiler->UnlinkBlock(block->Links()[j]); + block->ResetLinks(); - JitBlocks.erase(block->PseudoPhysicalAddr); - FastBlockLookUp.Remove(block->PseudoPhysicalAddr); + if (block->Num == 0) + { + JitBlocks9.erase(block->PseudoPhysicalAddr); + FastBlockLookUp9.Remove(block->PseudoPhysicalAddr); + } + else + { + JitBlocks7.erase(block->PseudoPhysicalAddr); + FastBlockLookUp7.Remove(block->PseudoPhysicalAddr); + } - if (mayRestore) + if (!literalInvalidation) { JitBlock* prevBlock = RestoreCandidates.Insert(block->PseudoPhysicalAddr, block); if (prevBlock) delete prevBlock; } + else + { + delete block; + } } - if ((range->TimesInvalidated + 1) > range->TimesInvalidated) - range->TimesInvalidated++; - - range->Blocks.Clear(); -} -void InvalidateByAddr7(u32 addr) -{ - u32 pseudoPhysical = TranslateAddr<1>(addr); - if (__builtin_expect(CodeRanges[pseudoPhysical / 512].Blocks.Length > 0, false)) - InvalidateByAddr(pseudoPhysical); + if (range->Blocks.Length == 0) + UpdateRegionByPseudoPhyiscal(pseudoPhysical, false); } -void InvalidateITCM(u32 addr) +void InvalidateRegionIfNecessary(u32 pseudoPhyisical) { - u32 pseudoPhysical = addr + ExeMemRegionOffsets[exeMem_ITCM]; - if (CodeRanges[pseudoPhysical / 512].Blocks.Length > 0) - InvalidateByAddr(pseudoPhysical); -} - -void InvalidateAll() -{ - JIT_DEBUGPRINT("invalidating all %x\n", JitBlocks.size()); - for (auto it : JitBlocks) - { - JitBlock* block = it.second; - - FastBlockLookUp.Remove(block->PseudoPhysicalAddr); - - for (int i = 0; i < block->NumAddresses; i++) - { - u32 addr = block->AddressRanges()[i]; - AddressRange* range = &CodeRanges[addr / 512]; - range->Blocks.Clear(); - if (range->TimesInvalidated + 1 > range->TimesInvalidated) - range->TimesInvalidated++; - } - for (int i = 0; i < block->NumLinks(); i++) - compiler->UnlinkBlock(block->Links()[i]); - block->ResetLinks(); - - JitBlock* prevBlock = RestoreCandidates.Insert(block->PseudoPhysicalAddr, block); - if (prevBlock) - delete prevBlock; - } - - JitBlocks.clear(); + if (CodeRanges[pseudoPhyisical / 512].Code & (1 << ((pseudoPhyisical & 0x1FF) / 16))) + InvalidateByAddr(pseudoPhyisical); } void ResetBlockCache() { printf("Resetting JIT block cache...\n"); - FastBlockLookUp.Reset(); + InvalidLiterals.Clear(); + FastBlockLookUp9.Reset(); + FastBlockLookUp7.Reset(); RestoreCandidates.Reset(); for (int i = 0; i < sizeof(RestoreCandidates.Table)/sizeof(RestoreCandidates.Table[0]); i++) { @@ -870,61 +1244,119 @@ void ResetBlockCache() RestoreCandidates.Table[i].ValB = NULL; } } - for (auto it : JitBlocks) + for (auto it : JitBlocks9) { JitBlock* block = it.second; for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; CodeRanges[addr / 512].Blocks.Clear(); - CodeRanges[addr / 512].TimesInvalidated = 0; - CodeRanges[addr / 512].InvalidLiterals = 0; + CodeRanges[addr / 512].Code = 0; } delete block; } - JitBlocks.clear(); + for (auto it : JitBlocks7) + { + JitBlock* block = it.second; + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + CodeRanges[addr / 512].Blocks.Clear(); + CodeRanges[addr / 512].Code = 0; + } + } + JitBlocks9.clear(); + JitBlocks7.clear(); - compiler->Reset(); + JITCompiler->Reset(); } +template <u32 Num> JitBlockEntry LookUpBlockEntry(u32 addr) { - u32 entryOffset = FastBlockLookUp.LookUp(addr); + auto& fastMap = Num == 0 ? FastBlockLookUp9 : FastBlockLookUp7; + u32 entryOffset = fastMap.LookUp(addr); if (entryOffset != UINT32_MAX) - return compiler->AddEntryOffset(entryOffset); + return JITCompiler->AddEntryOffset(entryOffset); - auto block = JitBlocks.find(addr); - if (block != JitBlocks.end()) + auto& slowMap = Num == 0 ? JitBlocks9 : JitBlocks7; + auto block = slowMap.find(addr); + if (block != slowMap.end()) { - FastBlockLookUp.Insert(addr, compiler->SubEntryOffset(block->second->EntryPoint)); + fastMap.Insert(addr, JITCompiler->SubEntryOffset(block->second->EntryPoint)); return block->second->EntryPoint; } return NULL; } +template JitBlockEntry LookUpBlockEntry<0>(u32); +template JitBlockEntry LookUpBlockEntry<1>(u32); + template <u32 Num> void LinkBlock(ARM* cpu, u32 codeOffset) { - u32 targetPseudoPhys = TranslateAddr<Num>(cpu->R[15] - ((cpu->CPSR&0x20)?2:4)); - auto block = JitBlocks.find(targetPseudoPhys); - if (block == JitBlocks.end()) + auto& blockMap = Num == 0 ? JitBlocks9 : JitBlocks7; + u32 instrAddr = cpu->R[15] - ((cpu->CPSR&0x20)?2:4); + u32 targetPseudoPhys = Num == 0 ? TranslateAddr9(instrAddr) : TranslateAddr7(instrAddr); + auto block = blockMap.find(targetPseudoPhys); + if (block == blockMap.end()) { CompileBlock(cpu); - block = JitBlocks.find(targetPseudoPhys); + block = blockMap.find(targetPseudoPhys); } JIT_DEBUGPRINT("linking to block %08x\n", targetPseudoPhys); block->second->AddLink(codeOffset); - compiler->LinkBlock(codeOffset, block->second->EntryPoint); + JITCompiler->LinkBlock(codeOffset, block->second->EntryPoint); +} + +template void LinkBlock<0>(ARM*, u32); +template void LinkBlock<1>(ARM*, u32); + +void WifiWrite32(u32 addr, u32 val) +{ + Wifi::Write(addr, val & 0xFFFF); + Wifi::Write(addr + 2, val >> 16); +} + +u32 WifiRead32(u32 addr) +{ + return Wifi::Read(addr) | (Wifi::Read(addr + 2) << 16); +} + +template <typename T> +void VRAMWrite(u32 addr, T val) +{ + switch (addr & 0x00E00000) + { + case 0x00000000: GPU::WriteVRAM_ABG<T>(addr, val); return; + case 0x00200000: GPU::WriteVRAM_BBG<T>(addr, val); return; + case 0x00400000: GPU::WriteVRAM_AOBJ<T>(addr, val); return; + case 0x00600000: GPU::WriteVRAM_BOBJ<T>(addr, val); return; + default: GPU::WriteVRAM_LCDC<T>(addr, val); return; + } +} +template <typename T> +T VRAMRead(u32 addr) +{ + switch (addr & 0x00E00000) + { + case 0x00000000: return GPU::ReadVRAM_ABG<T>(addr); + case 0x00200000: return GPU::ReadVRAM_BBG<T>(addr); + case 0x00400000: return GPU::ReadVRAM_AOBJ<T>(addr); + case 0x00600000: return GPU::ReadVRAM_BOBJ<T>(addr); + default: return GPU::ReadVRAM_LCDC<T>(addr); + } } void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) { if (cpu->Num == 0) { - if ((addr & 0xFF000000) == 0x04000000) + switch (addr & 0xFF000000) { + case 0x04000000: if (!store && size == 32 && addr == 0x04100010 && NDS::ExMemCnt[0] & (1<<11)) return (void*)NDSCart::ReadROMData; @@ -949,13 +1381,25 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) switch (size | store) { - case 8: return (void*)NDS::ARM9IORead8; - case 9: return (void*)NDS::ARM9IOWrite8; + case 8: return (void*)NDS::ARM9IORead8; + case 9: return (void*)NDS::ARM9IOWrite8; case 16: return (void*)NDS::ARM9IORead16; case 17: return (void*)NDS::ARM9IOWrite16; case 32: return (void*)NDS::ARM9IORead32; case 33: return (void*)NDS::ARM9IOWrite32; } + break; + case 0x06000000: + switch (size | store) + { + case 8: return (void*)VRAMRead<u8>; + case 9: return NULL; + case 16: return (void*)VRAMRead<u16>; + case 17: return (void*)VRAMWrite<u16>; + case 32: return (void*)VRAMRead<u32>; + case 33: return (void*)VRAMWrite<u32>; + } + break; } } else @@ -987,20 +1431,31 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) } break; case 0x04800000: - if (addr < 0x04810000 && size == 16) + if (addr < 0x04810000 && size >= 16) { - if (store) - return (void*)Wifi::Write; - else - return (void*)Wifi::Read; + switch (size | store) + { + case 16: return (void*)Wifi::Read; + case 17: return (void*)Wifi::Write; + case 32: return (void*)WifiRead32; + case 33: return (void*)WifiWrite32; + } } break; + case 0x06000000: + case 0x06800000: + switch (size | store) + { + case 8: return (void*)GPU::ReadVRAM_ARM7<u8>; + case 9: return (void*)GPU::WriteVRAM_ARM7<u8>; + case 16: return (void*)GPU::ReadVRAM_ARM7<u16>; + case 17: return (void*)GPU::WriteVRAM_ARM7<u16>; + case 32: return (void*)GPU::ReadVRAM_ARM7<u32>; + case 33: return (void*)GPU::WriteVRAM_ARM7<u32>; + } } } return NULL; } } - -template void ARMJIT::LinkBlock<0>(ARM*, u32); -template void ARMJIT::LinkBlock<1>(ARM*, u32); |