#include "ARMJIT.h"

#include <string.h>
#include <assert.h>
#include <unordered_map>

#define XXH_STATIC_LINKING_ONLY
#include "xxhash/xxhash.h"

#include "Config.h"

#include "ARMJIT_Internal.h"
#include "ARMJIT_Memory.h"
#include "ARMJIT_Compiler.h"

#include "ARMInterpreter_ALU.h"
#include "ARMInterpreter_LoadStore.h"
#include "ARMInterpreter_Branch.h"
#include "ARMInterpreter.h"

#include "GPU.h"
#include "GPU3D.h"
#include "SPU.h"
#include "Wifi.h"
#include "NDSCart.h"

#include "ARMJIT_x64/ARMJIT_Offsets.h"

// The generated assembly accesses these fields by hard-coded offset;
// fail the build if the struct layout ever drifts.
static_assert(offsetof(ARM, CPSR) == ARM_CPSR_offset);
static_assert(offsetof(ARM, Cycles) == ARM_Cycles_offset);
static_assert(offsetof(ARM, StopExecution) == ARM_StopExecution_offset);

namespace ARMJIT
{

#define JIT_DEBUGPRINT(msg, ...)
//#define JIT_DEBUGPRINT(msg, ...) printf(msg, ## __VA_ARGS__)

Compiler* JITCompiler;

// One AddressRange per 512-byte page of executable memory. Each range
// records, per 16-byte chunk, whether compiled code covers it (Code
// bitmask) and which JIT blocks reference the page (Blocks list).
AddressRange CodeIndexITCM[ITCMPhysicalSize / 512];
AddressRange CodeIndexMainRAM[NDS::MainRAMSize / 512];
AddressRange CodeIndexSWRAM[NDS::SharedWRAMSize / 512];
AddressRange CodeIndexVRAM[0x100000 / 512];
AddressRange CodeIndexARM9BIOS[sizeof(NDS::ARM9BIOS) / 512];
AddressRange CodeIndexARM7BIOS[sizeof(NDS::ARM7BIOS) / 512];
AddressRange CodeIndexARM7WRAM[NDS::ARM7WRAMSize / 512];
AddressRange CodeIndexARM7WVRAM[0x40000 / 512];

std::unordered_map<u32, JitBlock*> JitBlocks9;
std::unordered_map<u32, JitBlock*> JitBlocks7;

// Fast dispatch tables: one u64 entry per 2 bytes of code. High 32 bits
// hold (block start address | cpu number), low 32 bits the compiled
// entry point offset (see SubEntryOffset/AddEntryOffset).
u64 FastBlockLookupITCM[ITCMPhysicalSize / 2];
u64 FastBlockLookupMainRAM[NDS::MainRAMSize / 2];
u64 FastBlockLookupSWRAM[NDS::SharedWRAMSize / 2];
u64 FastBlockLookupVRAM[0x100000 / 2];
u64 FastBlockLookupARM9BIOS[sizeof(NDS::ARM9BIOS) / 2];
u64 FastBlockLookupARM7BIOS[sizeof(NDS::ARM7BIOS) / 2];
u64 FastBlockLookupARM7WRAM[NDS::ARM7WRAM / 2 * 0 + NDS::ARM7WRAMSize / 2];
u64 FastBlockLookupARM7WVRAM[0x40000 / 2];

// Indexed by ARMJIT_Memory::memregions_*; zero/NULL entries are regions
// which cannot contain executable code.
const u32 CodeRegionSizes[ARMJIT_Memory::memregions_Count] =
{
    0,
    ITCMPhysicalSize,
    0,
    sizeof(NDS::ARM9BIOS),
    NDS::MainRAMSize,
    NDS::SharedWRAMSize,
    0,
    0x100000,
    sizeof(NDS::ARM7BIOS),
    NDS::ARM7WRAMSize,
    0,
    0,
    0x40000,
};

AddressRange* const CodeMemRegions[ARMJIT_Memory::memregions_Count] =
{
    NULL,
    CodeIndexITCM,
    NULL,
    CodeIndexARM9BIOS,
    CodeIndexMainRAM,
    CodeIndexSWRAM,
    NULL,
    CodeIndexVRAM,
    CodeIndexARM7BIOS,
    CodeIndexARM7WRAM,
    NULL,
    NULL,
    CodeIndexARM7WVRAM,
};

u64* const FastBlockLookupRegions[ARMJIT_Memory::memregions_Count] =
{
    NULL,
    FastBlockLookupITCM,
    NULL,
    FastBlockLookupARM9BIOS,
    FastBlockLookupMainRAM,
    FastBlockLookupSWRAM,
    NULL,
    FastBlockLookupVRAM,
    FastBlockLookupARM7BIOS,
    FastBlockLookupARM7WRAM,
    NULL,
    NULL,
    FastBlockLookupARM7WVRAM
};

// Translates a CPU-visible address into a "localised" address:
// (region << 28) | offset-into-region, with mirrors collapsed.
// Returns 0 if the address is not in an executable region.
u32 LocaliseCodeAddress(u32 num, u32 addr)
{
    int region = num == 0
        ? ARMJIT_Memory::ClassifyAddress9(addr)
        : ARMJIT_Memory::ClassifyAddress7(addr);

    u32 mappingStart, mappingSize, memoryOffset, memorySize;
    if (ARMJIT_Memory::GetRegionMapping(region, num, mappingStart, mappingSize, memoryOffset, memorySize)
        && CodeMemRegions[region])
    {
        addr = ((addr - mappingStart) & (memorySize - 1)) + memoryOffset;
        addr |= (u32)region << 28;
        return addr;
    }
    return 0;
}

// Localised addresses of literal pool entries which were overwritten
// after being baked into a compiled block.
TinyVector<u32> InvalidLiterals;

// Fallback memory accessors called from generated code when the fast
// path doesn't apply. The ARM9 variants handle ITCM/DTCM inline.
template <typename T>
T SlowRead9(u32 addr, ARMv5* cpu)
{
    u32 offset = addr & 0x3;
    addr &= ~(sizeof(T) - 1);

    T val;
    if (addr < cpu->ITCMSize)
        val = *(T*)&cpu->ITCM[addr & 0x7FFF];
    else if (addr >= cpu->DTCMBase && addr < (cpu->DTCMBase + cpu->DTCMSize))
        val = *(T*)&cpu->DTCM[(addr - cpu->DTCMBase) & 0x3FFF];
    else if (std::is_same<T, u32>::value)
        val = NDS::ARM9Read32(addr);
    else if (std::is_same<T, u16>::value)
        val = NDS::ARM9Read16(addr);
    else
        val = NDS::ARM9Read8(addr);

    // unaligned 32-bit loads rotate the value, as on real hardware
    if (std::is_same<T, u32>::value)
        return ROR(val, offset << 3);
    else
        return val;
}

template <typename T>
void SlowWrite9(u32 addr, ARMv5* cpu, T val)
{
    addr &= ~(sizeof(T) - 1);

    if (addr < cpu->ITCMSize)
    {
        // writing into ITCM may overwrite compiled code
        CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr);
        *(T*)&cpu->ITCM[addr & 0x7FFF] = val;
    }
    else if (addr >= cpu->DTCMBase && addr < (cpu->DTCMBase + cpu->DTCMSize))
    {
        *(T*)&cpu->DTCM[(addr - cpu->DTCMBase) & 0x3FFF] = val;
    }
    else if (std::is_same<T, u32>::value)
    {
        NDS::ARM9Write32(addr, val);
    }
    else if (std::is_same<T, u16>::value)
    {
        NDS::ARM9Write16(addr, val);
    }
    else
    {
        NDS::ARM9Write8(addr, val);
    }
}

template void SlowWrite9<u32>(u32, ARMv5*, u32);
template void SlowWrite9<u16>(u32, ARMv5*, u16);
template void SlowWrite9<u8>(u32, ARMv5*, u8);

template u32 SlowRead9<u32>(u32, ARMv5*);
template u16 SlowRead9<u16>(u32, ARMv5*);
template u8 SlowRead9<u8>(u32, ARMv5*);

template <typename T>
T SlowRead7(u32 addr)
{
    u32 offset = addr & 0x3;
    addr &= ~(sizeof(T) - 1);

    T val;
    if (std::is_same<T, u32>::value)
        val = NDS::ARM7Read32(addr);
    else if (std::is_same<T, u16>::value)
        val = NDS::ARM7Read16(addr);
    else
        val = NDS::ARM7Read8(addr);

    if (std::is_same<T, u32>::value)
        return ROR(val, offset << 3);
    else
        return val;
}

template <typename T>
void SlowWrite7(u32 addr, T val)
{
    addr &= ~(sizeof(T) - 1);

    if (std::is_same<T, u32>::value)
        NDS::ARM7Write32(addr, val);
    else if (std::is_same<T, u16>::value)
        NDS::ARM7Write16(addr, val);
    else
        NDS::ARM7Write8(addr, val);
}

// Slow-path LDM/STM helpers: transfer `num` words starting at `addr`.
template <bool PreInc, bool Write>
void SlowBlockTransfer9(u32 addr, u64* data, u32 num, ARMv5* cpu)
{
    addr &= ~0x3;
    if (PreInc)
        addr += 4;

    for (int i = 0; i < num; i++)
    {
        if (Write)
            SlowWrite9<u32>(addr, cpu, data[i]);
        else
            data[i] = SlowRead9<u32>(addr, cpu);
        addr += 4;
    }
}

template <bool PreInc, bool Write>
void SlowBlockTransfer7(u32 addr, u64* data, u32 num)
{
    addr &= ~0x3;
    if (PreInc)
        addr += 4;

    for (int i = 0; i < num; i++)
    {
        if (Write)
            SlowWrite7<u32>(addr, data[i]);
        else
            data[i] = SlowRead7<u32>(addr);
        addr += 4;
    }
}

template void SlowWrite7<u32>(u32, u32);
template void SlowWrite7<u16>(u32, u16);
template void SlowWrite7<u8>(u32, u8);

template u32 SlowRead7<u32>(u32);
template u16 SlowRead7<u16>(u32);
template u8 SlowRead7<u8>(u32);

template void SlowBlockTransfer9<false, false>(u32, u64*, u32, ARMv5*);
template void SlowBlockTransfer9<false, true>(u32, u64*, u32, ARMv5*);
template void SlowBlockTransfer9<true, false>(u32, u64*, u32, ARMv5*);
template void SlowBlockTransfer9<true, true>(u32, u64*, u32, ARMv5*);
template void SlowBlockTransfer7<false, false>(u32 addr, u64* data, u32 num);
template void SlowBlockTransfer7<false, true>(u32 addr, u64* data, u32 num);
template void SlowBlockTransfer7<true, false>(u32 addr, u64* data, u32 num);
template void SlowBlockTransfer7<true, true>(u32 addr, u64* data, u32 num);

// Two-way set-associative cache keyed by hash; on conflict the oldest
// entry is evicted (and returned so the caller can free it). Lookups
// are best-effort — losing an entry only costs a recompilation.
template <typename K, typename V, int Size, V InvalidValue>
struct UnreliableHashTable
{
    struct Bucket
    {
        K KeyA, KeyB;
        V ValA, ValB;
    };

    Bucket Table[Size];

    void Reset()
    {
        for (int i = 0; i < Size; i++)
        {
            Table[i].ValA = Table[i].ValB = InvalidValue;
        }
    }

    UnreliableHashTable()
    {
        Reset();
    }

    // Inserts key/value; returns the evicted value (or InvalidValue).
    V Insert(K key, V value)
    {
        u32 slot = XXH3_64bits(&key, sizeof(K)) & (Size - 1);
        Bucket* bucket = &Table[slot];

        if (bucket->ValA == value || bucket->ValB == value)
        {
            // already present
            return InvalidValue;
        }
        else if (bucket->ValA == InvalidValue)
        {
            bucket->KeyA = key;
            bucket->ValA = value;
        }
        else if (bucket->ValB == InvalidValue)
        {
            bucket->KeyB = key;
            bucket->ValB = value;
        }
        else
        {
            // bucket full: evict B, demote A to B, insert as A
            V prevVal = bucket->ValB;
            bucket->KeyB = bucket->KeyA;
            bucket->ValB = bucket->ValA;
            bucket->KeyA = key;
            bucket->ValA = value;
            return prevVal;
        }

        return InvalidValue;
    }

    void Remove(K key)
    {
        u32 slot = XXH3_64bits(&key, sizeof(K)) & (Size - 1);
        Bucket* bucket = &Table[slot];

        if (bucket->KeyA == key && bucket->ValA != InvalidValue)
        {
            bucket->ValA = InvalidValue;
            if (bucket->ValB != InvalidValue)
            {
                // promote B so A stays the "newest" slot
                bucket->KeyA = bucket->KeyB;
                bucket->ValA = bucket->ValB;
                bucket->ValB = InvalidValue;
            }
        }
        if (bucket->KeyB == key && bucket->ValB != InvalidValue)
            bucket->ValB = InvalidValue;
    }

    V LookUp(K addr)
    {
        // hash sizeof(K) bytes, matching Insert/Remove (previously a
        // hard-coded 4, which only worked because K was u32)
        u32 slot = XXH3_64bits(&addr, sizeof(K)) & (Size - 1);
        Bucket* bucket = &Table[slot];

        if (bucket->ValA != InvalidValue && bucket->KeyA == addr)
            return bucket->ValA;
        if (bucket->ValB != InvalidValue && bucket->KeyB == addr)
            return bucket->ValB;

        return InvalidValue;
    }
};

// Invalidated blocks are parked here, keyed by instruction hash, so an
// unchanged block can be restored instead of recompiled.
UnreliableHashTable<u32, JitBlock*, 0x800, nullptr> RestoreCandidates;

void Init()
{
    JITCompiler = new Compiler();
    ARMJIT_Memory::Init();
}

void DeInit()
{
    ARMJIT_Memory::DeInit();
    delete JITCompiler;
}

void Reset()
{
    ResetBlockCache();
    ARMJIT_Memory::Reset();
}

// Walks backwards from instrs[start], marking which instructions must
// materialise the CPSR flags in `flags`; stops once all are accounted for.
void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags)
{
    for (int j = start; j >= 0; j--)
    {
        u8 match = instrs[j].Info.WriteFlags & flags;
        u8 matchMaybe = (instrs[j].Info.WriteFlags >> 4) & flags;
        if (matchMaybe) // writes flags maybe
            instrs[j].SetFlags |= matchMaybe;
        if (match)
        {
            instrs[j].SetFlags |= match;
            flags &= ~match;
            if (!flags)
                return;
        }
    }
}

// Computes the absolute address read by a PC-relative load.
// Returns false if the instruction kind isn't recognised.
bool DecodeLiteral(bool thumb, const FetchedInstr& instr, u32& addr)
{
    if (!thumb)
    {
        switch (instr.Info.Kind)
        {
        case ARMInstrInfo::ak_LDR_IMM:
        case ARMInstrInfo::ak_LDRB_IMM:
            addr = (instr.Addr + 8) + ((instr.Instr & 0xFFF) * (instr.Instr & (1 << 23) ? 1 : -1));
            return true;
        case ARMInstrInfo::ak_LDRH_IMM:
            addr = (instr.Addr + 8) + (((instr.Instr & 0xF00) >> 4 | (instr.Instr & 0xF)) * (instr.Instr & (1 << 23) ? 1 : -1));
            return true;
        default:
            break;
        }
    }
    else if (instr.Info.Kind == ARMInstrInfo::tk_LDR_PCREL)
    {
        addr = ((instr.Addr + 4) & ~0x2) + ((instr.Instr & 0xFF) << 2);
        return true;
    }

    JIT_DEBUGPRINT("Literal %08x %x not recognised %d\n", instr.Instr, instr.Addr, instr.Info.Kind);
    return false;
}

// Decodes a branch with a statically known target. Outputs the branch
// condition, whether it links, the link address and the target; BX LR
// with a tracked link register counts as a static "return".
bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, bool hasLink, u32 lr, bool& link,
    u32& linkAddr, u32& targetAddr)
{
    if (thumb)
    {
        u32 r15 = instr.Addr + 4;
        cond = 0xE;

        link = instr.Info.Kind == ARMInstrInfo::tk_BL_LONG;
        linkAddr = instr.Addr + 4;

        if (instr.Info.Kind == ARMInstrInfo::tk_BL_LONG && !(instr.Instr & (1 << 12)))
        {
            targetAddr = r15 + ((s32)((instr.Instr & 0x7FF) << 21) >> 9);
            targetAddr += ((instr.Instr >> 16) & 0x7FF) << 1;
            return true;
        }
        else if (instr.Info.Kind == ARMInstrInfo::tk_B)
        {
            s32 offset = (s32)((instr.Instr & 0x7FF) << 21) >> 20;
            targetAddr = r15 + offset;
            return true;
        }
        else if (instr.Info.Kind == ARMInstrInfo::tk_BCOND)
        {
            cond = (instr.Instr >> 8) & 0xF;
            s32 offset = (s32)(instr.Instr << 24) >> 23;
            targetAddr = r15 + offset;
            return true;
        }
        else if (hasLink && instr.Info.Kind == ARMInstrInfo::tk_BX && instr.A_Reg(3) == 14)
        {
            JIT_DEBUGPRINT("returning!\n");
            targetAddr = lr;
            return true;
        }
    }
    else
    {
        link = instr.Info.Kind == ARMInstrInfo::ak_BL;
        linkAddr = instr.Addr + 4;

        cond = instr.Cond();
        if (instr.Info.Kind == ARMInstrInfo::ak_BL
            || instr.Info.Kind == ARMInstrInfo::ak_B)
        {
            s32 offset = (s32)(instr.Instr << 8) >> 6;
            u32 r15 = instr.Addr + 8;
            targetAddr = r15 + offset;
            return true;
        }
        else if (hasLink && instr.Info.Kind == ARMInstrInfo::ak_BX && instr.A_Reg(0) == 14)
        {
            JIT_DEBUGPRINT("returning!\n");
            targetAddr = lr;
            return true;
        }
    }
    return false;
}

bool IsIdleLoop(FetchedInstr* instrs, int instrsCount)
{
    // see https://github.com/dolphin-emu/dolphin/blob/master/Source/Core/Core/PowerPC/PPCAnalyst.cpp#L678
    // it basically checks if one iteration of a loop depends on another
    // the rules are quite simple
    JIT_DEBUGPRINT("checking potential idle loop\n");
    u16 regsWrittenTo = 0;
    u16 regsDisallowedToWrite = 0;
    for (int i = 0; i < instrsCount; i++)
    {
        JIT_DEBUGPRINT("instr %d %x regs(%x %x) %x %x\n", i, instrs[i].Instr, instrs[i].Info.DstRegs, instrs[i].Info.SrcRegs, regsWrittenTo, regsDisallowedToWrite);
        if (instrs[i].Info.SpecialKind == ARMInstrInfo::special_WriteMem)
            return false;
        if (i < instrsCount - 1 && instrs[i].Info.Branches())
            return false;

        u16 srcRegs = instrs[i].Info.SrcRegs & ~(1 << 15);
        u16 dstRegs = instrs[i].Info.DstRegs & ~(1 << 15);

        regsDisallowedToWrite |= srcRegs & ~regsWrittenTo;

        if (dstRegs & regsDisallowedToWrite)
            return false;
        regsWrittenTo |= dstRegs;
    }
    return true;
}

typedef void (*InterpreterFunc)(ARM* cpu);

void NOP(ARM* cpu) {}

#define F(x) &ARMInterpreter::A_##x
#define F_ALU(name, s) \
    F(name##_REG_LSL_IMM##s), F(name##_REG_LSR_IMM##s), F(name##_REG_ASR_IMM##s), F(name##_REG_ROR_IMM##s), \
    F(name##_REG_LSL_REG##s), F(name##_REG_LSR_REG##s), F(name##_REG_ASR_REG##s), F(name##_REG_ROR_REG##s), F(name##_IMM##s)
#define F_MEM_WB(name) \
    F(name##_REG_LSL), F(name##_REG_LSR), F(name##_REG_ASR), F(name##_REG_ROR), F(name##_IMM), \
    F(name##_POST_REG_LSL), F(name##_POST_REG_LSR), F(name##_POST_REG_ASR), F(name##_POST_REG_ROR), F(name##_POST_IMM)
#define F_MEM_HD(name) \
    F(name##_REG), F(name##_IMM), F(name##_POST_REG), F(name##_POST_IMM)
InterpreterFunc InterpretARM[ARMInstrInfo::ak_Count] =
{
    F_ALU(AND,), F_ALU(AND,_S),
    F_ALU(EOR,), F_ALU(EOR,_S),
    F_ALU(SUB,), F_ALU(SUB,_S),
    F_ALU(RSB,), F_ALU(RSB,_S),
    F_ALU(ADD,), F_ALU(ADD,_S),
    F_ALU(ADC,), F_ALU(ADC,_S),
    F_ALU(SBC,), F_ALU(SBC,_S),
    F_ALU(RSC,), F_ALU(RSC,_S),
    F_ALU(ORR,), F_ALU(ORR,_S),
    F_ALU(MOV,), F_ALU(MOV,_S),
    F_ALU(BIC,), F_ALU(BIC,_S),
    F_ALU(MVN,), F_ALU(MVN,_S),
    F_ALU(TST,),
    F_ALU(TEQ,),
    F_ALU(CMP,),
    F_ALU(CMN,),
    F(MUL), F(MLA), F(UMULL), F(UMLAL), F(SMULL), F(SMLAL),
    F(SMLAxy), F(SMLAWy), F(SMULWy), F(SMLALxy), F(SMULxy),
    F(CLZ),
    F(QADD), F(QDADD), F(QSUB), F(QDSUB),
    F_MEM_WB(STR), F_MEM_WB(STRB), F_MEM_WB(LDR), F_MEM_WB(LDRB),
    F_MEM_HD(STRH), F_MEM_HD(LDRD), F_MEM_HD(STRD), F_MEM_HD(LDRH), F_MEM_HD(LDRSB), F_MEM_HD(LDRSH),
    F(SWP), F(SWPB),
    F(LDM), F(STM),
    F(B), F(BL), F(BLX_IMM), F(BX), F(BLX_REG),
    F(UNK), F(MSR_IMM), F(MSR_REG), F(MRS), F(MCR), F(MRC), F(SVC),
    NOP
};
#undef F_ALU
#undef F_MEM_WB
#undef F_MEM_HD
#undef F

// Executes a fused Thumb BL as both halves back to back.
void T_BL_LONG(ARM* cpu)
{
    ARMInterpreter::T_BL_LONG_1(cpu);
    cpu->R[15] += 2;
    ARMInterpreter::T_BL_LONG_2(cpu);
}

#define F(x) ARMInterpreter::T_##x
InterpreterFunc InterpretTHUMB[ARMInstrInfo::tk_Count] =
{
    F(LSL_IMM), F(LSR_IMM), F(ASR_IMM),
    F(ADD_REG_), F(SUB_REG_), F(ADD_IMM_), F(SUB_IMM_),
    F(MOV_IMM), F(CMP_IMM), F(ADD_IMM), F(SUB_IMM),
    F(AND_REG), F(EOR_REG), F(LSL_REG), F(LSR_REG),
    F(ASR_REG), F(ADC_REG), F(SBC_REG), F(ROR_REG),
    F(TST_REG), F(NEG_REG), F(CMP_REG), F(CMN_REG),
    F(ORR_REG), F(MUL_REG), F(BIC_REG), F(MVN_REG),
    F(ADD_HIREG), F(CMP_HIREG), F(MOV_HIREG),
    F(ADD_PCREL), F(ADD_SPREL), F(ADD_SP),
    F(LDR_PCREL),
    F(STR_REG), F(STRB_REG), F(LDR_REG), F(LDRB_REG),
    F(STRH_REG), F(LDRSB_REG), F(LDRH_REG), F(LDRSH_REG),
    F(STR_IMM), F(LDR_IMM), F(STRB_IMM), F(LDRB_IMM),
    F(STRH_IMM), F(LDRH_IMM), F(STR_SPREL), F(LDR_SPREL),
    F(PUSH), F(POP), F(LDMIA), F(STMIA),
    F(BCOND), F(BX), F(BLX_REG), F(B), F(BL_LONG_1), F(BL_LONG_2),
    F(UNK), F(SVC),
    T_BL_LONG // BL_LONG pseudo opcode
};
#undef F

// Compiles (or restores) the JIT block starting at the current PC.
// The block is built by interpreting instructions while recording them,
// following unconditional static branches where profitable, then handed
// to the backend compiler. All touched code pages are registered for
// invalidate-on-write.
void CompileBlock(ARM* cpu)
{
    bool thumb = cpu->CPSR & 0x20;

    // clamp the user-configurable block size to the supported range
    if (Config::JIT_MaxBlockSize < 1)
        Config::JIT_MaxBlockSize = 1;
    if (Config::JIT_MaxBlockSize > 32)
        Config::JIT_MaxBlockSize = 32;

    u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4);

    auto& map = cpu->Num == 0 ? JitBlocks9 : JitBlocks7;
    auto existingBlockIt = map.find(blockAddr);
    if (existingBlockIt != map.end())
    {
        // there's already a block, though it's not inside the fast map
        // could be that there are two blocks at the same physical addr
        // but different mirrors
        u32 localAddr = existingBlockIt->second->StartAddrLocal;

        // fast map entries are one u64 per 2 bytes of code, so the byte
        // offset must be halved (previously the /2 was missing here,
        // indexing past the end of the lookup table)
        u64* entry = &FastBlockLookupRegions[localAddr >> 28][(localAddr & 0xFFFFFFF) / 2];
        *entry = ((u64)blockAddr | cpu->Num) << 32;
        *entry |= JITCompiler->SubEntryOffset(existingBlockIt->second->EntryPoint);
        return;
    }

    u32 localAddr = LocaliseCodeAddress(cpu->Num, blockAddr);
    if (!localAddr)
    {
        printf("trying to compile non executable code? %x\n", blockAddr);
    }

    FetchedInstr instrs[Config::JIT_MaxBlockSize];
    int i = 0;
    u32 r15 = cpu->R[15];

    u32 addressRanges[Config::JIT_MaxBlockSize];
    u32 addressMasks[Config::JIT_MaxBlockSize];
    // VLAs can't take an initialiser, so zero the masks explicitly
    memset(addressMasks, 0, Config::JIT_MaxBlockSize * sizeof(u32));
    u32 numAddressRanges = 0;

    u32 numLiterals = 0;
    u32 literalLoadAddrs[Config::JIT_MaxBlockSize];
    // they are going to be hashed
    u32 literalValues[Config::JIT_MaxBlockSize];
    u32 instrValues[Config::JIT_MaxBlockSize];

    cpu->FillPipeline();
    u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]};
    u32 nextInstrAddr[2] = {blockAddr, r15};

    JIT_DEBUGPRINT("start block %x %08x (%x)\n", blockAddr, cpu->CPSR, localAddr);

    u32 lastSegmentStart = blockAddr;
    u32 lr;
    bool hasLink = false;

    do
    {
        r15 += thumb ? 2 : 4;

        instrs[i].BranchFlags = 0;
        instrs[i].SetFlags = 0;
        instrs[i].Instr = nextInstr[0];
        nextInstr[0] = nextInstr[1];

        instrs[i].Addr = nextInstrAddr[0];
        nextInstrAddr[0] = nextInstrAddr[1];
        nextInstrAddr[1] = r15;
        JIT_DEBUGPRINT("instr %08x %x\n", instrs[i].Instr & (thumb ? 0xFFFF : ~0), instrs[i].Addr);

        instrValues[i] = instrs[i].Instr;

        // register the 512-byte page (and 16-byte chunk) this
        // instruction lives in, keeping the current page last
        u32 translatedAddr = LocaliseCodeAddress(cpu->Num, instrs[i].Addr);
        assert(translatedAddr);
        u32 translatedAddrRounded = translatedAddr & ~0x1FF;
        if (i == 0 || translatedAddrRounded != addressRanges[numAddressRanges - 1])
        {
            bool returning = false;
            for (int j = 0; j < numAddressRanges; j++)
            {
                if (addressRanges[j] == translatedAddrRounded)
                {
                    std::swap(addressRanges[j], addressRanges[numAddressRanges - 1]);
                    std::swap(addressMasks[j], addressMasks[numAddressRanges - 1]);
                    returning = true;
                    break;
                }
            }
            if (!returning)
                addressRanges[numAddressRanges++] = translatedAddrRounded;
        }
        addressMasks[numAddressRanges - 1] |= 1 << ((translatedAddr & 0x1FF) / 16);

        // fetch the next instruction into the recorded pipeline
        if (cpu->Num == 0)
        {
            ARMv5* cpuv5 = (ARMv5*)cpu;
            if (thumb && r15 & 0x2)
            {
                // second Thumb halfword of an already fetched word
                nextInstr[1] >>= 16;
                instrs[i].CodeCycles = 0;
            }
            else
            {
                nextInstr[1] = cpuv5->CodeRead32(r15, false);
                instrs[i].CodeCycles = cpu->CodeCycles;
            }
        }
        else
        {
            ARMv4* cpuv4 = (ARMv4*)cpu;
            if (thumb)
                nextInstr[1] = cpuv4->CodeRead16(r15);
            else
                nextInstr[1] = cpuv4->CodeRead32(r15);
            instrs[i].CodeCycles = cpu->CodeCycles;
        }
        instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr);

        cpu->R[15] = r15;
        cpu->CurInstr = instrs[i].Instr;
        cpu->CodeCycles = instrs[i].CodeCycles;

        // LR overwritten: we can no longer treat BX LR as a static return
        if (instrs[i].Info.DstRegs & (1 << 14))
            hasLink = false;

        // interpret the instruction so block discovery tracks real state
        if (thumb)
        {
            InterpretTHUMB[instrs[i].Info.Kind](cpu);
        }
        else
        {
            if (cpu->Num == 0 && instrs[i].Info.Kind == ARMInstrInfo::ak_BLX_IMM)
            {
                ARMInterpreter::A_BLX_IMM(cpu);
            }
            else
            {
                u32 icode = ((instrs[i].Instr >> 4) & 0xF) | ((instrs[i].Instr >> 16) & 0xFF0);
                assert(InterpretARM[instrs[i].Info.Kind] == ARMInterpreter::ARMInstrTable[icode]
                    || instrs[i].Info.Kind == ARMInstrInfo::ak_MOV_REG_LSL_IMM
                    || instrs[i].Info.Kind == ARMInstrInfo::ak_Nop
                    || instrs[i].Info.Kind == ARMInstrInfo::ak_UNK);
                if (cpu->CheckCondition(instrs[i].Cond()))
                    InterpretARM[instrs[i].Info.Kind](cpu);
                else
                    cpu->AddCycles_C();
            }
        }

        instrs[i].DataCycles = cpu->DataCycles;
        instrs[i].DataRegion = cpu->DataRegion;

        // bake PC-relative literal loads into the block, remembering the
        // value so invalidation can detect literal changes
        u32 literalAddr;
        if (Config::JIT_LiteralOptimisations
            && instrs[i].Info.SpecialKind == ARMInstrInfo::special_LoadLiteral
            && DecodeLiteral(thumb, instrs[i], literalAddr))
        {
            u32 translatedAddr = LocaliseCodeAddress(cpu->Num, literalAddr);
            if (!translatedAddr)
            {
                printf("literal in non executable memory?\n");
            }
            u32 translatedAddrRounded = translatedAddr & ~0x1FF;

            u32 j = 0;
            for (; j < numAddressRanges; j++)
                if (addressRanges[j] == translatedAddrRounded)
                    break;
            if (j == numAddressRanges)
                addressRanges[numAddressRanges++] = translatedAddrRounded;
            addressMasks[j] |= 1 << ((translatedAddr & 0x1FF) / 16);
            JIT_DEBUGPRINT("literal loading %08x %08x %08x %08x\n", literalAddr, translatedAddr, addressMasks[j], addressRanges[j]);
            cpu->DataRead32(literalAddr, &literalValues[numLiterals]);
            literalLoadAddrs[numLiterals++] = translatedAddr;
        }

        // fuse the two halves of a Thumb BL into one pseudo instruction
        if (thumb && instrs[i].Info.Kind == ARMInstrInfo::tk_BL_LONG_2 && i > 0
            && instrs[i - 1].Info.Kind == ARMInstrInfo::tk_BL_LONG_1)
        {
            instrs[i - 1].Info.Kind = ARMInstrInfo::tk_BL_LONG;
            instrs[i - 1].Instr = (instrs[i - 1].Instr & 0xFFFF) | (instrs[i].Instr << 16);
            instrs[i - 1].Info.DstRegs = 0xC000;
            instrs[i - 1].Info.SrcRegs = 0;
            instrs[i - 1].Info.EndBlock = true;
            i--;
        }

        if (instrs[i].Info.Branches() && Config::JIT_BrancheOptimisations)
        {
            bool hasBranched = cpu->R[15] != r15;

            bool link;
            u32 cond, target, linkAddr;
            bool staticBranch = DecodeBranch(thumb, instrs[i], cond, hasLink, lr, link, linkAddr, target);
            JIT_DEBUGPRINT("branch cond %x target %x (%d)\n", cond, target, hasBranched);

            if (staticBranch)
            {
                instrs[i].BranchFlags |= branch_StaticTarget;

                bool isBackJump = false;
                if (hasBranched)
                {
                    // does the branch target an instruction already in
                    // this block? (previously compared instrs[i].Addr,
                    // never using the loop variable)
                    for (int j = 0; j < i; j++)
                    {
                        if (instrs[j].Addr == target)
                        {
                            isBackJump = true;
                            break;
                        }
                    }
                }

                if (cond < 0xE && target < instrs[i].Addr && target >= lastSegmentStart)
                {
                    // we might have an idle loop
                    u32 backwardsOffset = (instrs[i].Addr - target) / (thumb ? 2 : 4);
                    if (IsIdleLoop(&instrs[i - backwardsOffset], backwardsOffset + 1))
                    {
                        instrs[i].BranchFlags |= branch_IdleBranch;
                        JIT_DEBUGPRINT("found %s idle loop %d in block %x\n", thumb ? "thumb" : "arm", cpu->Num, blockAddr);
                    }
                }
                else if (hasBranched && !isBackJump && i + 1 < Config::JIT_MaxBlockSize)
                {
                    // follow the static branch and keep compiling there
                    if (link)
                    {
                        lr = linkAddr;
                        hasLink = true;
                    }

                    r15 = target + (thumb ? 2 : 4);
                    assert(r15 == cpu->R[15]);

                    JIT_DEBUGPRINT("block lengthened by static branch (target %x)\n", target);

                    nextInstr[0] = cpu->NextInstr[0];
                    nextInstr[1] = cpu->NextInstr[1];

                    nextInstrAddr[0] = target;
                    nextInstrAddr[1] = r15;

                    lastSegmentStart = target;

                    instrs[i].Info.EndBlock = false;

                    if (cond < 0xE)
                        instrs[i].BranchFlags |= branch_FollowCondTaken;
                }
            }

            if (!hasBranched && cond < 0xE && i + 1 < Config::JIT_MaxBlockSize)
            {
                // untaken conditional branch: fall through within the block
                instrs[i].Info.EndBlock = false;
                instrs[i].BranchFlags |= branch_FollowCondNotTaken;
            }
        }

        i++;

        bool canCompile = JITCompiler->CanCompile(thumb, instrs[i - 1].Info.Kind);
        bool secondaryFlagReadCond = !canCompile || (instrs[i - 1].BranchFlags & (branch_FollowCondTaken | branch_FollowCondNotTaken));
        if (instrs[i - 1].Info.ReadFlags != 0 || secondaryFlagReadCond)
            FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? instrs[i - 1].Info.ReadFlags : 0xF);
    } while (!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted && (!cpu->IRQ || (cpu->CPSR & 0x80)));

    u32 literalHash = (u32)XXH3_64bits(literalValues, numLiterals * 4);
    u32 instrHash = (u32)XXH3_64bits(instrValues, i * 4);

    // try to restore a previously invalidated block with identical
    // contents instead of recompiling
    JitBlock* prevBlock = RestoreCandidates.LookUp(instrHash);
    bool mayRestore = true;
    if (prevBlock)
    {
        RestoreCandidates.Remove(instrHash);

        mayRestore = prevBlock->StartAddr == blockAddr && prevBlock->LiteralHash == literalHash;

        if (mayRestore && prevBlock->NumAddresses == numAddressRanges)
        {
            for (int j = 0; j < numAddressRanges; j++)
            {
                if (prevBlock->AddressRanges()[j] != addressRanges[j]
                    || prevBlock->AddressMasks()[j] != addressMasks[j])
                {
                    mayRestore = false;
                    break;
                }
            }
        }
        else
            mayRestore = false;
    }
    else
    {
        mayRestore = false;
        prevBlock = NULL;
    }

    JitBlock* block;
    if (!mayRestore)
    {
        if (prevBlock)
            delete prevBlock;

        block = new JitBlock(cpu->Num, i, numAddressRanges, numLiterals);
        block->LiteralHash = literalHash;
        block->InstrHash = instrHash;
        for (int j = 0; j < numAddressRanges; j++)
            block->AddressRanges()[j] = addressRanges[j];
        for (int j = 0; j < numAddressRanges; j++)
            block->AddressMasks()[j] = addressMasks[j];
        for (int j = 0; j < numLiterals; j++)
            block->Literals()[j] = literalLoadAddrs[j];
        block->StartAddr = blockAddr;
        block->StartAddrLocal = localAddr;

        FloodFillSetFlags(instrs, i - 1, 0xF);

        block->EntryPoint = JITCompiler->CompileBlock(cpu, thumb, instrs, i);
    }
    else
    {
        JIT_DEBUGPRINT("restored! %p\n", prevBlock);
        block = prevBlock;
    }

    // register the block with every code page it spans, enabling
    // write protection where needed
    for (int j = 0; j < numAddressRanges; j++)
    {
        assert(addressRanges[j] == block->AddressRanges()[j]);
        assert(addressMasks[j] == block->AddressMasks()[j]);
        assert(addressMasks[j] != 0);

        AddressRange* region = CodeMemRegions[addressRanges[j] >> 28];

        if (!PageContainsCode(&region[(addressRanges[j] & 0xFFFF000) / 512]))
            ARMJIT_Memory::SetCodeProtection(addressRanges[j] >> 28, addressRanges[j] & 0xFFFFFFF, true);

        AddressRange* range = &region[(addressRanges[j] & 0xFFFFFFF) / 512];
        range->Code |= addressMasks[j];
        range->Blocks.Add(block);
    }

    if (cpu->Num == 0)
        JitBlocks9[blockAddr] = block;
    else
        JitBlocks7[blockAddr] = block;

    u64* entry = &FastBlockLookupRegions[localAddr >> 28][(localAddr & 0xFFFFFFF) / 2];
    *entry = ((u64)blockAddr | cpu->Num) << 32;
    *entry |= JITCompiler->SubEntryOffset(block->EntryPoint);
}

// Invalidates all JIT blocks overlapping the 16-byte chunk at localAddr
// (a localised address). Blocks with identical contents are parked in
// RestoreCandidates for cheap restoration.
void InvalidateByAddr(u32 localAddr)
{
    JIT_DEBUGPRINT("invalidating by addr %x\n", localAddr);

    AddressRange* region = CodeMemRegions[localAddr >> 28];
    AddressRange* range = &region[(localAddr & 0xFFFFFFF) / 512];
    u32 mask = 1 << ((localAddr & 0x1FF) / 16);

    range->Code = 0;
    for (int i = 0; i < range->Blocks.Length;)
    {
        JitBlock* block = range->Blocks[i];

        bool invalidated = false;
        u32 blockMask = 0;
        for (int j = 0; j < block->NumAddresses; j++)
        {
            if (block->AddressRanges()[j] == (localAddr & ~0x1FF))
            {
                blockMask = block->AddressMasks()[j];
                // only invalidate if the block actually has code in the
                // written chunk (the old code shadowed `mask` here and
                // compared the block mask against itself, invalidating
                // every block in the page)
                invalidated = blockMask & mask;
                break;
            }
        }
        assert(blockMask);
        if (!invalidated)
        {
            // block untouched: re-register its chunks and keep it
            range->Code |= blockMask;
            i++;
            continue;
        }
        range->Blocks.Remove(i);

        if (range->Blocks.Length == 0
            && !PageContainsCode(&region[(localAddr & 0xFFFF000) / 512]))
        {
            ARMJIT_Memory::SetCodeProtection(localAddr >> 28, localAddr & 0xFFFFFFF, false);
        }

        // a literal baked into the block was overwritten: record it so
        // the compiler stops inlining loads from this address
        bool literalInvalidation = false;
        for (int j = 0; j < block->NumLiterals; j++)
        {
            u32 addr = block->Literals()[j];
            if (addr == localAddr)
            {
                // add only if not already recorded (the old condition
                // was inverted, adding only on duplicates)
                if (InvalidLiterals.Find(localAddr) == -1)
                {
                    InvalidLiterals.Add(localAddr);
                    JIT_DEBUGPRINT("found invalid literal %d\n", InvalidLiterals.Length);
                }
                literalInvalidation = true;
                break;
            }
        }

        // unregister the block from all other pages it spans
        for (int j = 0; j < block->NumAddresses; j++)
        {
            u32 addr = block->AddressRanges()[j];
            if ((addr / 512) != (localAddr / 512))
            {
                AddressRange* otherRegion = CodeMemRegions[addr >> 28];
                AddressRange* otherRange = &otherRegion[(addr & 0xFFFFFFF) / 512];
                assert(otherRange != range);

                bool removed = otherRange->Blocks.RemoveByValue(block);
                assert(removed);

                if (otherRange->Blocks.Length == 0)
                {
                    if (!PageContainsCode(&otherRegion[(addr & 0xFFFF000) / 512]))
                        ARMJIT_Memory::SetCodeProtection(addr >> 28, addr & 0xFFFFFFF, false);
                    otherRange->Code = 0;
                }
            }
        }

        FastBlockLookupRegions[block->StartAddrLocal >> 28][(block->StartAddrLocal & 0xFFFFFFF) / 2]
            = (u64)UINT32_MAX << 32;
        if (block->Num == 0)
            JitBlocks9.erase(block->StartAddr);
        else
            JitBlocks7.erase(block->StartAddr);

        if (!literalInvalidation)
        {
            // keep the block around for possible restoration
            JitBlock* prevBlock = RestoreCandidates.Insert(block->InstrHash, block);
            if (prevBlock)
                delete prevBlock;
        }
        else
        {
            delete block;
        }
    }
}

// Cheap write hook: invalidates only if compiled code covers the
// written 16-byte chunk.
template <u32 num, int region>
void CheckAndInvalidate(u32 addr)
{
    // let's hope this gets all properly inlined
    u32 mappingStart, mappingSize, memoryOffset, memorySize;
    if (ARMJIT_Memory::GetRegionMapping(region, num, mappingStart, mappingSize, memoryOffset, memorySize))
    {
        u32 localAddr = ((addr - mappingStart) & (memorySize - 1)) + memoryOffset;
        if (CodeMemRegions[region][localAddr / 512].Code & (1 << ((localAddr & 0x1FF) / 16)))
            InvalidateByAddr(localAddr | (region << 28));
    }
}

// Fast dispatch: returns the compiled entry point for addr, or NULL on miss.
JitBlockEntry LookUpBlock(u32 num, u64* entries, u32 offset, u32 addr)
{
    u64* entry = &entries[offset / 2];
    if (*entry >> 32 == (addr | num))
        return JITCompiler->AddEntryOffset((u32)*entry);
    return NULL;
}

// Sets up the fast-lookup window for the region containing blockAddr.
bool SetupExecutableRegion(u32 num, u32 blockAddr, u64*& entry, u32& start, u32& size)
{
    int region = num == 0
        ? ARMJIT_Memory::ClassifyAddress9(blockAddr)
        : ARMJIT_Memory::ClassifyAddress7(blockAddr);

    u32 mappingStart, mappingSize, memoryOffset, memorySize;
    if (CodeMemRegions[region]
        && ARMJIT_Memory::GetRegionMapping(region, num, mappingStart, mappingSize,
            memoryOffset, memorySize))
    {
        entry = FastBlockLookupRegions[region] + memoryOffset / 2;
        // evil, though it should work for everything except DTCM which is not relevant here
        start = blockAddr & ~(memorySize - 1);
        size = memorySize;
        return true;
    }
    else
        return false;
}

template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(u32);
template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(u32);
template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(u32);
template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(u32);
template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(u32);
template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_VWRAM>(u32);
template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_VRAM>(u32);
template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(u32);

// Drops every compiled block, all restore candidates and the fast
// lookup tables, and resets the backend compiler.
void ResetBlockCache()
{
    printf("Resetting JIT block cache...\n");

    InvalidLiterals.Clear();
    for (int i = 0; i < ARMJIT_Memory::memregions_Count; i++)
    {
        // regions without code have no lookup table (NULL, size 0)
        if (FastBlockLookupRegions[i])
            memset(FastBlockLookupRegions[i], 0xFF, CodeRegionSizes[i] * sizeof(u64) / 2);
    }

    // free the parked candidate blocks *before* resetting the table
    // (the old code reset first, nulling the pointers and leaking every
    // block; it also tested ValA when deleting ValB)
    for (int i = 0; i < sizeof(RestoreCandidates.Table)/sizeof(RestoreCandidates.Table[0]); i++)
    {
        if (RestoreCandidates.Table[i].ValA)
        {
            delete RestoreCandidates.Table[i].ValA;
            RestoreCandidates.Table[i].ValA = NULL;
        }
        if (RestoreCandidates.Table[i].ValB)
        {
            delete RestoreCandidates.Table[i].ValB;
            RestoreCandidates.Table[i].ValB = NULL;
        }
    }
    RestoreCandidates.Reset();

    for (const auto& it : JitBlocks9)
    {
        JitBlock* block = it.second;
        for (int j = 0; j < block->NumAddresses; j++)
        {
            u32 addr = block->AddressRanges()[j];
            AddressRange* range = &CodeMemRegions[addr >> 28][(addr & 0xFFFFFFF) / 512];
            range->Blocks.Clear();
            range->Code = 0;
        }
        delete block;
    }
    for (const auto& it : JitBlocks7)
    {
        JitBlock* block = it.second;
        for (int j = 0; j < block->NumAddresses; j++)
        {
            u32 addr = block->AddressRanges()[j];
            AddressRange* range = &CodeMemRegions[addr >> 28][(addr & 0xFFFFFFF) / 512];
            range->Blocks.Clear();
            range->Code = 0;
        }
        // previously missing: the ARM7 blocks were erased but never freed
        delete block;
    }
    JitBlocks9.clear();
    JitBlocks7.clear();

    JITCompiler->Reset();
}

}