author    | RSDuck <rsduck@users.noreply.github.com> | 2020-06-14 21:04:25 +0200
committer | RSDuck <rsduck@users.noreply.github.com> | 2020-06-16 12:11:19 +0200
commit    | e335a8ca7615c702cfa2dcdb71deb69468088fd8 (patch)
tree      | c09dcec016d87e7d82a6aec377f8eb3fa9949026
parent    | fea9f95bba7475b2cd3b624a3ccd6cdee00a33f1 (diff)
first steps in bringing over the JIT refactor/fastmem
-rw-r--r-- | src/ARM.cpp | 43
-rw-r--r-- | src/ARM.h | 15
-rw-r--r-- | src/ARMJIT.cpp | 771
-rw-r--r-- | src/ARMJIT.h | 64
-rw-r--r-- | src/ARMJIT_A64/ARMJIT_ALU.cpp | 123
-rw-r--r-- | src/ARMJIT_A64/ARMJIT_Branch.cpp | 99
-rw-r--r-- | src/ARMJIT_A64/ARMJIT_Compiler.cpp | 383
-rw-r--r-- | src/ARMJIT_A64/ARMJIT_Compiler.h | 71
-rw-r--r-- | src/ARMJIT_A64/ARMJIT_Linkage.s | 68
-rw-r--r-- | src/ARMJIT_A64/ARMJIT_LoadStore.cpp | 790
-rw-r--r-- | src/ARMJIT_Compiler.h | 12
-rw-r--r-- | src/ARMJIT_Internal.h | 70
-rw-r--r-- | src/ARMJIT_Memory.cpp | 822
-rw-r--r-- | src/ARMJIT_Memory.h | 53
-rw-r--r-- | src/ARMJIT_x64/ARMJIT_Compiler.cpp | 92
-rw-r--r-- | src/ARMJIT_x64/ARMJIT_Compiler.h | 11
-rw-r--r-- | src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 45
-rw-r--r-- | src/ARM_InstrInfo.cpp | 73
-rw-r--r-- | src/ARM_InstrInfo.h | 1
-rw-r--r-- | src/CMakeLists.txt | 6
-rw-r--r-- | src/CP15.cpp | 84
-rw-r--r-- | src/Config.cpp | 6
-rw-r--r-- | src/Config.h | 1
-rw-r--r-- | src/NDS.cpp | 220
-rw-r--r-- | src/NDS.h | 17
25 files changed, 2342 insertions, 1598 deletions
diff --git a/src/ARM.cpp b/src/ARM.cpp index 92a3a9e..e529be8 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -21,6 +21,8 @@ #include "DSi.h" #include "ARM.h" #include "ARMInterpreter.h" +#include "ARMJIT.h" +#include "Config.h" #include "AREngine.h" #include "ARMJIT.h" #include "Config.h" @@ -74,7 +76,9 @@ ARM::~ARM() ARMv5::ARMv5() : ARM(0) { - // +#ifndef JIT_ENABLED + DTCM = new u8[DTCMSize]; +#endif } ARMv4::ARMv4() : ARM(1) @@ -82,6 +86,13 @@ ARMv4::ARMv4() : ARM(1) // } +ARMv5::~ARMv5() +{ +#ifndef JIT_ENABLED + delete[] DTCM; +#endif +} + void ARM::Reset() { Cycles = 0; @@ -622,24 +633,26 @@ void ARMv5::ExecuteJIT() while (NDS::ARM9Timestamp < NDS::ARM9Target) { u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); - u32 translatedAddr = ARMJIT::TranslateAddr9(instrAddr); - if (!translatedAddr) + + // hack so Cycles <= 0 becomes Cycles < 0 + Cycles = NDS::ARM9Target - NDS::ARM9Timestamp - 1; + + if ((instrAddr < FastBlockLookupStart || instrAddr >= (FastBlockLookupStart + FastBlockLookupSize)) + && !ARMJIT::SetupExecutableRegion(0, instrAddr, FastBlockLookup, FastBlockLookupStart, FastBlockLookupSize)) { NDS::ARM9Timestamp = NDS::ARM9Target; printf("ARMv5 PC in non executable region %08X\n", R[15]); return; } - // hack so Cycles <= 0 becomes Cycles < 0 - Cycles = NDS::ARM9Target - NDS::ARM9Timestamp - 1; - - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry<0>(translatedAddr); + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock(0, FastBlockLookup, + instrAddr - FastBlockLookupStart, instrAddr); if (block) ARM_Dispatch(this, block); else ARMJIT::CompileBlock(this); - NDS::ARM9Timestamp = NDS::ARM9Target - (Cycles + 1); + NDS::ARM9Timestamp = NDS::ARM9Target - Cycles - 1; if (StopExecution) { @@ -766,23 +779,25 @@ void ARMv4::ExecuteJIT() while (NDS::ARM7Timestamp < NDS::ARM7Target) { u32 instrAddr = R[15] - ((CPSR&0x20)?2:4); - u32 translatedAddr = ARMJIT::TranslateAddr7(instrAddr); - if (!translatedAddr) + + Cycles = NDS::ARM7Target - NDS::ARM7Timestamp - 1; + + if ((instrAddr < FastBlockLookupStart || instrAddr >= (FastBlockLookupStart + FastBlockLookupSize)) + && !ARMJIT::SetupExecutableRegion(1, instrAddr, FastBlockLookup, FastBlockLookupStart, FastBlockLookupSize)) { NDS::ARM7Timestamp = NDS::ARM7Target; printf("ARMv4 PC in non executable region %08X\n", R[15]); return; } - Cycles = NDS::ARM7Target - NDS::ARM7Timestamp - 1; - - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlockEntry<1>(translatedAddr); + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock(1, FastBlockLookup, + instrAddr - FastBlockLookupStart, instrAddr); if (block) ARM_Dispatch(this, block); else ARMJIT::CompileBlock(this); - NDS::ARM7Timestamp = NDS::ARM7Target - (Cycles + 1); + NDS::ARM7Timestamp = NDS::ARM7Target - Cycles - 1; // TODO optimize this shit!!! 
if (StopExecution) @@ -32,11 +32,14 @@ enum RWFlags_ForceUser = (1<<21), }; +const u32 ITCMPhysicalSize = 0x8000; +const u32 DTCMPhysicalSize = 0x4000; + class ARM { public: ARM(u32 num); - ~ARM(); // destroy shit + virtual ~ARM(); // destroy shit virtual void Reset(); @@ -143,6 +146,11 @@ public: NDS::MemRegion CodeMem; +#ifdef JIT_ENABLED + u32 FastBlockLookupStart = 0, FastBlockLookupSize = 0; + u64* FastBlockLookup; +#endif + static u32 ConditionTable[16]; protected: @@ -158,6 +166,7 @@ class ARMv5 : public ARM { public: ARMv5(); + ~ARMv5(); void Reset(); @@ -260,8 +269,8 @@ public: u32 DTCMBase, DTCMSize; s32 RegionCodeCycles; - u8 ITCM[0x8000]; - u8 DTCM[0x4000]; + u8 ITCM[ITCMPhysicalSize]; + u8* DTCM; u8 ICache[0x2000]; u32 ICacheTags[64*4]; diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 8d87c76..53b28c1 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -10,13 +10,8 @@ #include "Config.h" #include "ARMJIT_Internal.h" -#if defined(__x86_64__) -#include "ARMJIT_x64/ARMJIT_Compiler.h" -#elif defined(__aarch64__) -#include "ARMJIT_A64/ARMJIT_Compiler.h" -#else -#error "The current target platform doesn't have a JIT backend" -#endif +#include "ARMJIT_Memory.h" +#include "ARMJIT_Compiler.h" #include "ARMInterpreter_ALU.h" #include "ARMInterpreter_LoadStore.h" @@ -29,6 +24,11 @@ #include "Wifi.h" #include "NDSCart.h" +#include "ARMJIT_x64/ARMJIT_Offsets.h" +static_assert(offsetof(ARM, CPSR) == ARM_CPSR_offset); +static_assert(offsetof(ARM, Cycles) == ARM_Cycles_offset); +static_assert(offsetof(ARM, StopExecution) == ARM_StopExecution_offset); + namespace ARMJIT { @@ -37,281 +37,100 @@ namespace ARMJIT Compiler* JITCompiler; -const u32 ExeMemRegionSizes[] = -{ - 0x8000, // Unmapped Region (dummy) - 0x8000, // ITCM - 4*1024*1024, // Main RAM - 0x8000, // SWRAM - 0xA4000, // LCDC - 0x8000, // ARM9 BIOS - 0x4000, // ARM7 BIOS - 0x10000, // ARM7 WRAM - 0x40000 // ARM7 WVRAM -}; - -const u32 ExeMemRegionOffsets[] = -{ - 0, - 0x8000, - 0x10000, - 0x410000, - 0x418000, - 0x4BC000, - 0x4C4000, - 0x4C8000, - 0x4D8000, - 0x518000, -}; - -/* - translates address to pseudo physical address - - more compact, eliminates mirroring, everything comes in a row - - we only need one translation table -*/ - -u32 TranslateAddr9(u32 addr) -{ - switch (ClassifyAddress9(addr)) - { - case memregion_MainRAM: return ExeMemRegionOffsets[exeMem_MainRAM] + (addr & (MAIN_RAM_SIZE - 1)); - case memregion_SWRAM9: - if (NDS::SWRAM_ARM9) - return ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM9 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM9Mask); - else - return 0; - case memregion_ITCM: return ExeMemRegionOffsets[exeMem_ITCM] + (addr & 0x7FFF); - case memregion_VRAM: return (addr >= 0x6800000 && addr < 0x68A4000) ? 
ExeMemRegionOffsets[exeMem_LCDC] + (addr - 0x6800000) : 0; - case memregion_BIOS9: return ExeMemRegionOffsets[exeMem_ARM9_BIOS] + (addr & 0xFFF); - default: return 0; - } -} - -u32 TranslateAddr7(u32 addr) -{ - switch (ClassifyAddress7(addr)) - { - case memregion_MainRAM: return ExeMemRegionOffsets[exeMem_MainRAM] + (addr & (MAIN_RAM_SIZE - 1)); - case memregion_SWRAM7: - if (NDS::SWRAM_ARM7) - return ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM7 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM7Mask); - else - return 0; - case memregion_BIOS7: return ExeMemRegionOffsets[exeMem_ARM7_BIOS] + addr; - case memregion_WRAM7: return ExeMemRegionOffsets[exeMem_ARM7_WRAM] + (addr & 0xFFFF); - case memregion_VWRAM: return ExeMemRegionOffsets[exeMem_ARM7_WVRAM] + (addr & 0x1FFFF); - default: return 0; - } -} - -AddressRange CodeRanges[ExeMemSpaceSize / 512]; - -TinyVector<u32> InvalidLiterals; +AddressRange CodeIndexITCM[ITCMPhysicalSize / 512]; +AddressRange CodeIndexMainRAM[NDS::MainRAMSize / 512]; +AddressRange CodeIndexSWRAM[NDS::SharedWRAMSize / 512]; +AddressRange CodeIndexVRAM[0x100000 / 512]; +AddressRange CodeIndexARM9BIOS[sizeof(NDS::ARM9BIOS) / 512]; +AddressRange CodeIndexARM7BIOS[sizeof(NDS::ARM7BIOS) / 512]; +AddressRange CodeIndexARM7WRAM[NDS::ARM7WRAMSize / 512]; +AddressRange CodeIndexARM7WVRAM[0x40000 / 512]; std::unordered_map<u32, JitBlock*> JitBlocks9; std::unordered_map<u32, JitBlock*> JitBlocks7; -u8 MemoryStatus9[0x800000]; -u8 MemoryStatus7[0x800000]; +u64 FastBlockLookupITCM[ITCMPhysicalSize / 2]; +u64 FastBlockLookupMainRAM[NDS::MainRAMSize / 2]; +u64 FastBlockLookupSWRAM[NDS::SharedWRAMSize / 2]; +u64 FastBlockLookupVRAM[0x100000 / 2]; +u64 FastBlockLookupARM9BIOS[sizeof(NDS::ARM9BIOS) / 2]; +u64 FastBlockLookupARM7BIOS[sizeof(NDS::ARM7BIOS) / 2]; +u64 FastBlockLookupARM7WRAM[NDS::ARM7WRAMSize / 2]; +u64 FastBlockLookupARM7WVRAM[0x40000 / 2]; -int ClassifyAddress9(u32 addr) +const u32 CodeRegionSizes[ARMJIT_Memory::memregions_Count] = { - if (addr < NDS::ARM9->ITCMSize) - return memregion_ITCM; - else if (addr >= NDS::ARM9->DTCMBase && addr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) - return memregion_DTCM; - else if ((addr & 0xFFFFF000) == 0xFFFF0000) - return memregion_BIOS9; - else - { - switch (addr & 0xFF000000) - { - case 0x02000000: - return memregion_MainRAM; - case 0x03000000: - return memregion_SWRAM9; - case 0x04000000: - return memregion_IO9; - case 0x06000000: - return memregion_VRAM; - } - } - return memregion_Other; -} + 0, + ITCMPhysicalSize, + 0, + sizeof(NDS::ARM9BIOS), + NDS::MainRAMSize, + NDS::SharedWRAMSize, + 0, + 0x100000, + sizeof(NDS::ARM7BIOS), + NDS::ARM7WRAMSize, + 0, + 0, + 0x40000, +}; -int ClassifyAddress7(u32 addr) +AddressRange* const CodeMemRegions[ARMJIT_Memory::memregions_Count] = { - if (addr < 0x00004000) - return memregion_BIOS7; - else - { - switch (addr & 0xFF800000) - { - case 0x02000000: - case 0x02800000: - return memregion_MainRAM; - case 0x03000000: - if (NDS::SWRAM_ARM7) - return memregion_SWRAM7; - else - return memregion_WRAM7; - case 0x03800000: - return memregion_WRAM7; - case 0x04000000: - return memregion_IO7; - case 0x04800000: - return memregion_Wifi; - case 0x06000000: - case 0x06800000: - return memregion_VWRAM; - } - } - return memregion_Other; -} + NULL, + CodeIndexITCM, + NULL, + CodeIndexARM9BIOS, + CodeIndexMainRAM, + CodeIndexSWRAM, + NULL, + CodeIndexVRAM, + CodeIndexARM7BIOS, + CodeIndexARM7WRAM, + NULL, + NULL, + CodeIndexARM7WVRAM, +}; -void UpdateMemoryStatus9(u32 start, u32 end) +u64* const 
FastBlockLookupRegions[ARMJIT_Memory::memregions_Count] = { - start >>= 12; - end >>= 12; - - if (end == 0xFFFFF) - end++; - - for (u32 i = start; i < end; i++) - { - u32 addr = i << 12; - - int region = ClassifyAddress9(addr); - u32 pseudoPhyisical = TranslateAddr9(addr); - - for (u32 j = 0; j < 8; j++) - { - u8 val = region; - if (CodeRanges[(pseudoPhyisical + (j << 12)) / 512].Blocks.Length) - val |= 0x80; - MemoryStatus9[i * 8 + j] = val; - } - } -} + NULL, + FastBlockLookupITCM, + NULL, + FastBlockLookupARM9BIOS, + FastBlockLookupMainRAM, + FastBlockLookupSWRAM, + NULL, + FastBlockLookupVRAM, + FastBlockLookupARM7BIOS, + FastBlockLookupARM7WRAM, + NULL, + NULL, + FastBlockLookupARM7WVRAM +}; -void UpdateMemoryStatus7(u32 start, u32 end) +u32 LocaliseCodeAddress(u32 num, u32 addr) { - start >>= 12; - end >>= 12; - - if (end == 0xFFFFF) - end++; - - for (u32 i = start; i < end; i++) + int region = num == 0 + ? ARMJIT_Memory::ClassifyAddress9(addr) + : ARMJIT_Memory::ClassifyAddress7(addr); + + u32 mappingStart, mappingSize, memoryOffset, memorySize; + if (ARMJIT_Memory::GetRegionMapping(region, num, mappingStart, + mappingSize, memoryOffset, memorySize) + && CodeMemRegions[region]) { - u32 addr = i << 12; - - int region = ClassifyAddress7(addr); - u32 pseudoPhyisical = TranslateAddr7(addr); - - for (u32 j = 0; j < 8; j++) - { - u8 val = region; - if (CodeRanges[(pseudoPhyisical + (j << 12)) / 512].Blocks.Length) - val |= 0x80; - MemoryStatus7[i * 8 + j] = val; - } + addr = ((addr - mappingStart) & (memorySize - 1)) + memoryOffset; + addr |= (u32)region << 28; + return addr; } + return 0; } -void UpdateRegionByPseudoPhyiscal(u32 addr, bool invalidate) -{ - for (u32 i = 1; i < exeMem_Count; i++) - { - if (addr >= ExeMemRegionOffsets[i] && addr < ExeMemRegionOffsets[i] + ExeMemRegionSizes[i]) - { - for (u32 num = 0; num < 2; num++) - { - u32 physSize = ExeMemRegionSizes[i]; - u32 mapSize = 0; - u32 mapStart = 0; - switch (i) - { - case exeMem_ITCM: - if (num == 0) - mapStart = 0; mapSize = NDS::ARM9->ITCMSize; - break; - case exeMem_MainRAM: mapStart = 0x2000000; mapSize = 0x1000000; break; - case exeMem_SWRAM: - if (num == 0) - { - if (NDS::SWRAM_ARM9) - mapStart = 0x3000000, mapSize = 0x1000000; - else - mapStart = mapSize = 0; - } - else - { - if (NDS::SWRAM_ARM7) - mapStart = 0x3000000, mapSize = 0x800000; - else - mapStart = mapSize = 0; - } - break; - case exeMem_LCDC: - if (num == 0) - mapStart = 0x6800000, mapSize = 0xA4000; - break; - case exeMem_ARM9_BIOS: - if (num == 0) - mapStart = 0xFFFF0000, mapSize = 0x10000; - break; - case exeMem_ARM7_BIOS: - if (num == 1) - mapStart = 0; mapSize = 0x4000; - break; - case exeMem_ARM7_WRAM: - if (num == 1) - { - if (NDS::SWRAM_ARM7) - mapStart = 0x3800000, mapSize = 0x800000; - else - mapStart = 0x3000000, mapSize = 0x1000000; - } - break; - case exeMem_ARM7_WVRAM: - if (num == 1) - mapStart = 0x6000000, mapSize = 0x1000000; - break; - } - - for (u32 j = 0; j < mapSize / physSize; j++) - { - u32 virtAddr = mapStart + physSize * j + (addr - ExeMemRegionOffsets[i]); - if (num == 0 - && virtAddr >= NDS::ARM9->DTCMBase && virtAddr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) - continue; - if (invalidate) - { - if (num == 0) - MemoryStatus9[virtAddr / 512] |= 0x80; - else - MemoryStatus7[virtAddr / 512] |= 0x80; - } - else - { - if (num == 0) - MemoryStatus9[virtAddr / 512] &= ~0x80; - else - MemoryStatus7[virtAddr / 512] &= ~0x80; - } - } - - } - return; - } - } - - assert(false); -} +TinyVector<u32> InvalidLiterals; template <typename T> -T 
SlowRead9(ARMv5* cpu, u32 addr) +T SlowRead9(u32 addr, ARMv5* cpu) { u32 offset = addr & 0x3; addr &= ~(sizeof(T) - 1); @@ -335,13 +154,13 @@ T SlowRead9(ARMv5* cpu, u32 addr) } template <typename T> -void SlowWrite9(ARMv5* cpu, u32 addr, T val) +void SlowWrite9(u32 addr, ARMv5* cpu, T val) { addr &= ~(sizeof(T) - 1); if (addr < cpu->ITCMSize) { - InvalidateITCMIfNecessary(addr); + CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); *(T*)&cpu->ITCM[addr & 0x7FFF] = val; } else if (addr >= cpu->DTCMBase && addr < (cpu->DTCMBase + cpu->DTCMSize)) @@ -362,13 +181,13 @@ void SlowWrite9(ARMv5* cpu, u32 addr, T val) } } -template void SlowWrite9<u32>(ARMv5*, u32, u32); -template void SlowWrite9<u16>(ARMv5*, u32, u16); -template void SlowWrite9<u8>(ARMv5*, u32, u8); +template void SlowWrite9<u32>(u32, ARMv5*, u32); +template void SlowWrite9<u16>(u32, ARMv5*, u16); +template void SlowWrite9<u8>(u32, ARMv5*, u8); -template u32 SlowRead9<u32>(ARMv5*, u32); -template u16 SlowRead9<u16>(ARMv5*, u32); -template u8 SlowRead9<u8>(ARMv5*, u32); +template u32 SlowRead9<u32>(u32, ARMv5*); +template u16 SlowRead9<u16>(u32, ARMv5*); +template u8 SlowRead9<u8>(u32, ARMv5*); template <typename T> T SlowRead7(u32 addr) @@ -407,14 +226,15 @@ template <bool PreInc, bool Write> void SlowBlockTransfer9(u32 addr, u64* data, u32 num, ARMv5* cpu) { addr &= ~0x3; + if (PreInc) + addr += 4; for (int i = 0; i < num; i++) { - addr += PreInc * 4; if (Write) - SlowWrite9<u32>(cpu, addr, data[i]); + SlowWrite9<u32>(addr, cpu, data[i]); else - data[i] = SlowRead9<u32>(cpu, addr); - addr += !PreInc * 4; + data[i] = SlowRead9<u32>(addr, cpu); + addr += 4; } } @@ -422,14 +242,15 @@ template <bool PreInc, bool Write> void SlowBlockTransfer7(u32 addr, u64* data, u32 num) { addr &= ~0x3; + if (PreInc) + addr += 4; for (int i = 0; i < num; i++) { - addr += PreInc * 4; if (Write) SlowWrite7<u32>(addr, data[i]); else data[i] = SlowRead7<u32>(addr); - addr += !PreInc * 4; + addr += 4; } } @@ -540,16 +361,18 @@ struct UnreliableHashTable }; UnreliableHashTable<u32, JitBlock*, 0x800, nullptr> RestoreCandidates; -UnreliableHashTable<u32, u32, 0x800, UINT32_MAX> FastBlockLookUp9; -UnreliableHashTable<u32, u32, 0x800, UINT32_MAX> FastBlockLookUp7; void Init() { JITCompiler = new Compiler(); + + ARMJIT_Memory::Init(); } void DeInit() { + ARMJIT_Memory::DeInit(); + delete JITCompiler; } @@ -557,8 +380,7 @@ void Reset() { ResetBlockCache(); - UpdateMemoryStatus9(0, 0xFFFFFFFF); - UpdateMemoryStatus7(0, 0xFFFFFFFF); + ARMJIT_Memory::Reset(); } void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) @@ -673,11 +495,12 @@ bool IsIdleLoop(FetchedInstr* instrs, int instrsCount) // it basically checks if one iteration of a loop depends on another // the rules are quite simple + JIT_DEBUGPRINT("checking potential idle loop\n"); u16 regsWrittenTo = 0; u16 regsDisallowedToWrite = 0; for (int i = 0; i < instrsCount; i++) { - //printf("instr %d %x regs(%x %x) %x %x\n", i, instrs[i].Instr, instrs[i].Info.DstRegs, instrs[i].Info.SrcRegs, regsWrittenTo, regsDisallowedToWrite); + JIT_DEBUGPRINT("instr %d %x regs(%x %x) %x %x\n", i, instrs[i].Instr, instrs[i].Info.DstRegs, instrs[i].Info.SrcRegs, regsWrittenTo, regsDisallowedToWrite); if (instrs[i].Info.SpecialKind == ARMInstrInfo::special_WriteMem) return false; if (i < instrsCount - 1 && instrs[i].Info.Branches()) @@ -782,8 +605,6 @@ InterpreterFunc InterpretTHUMB[ARMInstrInfo::tk_Count] = }; #undef F - -extern u32 literalsPerBlock; void CompileBlock(ARM* cpu) { bool thumb = cpu->CPSR & 
0x20; @@ -794,14 +615,28 @@ void CompileBlock(ARM* cpu) Config::JIT_MaxBlockSize = 32; u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4); - u32 pseudoPhysicalAddr = cpu->Num == 0 - ? TranslateAddr9(blockAddr) - : TranslateAddr7(blockAddr); - if (pseudoPhysicalAddr < ExeMemRegionSizes[exeMem_Unmapped]) - { - printf("Trying to compile a block in unmapped memory: %x\n", blockAddr); - } - + + auto& map = cpu->Num == 0 ? JitBlocks9 : JitBlocks7; + auto existingBlockIt = map.find(blockAddr); + if (existingBlockIt != map.end()) + { + // there's already a block, though it's not inside the fast map + // could be that there are two blocks at the same physical addr + // but different mirrors + u32 localAddr = existingBlockIt->second->StartAddrLocal; + + u64* entry = &FastBlockLookupRegions[localAddr >> 28][localAddr & 0xFFFFFFF]; + *entry = ((u64)blockAddr | cpu->Num) << 32; + *entry |= JITCompiler->SubEntryOffset(existingBlockIt->second->EntryPoint); + return; + } + + u32 localAddr = LocaliseCodeAddress(cpu->Num, blockAddr); + if (!localAddr) + { + printf("trying to compile non executable code? %x\n", blockAddr); + } + FetchedInstr instrs[Config::JIT_MaxBlockSize]; int i = 0; u32 r15 = cpu->R[15]; @@ -842,9 +677,8 @@ void CompileBlock(ARM* cpu) instrValues[i] = instrs[i].Instr; - u32 translatedAddr = cpu->Num == 0 - ? TranslateAddr9(instrs[i].Addr) - : TranslateAddr7(instrs[i].Addr); + u32 translatedAddr = LocaliseCodeAddress(cpu->Num, instrs[i].Addr); + assert(translatedAddr); u32 translatedAddrRounded = translatedAddr & ~0x1FF; if (i == 0 || translatedAddrRounded != addressRanges[numAddressRanges - 1]) { @@ -928,9 +762,11 @@ void CompileBlock(ARM* cpu) && instrs[i].Info.SpecialKind == ARMInstrInfo::special_LoadLiteral && DecodeLiteral(thumb, instrs[i], literalAddr)) { - u32 translatedAddr = cpu->Num == 0 - ? TranslateAddr9(literalAddr) - : TranslateAddr7(literalAddr); + u32 translatedAddr = LocaliseCodeAddress(cpu->Num, literalAddr); + if (!translatedAddr) + { + printf("literal in non executable memory?\n"); + } u32 translatedAddrRounded = translatedAddr & ~0x1FF; u32 j = 0; @@ -994,9 +830,7 @@ void CompileBlock(ARM* cpu) } else if (hasBranched && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) { - u32 targetPseudoPhysical = cpu->Num == 0 - ? 
TranslateAddr9(target) - : TranslateAddr7(target); + u32 targetLocalised = LocaliseCodeAddress(cpu->Num, target); if (link) { @@ -1048,7 +882,7 @@ void CompileBlock(ARM* cpu) { RestoreCandidates.Remove(instrHash); - mayRestore = prevBlock->PseudoPhysicalAddr == pseudoPhysicalAddr && prevBlock->LiteralHash == literalHash; + mayRestore = prevBlock->StartAddr == blockAddr && prevBlock->LiteralHash == literalHash; if (mayRestore && prevBlock->NumAddresses == numAddressRanges) { @@ -1087,11 +921,12 @@ void CompileBlock(ARM* cpu) for (int j = 0; j < numLiterals; j++) block->Literals()[j] = literalLoadAddrs[j]; - block->PseudoPhysicalAddr = pseudoPhysicalAddr; + block->StartAddr = blockAddr; + block->StartAddrLocal = localAddr; FloodFillSetFlags(instrs, i - 1, 0xF); - block->EntryPoint = JITCompiler->CompileBlock(pseudoPhysicalAddr, cpu, thumb, instrs, i); + block->EntryPoint = JITCompiler->CompileBlock(cpu, thumb, instrs, i); } else { @@ -1104,30 +939,34 @@ void CompileBlock(ARM* cpu) assert(addressRanges[j] == block->AddressRanges()[j]); assert(addressMasks[j] == block->AddressMasks()[j]); assert(addressMasks[j] != 0); - CodeRanges[addressRanges[j] / 512].Code |= addressMasks[j]; - CodeRanges[addressRanges[j] / 512].Blocks.Add(block); - UpdateRegionByPseudoPhyiscal(addressRanges[j], true); + AddressRange* region = CodeMemRegions[addressRanges[j] >> 28]; + + if (!PageContainsCode(®ion[(addressRanges[j] & 0xFFFF000) / 512])) + ARMJIT_Memory::SetCodeProtection(addressRanges[j] >> 28, addressRanges[j] & 0xFFFFFFF, true); + + AddressRange* range = ®ion[(addressRanges[j] & 0xFFFFFFF) / 512]; + range->Code |= addressMasks[j]; + range->Blocks.Add(block); } if (cpu->Num == 0) - { - JitBlocks9[pseudoPhysicalAddr] = block; - FastBlockLookUp9.Insert(pseudoPhysicalAddr, JITCompiler->SubEntryOffset(block->EntryPoint)); - } + JitBlocks9[blockAddr] = block; else - { - JitBlocks7[pseudoPhysicalAddr] = block; - FastBlockLookUp7.Insert(pseudoPhysicalAddr, JITCompiler->SubEntryOffset(block->EntryPoint)); - } + JitBlocks7[blockAddr] = block; + + u64* entry = &FastBlockLookupRegions[(localAddr >> 28)][(localAddr & 0xFFFFFFF) / 2]; + *entry = ((u64)blockAddr | cpu->Num) << 32; + *entry |= JITCompiler->SubEntryOffset(block->EntryPoint); } -void InvalidateByAddr(u32 pseudoPhysical) +void InvalidateByAddr(u32 localAddr) { - JIT_DEBUGPRINT("invalidating by addr %x\n", pseudoPhysical); + JIT_DEBUGPRINT("invalidating by addr %x\n", localAddr); - AddressRange* range = &CodeRanges[pseudoPhysical / 512]; - u32 mask = 1 << ((pseudoPhysical & 0x1FF) / 16); + AddressRange* region = CodeMemRegions[localAddr >> 28]; + AddressRange* range = ®ion[(localAddr & 0xFFFFFFF) / 512]; + u32 mask = 1 << ((localAddr & 0x1FF) / 16); range->Code = 0; for (int i = 0; i < range->Blocks.Length;) @@ -1138,7 +977,7 @@ void InvalidateByAddr(u32 pseudoPhysical) u32 mask = 0; for (int j = 0; j < block->NumAddresses; j++) { - if (block->AddressRanges()[j] == (pseudoPhysical & ~0x1FF)) + if (block->AddressRanges()[j] == (localAddr & ~0x1FF)) { mask = block->AddressMasks()[j]; invalidated = block->AddressMasks()[j] & mask; @@ -1154,15 +993,21 @@ void InvalidateByAddr(u32 pseudoPhysical) } range->Blocks.Remove(i); + if (range->Blocks.Length == 0 + && !PageContainsCode(®ion[(localAddr & 0xFFFF000) / 512])) + { + ARMJIT_Memory::SetCodeProtection(localAddr >> 28, localAddr & 0xFFFFFFF, false); + } + bool literalInvalidation = false; for (int j = 0; j < block->NumLiterals; j++) { u32 addr = block->Literals()[j]; - if (addr == pseudoPhysical) + if (addr == 
localAddr) { - if (InvalidLiterals.Find(pseudoPhysical) != -1) + if (InvalidLiterals.Find(localAddr) != -1) { - InvalidLiterals.Add(pseudoPhysical); + InvalidLiterals.Add(localAddr); JIT_DEBUGPRINT("found invalid literal %d\n", InvalidLiterals.Length); } literalInvalidation = true; @@ -1172,35 +1017,30 @@ void InvalidateByAddr(u32 pseudoPhysical) for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; - if ((addr / 512) != (pseudoPhysical / 512)) + if ((addr / 512) != (localAddr / 512)) { - AddressRange* otherRange = &CodeRanges[addr / 512]; + AddressRange* otherRegion = CodeMemRegions[addr >> 28]; + AddressRange* otherRange = &otherRegion[(addr & 0xFFFFFFF) / 512]; assert(otherRange != range); + bool removed = otherRange->Blocks.RemoveByValue(block); assert(removed); if (otherRange->Blocks.Length == 0) { + if (!PageContainsCode(&otherRegion[(addr & 0xFFFF000) / 512])) + ARMJIT_Memory::SetCodeProtection(addr >> 28, addr & 0xFFFFFFF, false); + otherRange->Code = 0; - UpdateRegionByPseudoPhyiscal(addr, false); } } } - for (int j = 0; j < block->NumLinks(); j++) - JITCompiler->UnlinkBlock(block->Links()[j]); - block->ResetLinks(); - + FastBlockLookupRegions[block->StartAddrLocal >> 28][(block->StartAddrLocal & 0xFFFFFFF) / 2] = (u64)UINT32_MAX << 32; if (block->Num == 0) - { - JitBlocks9.erase(block->PseudoPhysicalAddr); - FastBlockLookUp9.Remove(block->PseudoPhysicalAddr); - } + JitBlocks9.erase(block->StartAddr); else - { - JitBlocks7.erase(block->PseudoPhysicalAddr); - FastBlockLookUp7.Remove(block->PseudoPhysicalAddr); - } + JitBlocks7.erase(block->StartAddr); if (!literalInvalidation) { @@ -1213,24 +1053,66 @@ void InvalidateByAddr(u32 pseudoPhysical) delete block; } } +} - if (range->Blocks.Length == 0) - UpdateRegionByPseudoPhyiscal(pseudoPhysical, false); +template <u32 num, int region> +void CheckAndInvalidate(u32 addr) +{ + // let's hope this gets all properly inlined + u32 mappingStart, mappingSize, memoryOffset, memorySize; + if (ARMJIT_Memory::GetRegionMapping(region, num, mappingStart, mappingSize, memoryOffset, memorySize)) + { + u32 localAddr = ((addr - mappingStart) & (memorySize - 1)) + memoryOffset; + if (CodeMemRegions[region][localAddr / 512].Code & (1 << ((localAddr & 0x1FF) / 16))) + InvalidateByAddr(localAddr | (region << 28)); + } +} + +JitBlockEntry LookUpBlock(u32 num, u64* entries, u32 offset, u32 addr) +{ + u64* entry = &entries[offset / 2]; + if (*entry >> 32 == (addr | num)) + return JITCompiler->AddEntryOffset((u32)*entry); + return NULL; } -void InvalidateRegionIfNecessary(u32 pseudoPhyisical) +bool SetupExecutableRegion(u32 num, u32 blockAddr, u64*& entry, u32& start, u32& size) { - if (CodeRanges[pseudoPhyisical / 512].Code & (1 << ((pseudoPhyisical & 0x1FF) / 16))) - InvalidateByAddr(pseudoPhyisical); + int region = num == 0 + ? 
ARMJIT_Memory::ClassifyAddress9(blockAddr) + : ARMJIT_Memory::ClassifyAddress7(blockAddr); + + u32 mappingStart, mappingSize, memoryOffset, memorySize; + if (CodeMemRegions[region] + && ARMJIT_Memory::GetRegionMapping(region, num, mappingStart, + mappingSize, memoryOffset, memorySize)) + { + entry = FastBlockLookupRegions[region] + memoryOffset / 2; + // evil, though it should work for everything except DTCM which is not relevant here + start = blockAddr & ~(memorySize - 1); + size = memorySize; + return true; + } + else + return false; } +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(u32); +template void CheckAndInvalidate<1, ARMJIT_Memory::memregion_VWRAM>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_VRAM>(u32); +template void CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(u32); + void ResetBlockCache() { printf("Resetting JIT block cache...\n"); InvalidLiterals.Clear(); - FastBlockLookUp9.Reset(); - FastBlockLookUp7.Reset(); + for (int i = 0; i < ARMJIT_Memory::memregions_Count; i++) + memset(FastBlockLookupRegions[i], 0xFF, CodeRegionSizes[i] * sizeof(u64) / 2); RestoreCandidates.Reset(); for (int i = 0; i < sizeof(RestoreCandidates.Table)/sizeof(RestoreCandidates.Table[0]); i++) { @@ -1251,8 +1133,9 @@ void ResetBlockCache() for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; - CodeRanges[addr / 512].Blocks.Clear(); - CodeRanges[addr / 512].Code = 0; + AddressRange* range = &CodeMemRegions[addr >> 28][(addr & 0xFFFFFFF) / 512]; + range->Blocks.Clear(); + range->Code = 0; } delete block; } @@ -1262,8 +1145,9 @@ void ResetBlockCache() for (int j = 0; j < block->NumAddresses; j++) { u32 addr = block->AddressRanges()[j]; - CodeRanges[addr / 512].Blocks.Clear(); - CodeRanges[addr / 512].Code = 0; + AddressRange* range = &CodeMemRegions[addr >> 28][(addr & 0xFFFFFFF) / 512]; + range->Blocks.Clear(); + range->Code = 0; } } JitBlocks9.clear(); @@ -1272,191 +1156,4 @@ void ResetBlockCache() JITCompiler->Reset(); } -template <u32 Num> -JitBlockEntry LookUpBlockEntry(u32 addr) -{ - auto& fastMap = Num == 0 ? FastBlockLookUp9 : FastBlockLookUp7; - u32 entryOffset = fastMap.LookUp(addr); - if (entryOffset != UINT32_MAX) - return JITCompiler->AddEntryOffset(entryOffset); - - auto& slowMap = Num == 0 ? JitBlocks9 : JitBlocks7; - auto block = slowMap.find(addr); - if (block != slowMap.end()) - { - fastMap.Insert(addr, JITCompiler->SubEntryOffset(block->second->EntryPoint)); - return block->second->EntryPoint; - } - return NULL; -} - -template JitBlockEntry LookUpBlockEntry<0>(u32); -template JitBlockEntry LookUpBlockEntry<1>(u32); - -template <u32 Num> -void LinkBlock(ARM* cpu, u32 codeOffset) -{ - auto& blockMap = Num == 0 ? JitBlocks9 : JitBlocks7; - u32 instrAddr = cpu->R[15] - ((cpu->CPSR&0x20)?2:4); - u32 targetPseudoPhys = Num == 0 ? 
TranslateAddr9(instrAddr) : TranslateAddr7(instrAddr); - auto block = blockMap.find(targetPseudoPhys); - if (block == blockMap.end()) - { - CompileBlock(cpu); - block = blockMap.find(targetPseudoPhys); - } - - JIT_DEBUGPRINT("linking to block %08x\n", targetPseudoPhys); - - block->second->AddLink(codeOffset); - JITCompiler->LinkBlock(codeOffset, block->second->EntryPoint); -} - -template void LinkBlock<0>(ARM*, u32); -template void LinkBlock<1>(ARM*, u32); - -void WifiWrite32(u32 addr, u32 val) -{ - Wifi::Write(addr, val & 0xFFFF); - Wifi::Write(addr + 2, val >> 16); -} - -u32 WifiRead32(u32 addr) -{ - return Wifi::Read(addr) | (Wifi::Read(addr + 2) << 16); -} - -template <typename T> -void VRAMWrite(u32 addr, T val) -{ - switch (addr & 0x00E00000) - { - case 0x00000000: GPU::WriteVRAM_ABG<T>(addr, val); return; - case 0x00200000: GPU::WriteVRAM_BBG<T>(addr, val); return; - case 0x00400000: GPU::WriteVRAM_AOBJ<T>(addr, val); return; - case 0x00600000: GPU::WriteVRAM_BOBJ<T>(addr, val); return; - default: GPU::WriteVRAM_LCDC<T>(addr, val); return; - } -} -template <typename T> -T VRAMRead(u32 addr) -{ - switch (addr & 0x00E00000) - { - case 0x00000000: return GPU::ReadVRAM_ABG<T>(addr); - case 0x00200000: return GPU::ReadVRAM_BBG<T>(addr); - case 0x00400000: return GPU::ReadVRAM_AOBJ<T>(addr); - case 0x00600000: return GPU::ReadVRAM_BOBJ<T>(addr); - default: return GPU::ReadVRAM_LCDC<T>(addr); - } -} - -void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) -{ - if (cpu->Num == 0) - { - switch (addr & 0xFF000000) - { - case 0x04000000: - if (!store && size == 32 && addr == 0x04100010 && NDS::ExMemCnt[0] & (1<<11)) - return (void*)NDSCart::ReadROMData; - - /* - unfortunately we can't map GPU2D this way - since it's hidden inside an object - - though GPU3D registers are accessed much more intensive - */ - if (addr >= 0x04000320 && addr < 0x040006A4) - { - switch (size | store) - { - case 8: return (void*)GPU3D::Read8; - case 9: return (void*)GPU3D::Write8; - case 16: return (void*)GPU3D::Read16; - case 17: return (void*)GPU3D::Write16; - case 32: return (void*)GPU3D::Read32; - case 33: return (void*)GPU3D::Write32; - } - } - - switch (size | store) - { - case 8: return (void*)NDS::ARM9IORead8; - case 9: return (void*)NDS::ARM9IOWrite8; - case 16: return (void*)NDS::ARM9IORead16; - case 17: return (void*)NDS::ARM9IOWrite16; - case 32: return (void*)NDS::ARM9IORead32; - case 33: return (void*)NDS::ARM9IOWrite32; - } - break; - case 0x06000000: - switch (size | store) - { - case 8: return (void*)VRAMRead<u8>; - case 9: return NULL; - case 16: return (void*)VRAMRead<u16>; - case 17: return (void*)VRAMWrite<u16>; - case 32: return (void*)VRAMRead<u32>; - case 33: return (void*)VRAMWrite<u32>; - } - break; - } - } - else - { - switch (addr & 0xFF800000) - { - case 0x04000000: - if (addr >= 0x04000400 && addr < 0x04000520) - { - switch (size | store) - { - case 8: return (void*)SPU::Read8; - case 9: return (void*)SPU::Write8; - case 16: return (void*)SPU::Read16; - case 17: return (void*)SPU::Write16; - case 32: return (void*)SPU::Read32; - case 33: return (void*)SPU::Write32; - } - } - - switch (size | store) - { - case 8: return (void*)NDS::ARM7IORead8; - case 9: return (void*)NDS::ARM7IOWrite8; - case 16: return (void*)NDS::ARM7IORead16; - case 17: return (void*)NDS::ARM7IOWrite16; - case 32: return (void*)NDS::ARM7IORead32; - case 33: return (void*)NDS::ARM7IOWrite32; - } - break; - case 0x04800000: - if (addr < 0x04810000 && size >= 16) - { - switch (size | store) - { - case 16: return 
(void*)Wifi::Read; - case 17: return (void*)Wifi::Write; - case 32: return (void*)WifiRead32; - case 33: return (void*)WifiWrite32; - } - } - break; - case 0x06000000: - case 0x06800000: - switch (size | store) - { - case 8: return (void*)GPU::ReadVRAM_ARM7<u8>; - case 9: return (void*)GPU::WriteVRAM_ARM7<u8>; - case 16: return (void*)GPU::ReadVRAM_ARM7<u16>; - case 17: return (void*)GPU::WriteVRAM_ARM7<u16>; - case 32: return (void*)GPU::ReadVRAM_ARM7<u32>; - case 33: return (void*)GPU::WriteVRAM_ARM7<u32>; - } - } - } - return NULL; -} - } diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 44a6140..2320b7b 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -9,32 +9,7 @@ namespace ARMJIT { -enum ExeMemKind -{ - exeMem_Unmapped = 0, - exeMem_ITCM, - exeMem_MainRAM, - exeMem_SWRAM, - exeMem_LCDC, - exeMem_ARM9_BIOS, - exeMem_ARM7_BIOS, - exeMem_ARM7_WRAM, - exeMem_ARM7_WVRAM, - exeMem_Count -}; - -extern const u32 ExeMemRegionOffsets[]; -extern const u32 ExeMemRegionSizes[]; - -typedef u32 (*JitBlockEntry)(); - -const u32 ExeMemSpaceSize = 0x518000; // I hate you C++, sometimes I really hate you... - -u32 TranslateAddr9(u32 addr); -u32 TranslateAddr7(u32 addr); - -template <u32 Num> -JitBlockEntry LookUpBlockEntry(u32 addr); +typedef void (*JitBlockEntry)(); void Init(); void DeInit(); @@ -43,44 +18,15 @@ void Reset(); void InvalidateByAddr(u32 pseudoPhysical); -void InvalidateRegionIfNecessary(u32 addr); - -inline void InvalidateMainRAMIfNecessary(u32 addr) -{ - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_MainRAM] + (addr & (MAIN_RAM_SIZE - 1))); -} -inline void InvalidateITCMIfNecessary(u32 addr) -{ - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_ITCM] + (addr & 0x7FFF)); -} -inline void InvalidateLCDCIfNecessary(u32 addr) -{ - if (addr < 0x68A3FFF) - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_LCDC] + (addr - 0x6800000)); -} -inline void InvalidateSWRAM7IfNecessary(u32 addr) -{ - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM7 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM7Mask)); -} -inline void InvalidateSWRAM9IfNecessary(u32 addr) -{ - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_SWRAM] + (NDS::SWRAM_ARM9 - NDS::SharedWRAM) + (addr & NDS::SWRAM_ARM9Mask)); -} -inline void InvalidateARM7WRAMIfNecessary(u32 addr) -{ - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_ARM7_WRAM] + (addr & 0xFFFF)); -} -inline void InvalidateARM7WVRAMIfNecessary(u32 addr) -{ - InvalidateRegionIfNecessary(ExeMemRegionOffsets[exeMem_ARM7_WVRAM] + (addr & 0x1FFFF)); -} +template <u32 num, int region> +void CheckAndInvalidate(u32 addr); void CompileBlock(ARM* cpu); void ResetBlockCache(); -void UpdateMemoryStatus9(u32 start, u32 end); -void UpdateMemoryStatus7(u32 start, u32 end); +JitBlockEntry LookUpBlock(u32 num, u64* entries, u32 offset, u32 addr); +bool SetupExecutableRegion(u32 num, u32 blockAddr, u64*& entry, u32& start, u32& size); } diff --git a/src/ARMJIT_A64/ARMJIT_ALU.cpp b/src/ARMJIT_A64/ARMJIT_ALU.cpp index 0fe6a97..5f021a0 100644 --- a/src/ARMJIT_A64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_A64/ARMJIT_ALU.cpp @@ -243,7 +243,7 @@ void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2 if (S && !CurInstr.SetFlags) S = false; - bool CVInGP = false; + bool CVInGPR = false; switch (op) { case 0x2: // SUB @@ -306,7 +306,7 @@ void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2 UBFX(W2, RCPSR, 29, 1); if (S) { - CVInGP = true; + CVInGPR = true; ADDS(W1, rn, W2); CSET(W2, CC_CS); CSET(W3, 
CC_VS); @@ -335,7 +335,7 @@ void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2 ORN(W1, WZR, op2.Reg.Rm, op2.ToArithOption()); if (S) { - CVInGP = true; + CVInGPR = true; ADDS(W1, W2, W1); CSET(W2, CC_CS); CSET(W3, CC_VS); @@ -355,7 +355,7 @@ void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2 MVN(W1, rn); if (S) { - CVInGP = true; + CVInGPR = true; ADDS(W1, W2, W1); CSET(W2, CC_CS); CSET(W3, CC_VS); @@ -379,12 +379,12 @@ void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2 if (S) { - if (CVInGP) + if (CVInGPR) { BFI(RCPSR, W2, 29, 1); BFI(RCPSR, W3, 28, 1); } - Comp_RetriveFlags(!CVInGP); + Comp_RetriveFlags(!CVInGPR); } } @@ -501,7 +501,23 @@ void Compiler::A_Comp_ALUMovOp() MOVI2R(rd, op2.Imm); } else - MOV(rd, op2.Reg.Rm, op2.ToArithOption()); + { + // ORR with shifted operand has cycles latency + if (op2.Reg.ShiftAmount > 0) + { + switch (op2.Reg.ShiftType) + { + case ST_LSL: LSL(rd, op2.Reg.Rm, op2.Reg.ShiftAmount); break; + case ST_LSR: LSR(rd, op2.Reg.Rm, op2.Reg.ShiftAmount); break; + case ST_ASR: ASR(rd, op2.Reg.Rm, op2.Reg.ShiftAmount); break; + case ST_ROR: ROR_(rd, op2.Reg.Rm, op2.Reg.ShiftAmount); break; + } + } + else + { + MOV(rd, op2.Reg.Rm, op2.ToArithOption()); + } + } } if (S) @@ -558,10 +574,7 @@ void Compiler::Comp_Mul_Mla(bool S, bool mla, ARM64Reg rd, ARM64Reg rm, ARM64Reg } else { - CLZ(W0, rs); - CLS(W1, rs); - CMP(W0, W1); - CSEL(W0, W0, W1, CC_GT); + CLS(W0, rs); Comp_AddCycles_CI(mla ? 1 : 0, W0, ArithOption(W0, ST_LSR, 3)); } @@ -594,10 +607,10 @@ void Compiler::A_Comp_Mul_Long() } else { - CLZ(W0, rs); - CLS(W1, rs); - CMP(W0, W1); - CSEL(W0, W0, W1, CC_GT); + if (sign) + CLS(W0, rs); + else + CLZ(W0, rs); Comp_AddCycles_CI(0, W0, ArithOption(W0, ST_LSR, 3)); } @@ -628,6 +641,86 @@ void Compiler::A_Comp_Mul_Long() Comp_RetriveFlags(false); } +void Compiler::A_Comp_Mul_Short() +{ + ARM64Reg rd = MapReg(CurInstr.A_Reg(16)); + ARM64Reg rm = MapReg(CurInstr.A_Reg(0)); + ARM64Reg rs = MapReg(CurInstr.A_Reg(8)); + u32 op = (CurInstr.Instr >> 21) & 0xF; + + bool x = CurInstr.Instr & (1 << 5); + bool y = CurInstr.Instr & (1 << 6); + + SBFX(W1, rs, y ? 16 : 0, 16); + + if (op == 0b1000) + { + // SMLAxy + + SBFX(W0, rm, x ? 16 : 0, 16); + + MUL(W0, W0, W1); + + ORRI2R(W1, RCPSR, 0x08000000); + + ARM64Reg rn = MapReg(CurInstr.A_Reg(12)); + ADDS(rd, W0, rn); + + CSEL(RCPSR, W1, RCPSR, CC_VS); + + CPSRDirty = true; + + Comp_AddCycles_C(); + } + else if (op == 0b1011) + { + // SMULxy + + SBFX(W0, rm, x ? 16 : 0, 16); + + MUL(rd, W0, W1); + + Comp_AddCycles_C(); + } + else if (op == 0b1010) + { + // SMLALxy + + ARM64Reg rn = MapReg(CurInstr.A_Reg(12)); + + MOV(W2, rn); + BFI(X2, rd, 32, 32); + + SBFX(W0, rm, x ? 16 : 0, 16); + + SMADDL(EncodeRegTo64(rn), W0, W1, X2); + + UBFX(EncodeRegTo64(rd), EncodeRegTo64(rn), 32, 32); + + Comp_AddCycles_CI(1); + } + else if (op == 0b1001) + { + // SMLAWy/SMULWy + SMULL(X0, rm, W1); + ASR(x ? 
EncodeRegTo64(rd) : X0, X0, 16); + + if (!x) + { + ORRI2R(W1, RCPSR, 0x08000000); + + ARM64Reg rn = MapReg(CurInstr.A_Reg(12)); + ADDS(rd, W0, rn); + + CSEL(RCPSR, W1, RCPSR, CC_VS); + + CPSRDirty = true; + } + + Comp_AddCycles_C(); + } +} + void Compiler::A_Comp_Mul() { ARM64Reg rd = MapReg(CurInstr.A_Reg(16)); diff --git a/src/ARMJIT_A64/ARMJIT_Branch.cpp b/src/ARMJIT_A64/ARMJIT_Branch.cpp index 542f0b7..f130938 100644 --- a/src/ARMJIT_A64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_A64/ARMJIT_Branch.cpp @@ -143,7 +143,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles) ConstantCycles += cycles; else - ADD(RCycles, RCycles, cycles); + SUB(RCycles, RCycles, cycles); } @@ -152,23 +152,19 @@ void* Compiler::Gen_JumpTo9(int kind) AlignCode16(); void* res = GetRXPtr(); - MOVI2R(W2, kCodeCacheTiming); - // W1 - code cycles non branch - // W2 - branch code cycles LSR(W1, W0, 12); - LSL(W1, W1, 2); ADDI2R(W1, W1, offsetof(ARMv5, MemTimings), W2); LDRB(W1, RCPU, W1); - LDR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARMv5, ITCMSize)); + LDR(INDEX_UNSIGNED, W2, RCPU, offsetof(ARMv5, ITCMSize)); STR(INDEX_UNSIGNED, W1, RCPU, offsetof(ARMv5, RegionCodeCycles)); - CMP(W0, W3); - FixupBranch outsideITCM = B(CC_LO); - MOVI2R(W1, 1); - MOVI2R(W2, 1); - SetJumpTarget(outsideITCM); + CMP(W1, 0xFF); + MOVI2R(W3, kCodeCacheTiming); + CSEL(W1, W3, W1, CC_EQ); + CMP(W0, W2); + CSINC(W1, W1, WZR, CC_HS); FixupBranch switchToThumb; if (kind == 0) @@ -176,40 +172,36 @@ void* Compiler::Gen_JumpTo9(int kind) if (kind == 0 || kind == 1) { - ANDI2R(W0, W0, ~3); - + // ARM if (kind == 0) ANDI2R(RCPSR, RCPSR, ~0x20); - ADD(W3, W0, 4); - STR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARM, R[15])); - - ADD(W1, W1, W2); - ADD(RCycles, RCycles, W1); + ANDI2R(W0, W0, ~3); + ADD(W0, W0, 4); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARMv5, R[15])); + ADD(W1, W1, W1); + SUB(RCycles, RCycles, W1); RET(); } + if (kind == 0 || kind == 2) { + // Thumb if (kind == 0) { SetJumpTarget(switchToThumb); - ORRI2R(RCPSR, RCPSR, 0x20); } ANDI2R(W0, W0, ~1); + ADD(W0, W0, 2); + STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARMv5, R[15])); - ADD(W3, W0, 2); - STR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARM, R[15])); - - FixupBranch halfwordLoc = TBZ(W0, 1); - ADD(W1, W1, W2); - ADD(RCycles, RCycles, W1); - RET(); - - SetJumpTarget(halfwordLoc); - ADD(RCycles, RCycles, W2); + ADD(W2, W1, W1); + TSTI2R(W0, 0x2); + CSEL(W1, W1, W2, CC_EQ); + SUB(RCycles, RCycles, W1); RET(); } @@ -237,7 +229,7 @@ void* Compiler::Gen_JumpTo7(int kind) UBFX(W2, W3, 0, 8); UBFX(W3, W3, 8, 8); ADD(W2, W3, W2); - ADD(RCycles, RCycles, W2); + SUB(RCycles, RCycles, W2); ANDI2R(W0, W0, ~3); @@ -261,7 +253,7 @@ void* Compiler::Gen_JumpTo7(int kind) UBFX(W2, W3, 16, 8); UBFX(W3, W3, 24, 8); ADD(W2, W3, W2); - ADD(RCycles, RCycles, W2); + SUB(RCycles, RCycles, W2); ANDI2R(W0, W0, ~1); @@ -287,22 +279,11 @@ void Compiler::Comp_JumpTo(Arm64Gen::ARM64Reg addr, bool switchThumb, bool resto } else { - BitSet16 hiRegsLoaded(RegCache.DirtyRegs & 0xFF00); - bool previouslyDirty = CPSRDirty; + + bool cpsrDirty = CPSRDirty; SaveCPSR(); - - if (restoreCPSR) - { - if (Thumb || CurInstr.Cond() >= 0xE) - RegCache.Flush(); - else - { - // the ugly way... 
- // we only save them, to load and save them again - for (int reg : hiRegsLoaded) - SaveReg(reg, RegCache.Mapping[reg]); - } - } + SaveCycles(); + PushRegs(restoreCPSR); if (switchThumb) MOV(W1, addr); @@ -319,16 +300,12 @@ void Compiler::Comp_JumpTo(Arm64Gen::ARM64Reg addr, bool switchThumb, bool resto QuickCallFunction(X3, jumpToTrampoline<ARMv5>); else QuickCallFunction(X3, jumpToTrampoline<ARMv4>); - - if (!Thumb && restoreCPSR && CurInstr.Cond() < 0xE) - { - for (int reg : hiRegsLoaded) - LoadReg(reg, RegCache.Mapping[reg]); - } - if (previouslyDirty) - LoadCPSR(); - CPSRDirty = previouslyDirty; + PopRegs(restoreCPSR); + LoadCycles(); + LoadCPSR(); + if (CurInstr.Cond() < 0xE) + CPSRDirty = cpsrDirty; } } @@ -368,21 +345,13 @@ void Compiler::T_Comp_BCOND() s32 offset = (s32)(CurInstr.Instr << 24) >> 23; Comp_JumpTo(R15 + offset + 1, true); - Comp_BranchSpecialBehaviour(); + Comp_BranchSpecialBehaviour(true); FixupBranch skipFailed = B(); SetJumpTarget(skipExecute); Comp_AddCycles_C(true); - if (CurInstr.BranchFlags & branch_FollowCondTaken) - { - SaveCPSR(false); - RegCache.PrepareExit(); - - ADD(W0, RCycles, ConstantCycles); - ABI_PopRegisters(SavedRegs); - RET(); - } + Comp_BranchSpecialBehaviour(false); SetJumpTarget(skipFailed); } diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.cpp b/src/ARMJIT_A64/ARMJIT_Compiler.cpp index a67f357..42435ed 100644 --- a/src/ARMJIT_A64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_A64/ARMJIT_Compiler.cpp @@ -1,9 +1,3 @@ -#include "ARMJIT_Compiler.h" - -#include "../ARMInterpreter.h" - -#include "../ARMJIT_Internal.h" - #ifdef __SWITCH__ #include "../switch/compat_switch.h" @@ -13,10 +7,17 @@ extern char __start__; #include <unistd.h> #endif +#include "ARMJIT_Compiler.h" + +#include "../ARMJIT_Internal.h" +#include "../ARMInterpreter.h" +#include "../Config.h" + #include <malloc.h> using namespace Arm64Gen; +extern "C" void ARM_Ret(); namespace ARMJIT { @@ -28,7 +29,10 @@ namespace ARMJIT like x64. At one hand you can translate a lot of instructions directly. But at the same time, there are a ton of exceptions, like for example ADD and SUB can't have a RORed second operand on ARMv8. - */ + + While writing a JIT when an instruction is recompiled into multiple ones + not to write back until you've read all the other operands! +*/ template <> const ARM64Reg RegisterCache<Compiler, ARM64Reg>::NativeRegAllocOrder[] = @@ -46,6 +50,132 @@ void Compiler::MovePC() ADD(MapReg(15), MapReg(15), Thumb ? 
2 : 4); } +void Compiler::A_Comp_MRS() +{ + Comp_AddCycles_C(); + + ARM64Reg rd = MapReg(CurInstr.A_Reg(12)); + + if (CurInstr.Instr & (1 << 22)) + { + ANDI2R(W5, RCPSR, 0x1F); + MOVI2R(W3, 0); + MOVI2R(W1, 15 - 8); + BL(ReadBanked); + MOV(rd, W3); + } + else + MOV(rd, RCPSR); +} + +void Compiler::A_Comp_MSR() +{ + Comp_AddCycles_C(); + + ARM64Reg val; + if (CurInstr.Instr & (1 << 25)) + { + val = W0; + MOVI2R(val, ROR((CurInstr.Instr & 0xFF), ((CurInstr.Instr >> 7) & 0x1E))); + } + else + { + val = MapReg(CurInstr.A_Reg(0)); + } + + u32 mask = 0; + if (CurInstr.Instr & (1<<16)) mask |= 0x000000FF; + if (CurInstr.Instr & (1<<17)) mask |= 0x0000FF00; + if (CurInstr.Instr & (1<<18)) mask |= 0x00FF0000; + if (CurInstr.Instr & (1<<19)) mask |= 0xFF000000; + + if (CurInstr.Instr & (1 << 22)) + { + ANDI2R(W5, RCPSR, 0x1F); + MOVI2R(W3, 0); + MOVI2R(W1, 15 - 8); + BL(ReadBanked); + + MOVI2R(W1, mask); + MOVI2R(W2, mask & 0xFFFFFF00); + ANDI2R(W5, RCPSR, 0x1F); + CMP(W5, 0x10); + CSEL(W1, W2, W1, CC_EQ); + + BIC(W3, W3, W1); + AND(W0, val, W1); + ORR(W3, W3, W0); + + MOVI2R(W1, 15 - 8); + + BL(WriteBanked); + } + else + { + mask &= 0xFFFFFFDF; + CPSRDirty = true; + + if ((mask & 0xFF) == 0) + { + ANDI2R(RCPSR, RCPSR, ~mask); + ANDI2R(W0, val, mask); + ORR(RCPSR, RCPSR, W0); + } + else + { + MOVI2R(W2, mask); + MOVI2R(W3, mask & 0xFFFFFF00); + ANDI2R(W1, RCPSR, 0x1F); + // W1 = first argument + CMP(W1, 0x10); + CSEL(W2, W3, W2, CC_EQ); + + BIC(RCPSR, RCPSR, W2); + AND(W0, val, W2); + ORR(RCPSR, RCPSR, W0); + + MOV(W2, RCPSR); + MOV(X0, RCPU); + + PushRegs(true); + + QuickCallFunction(X3, (void*)&ARM::UpdateMode); + + PopRegs(true); + } + } +} + +void Compiler::PushRegs(bool saveHiRegs) +{ + if (saveHiRegs) + { + if (Thumb || CurInstr.Cond() == 0xE) + { + BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); + for (int reg : hiRegsLoaded) + RegCache.UnloadRegister(reg); + } + else + { + BitSet16 hiRegsDirty(RegCache.LoadedRegs & 0x7F00); + for (int reg : hiRegsDirty) + SaveReg(reg, RegCache.Mapping[reg]); + } + } +} + +void Compiler::PopRegs(bool saveHiRegs) +{ + if (saveHiRegs) + { + BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00); + + for (int reg : hiRegsLoaded) + LoadReg(reg, RegCache.Mapping[reg]); + } +} + Compiler::Compiler() { #ifdef __SWITCH__ @@ -80,8 +210,7 @@ Compiler::Compiler() assert(succeded); SetCodeBase((u8*)JitRWStart, (u8*)JitRXStart); - JitMemUseableSize = JitMemSize; - Reset(); + JitMemMainSize = JitMemSize; #else u64 pageSize = sysconf(_SC_PAGE_SIZE); u8* pageAligned = (u8*)(((u64)JitMem & ~(pageSize - 1)) + pageSize); @@ -90,31 +219,8 @@ Compiler::Compiler() SetCodeBase(pageAligned, pageAligned); JitMemUseableSize = alignedSize; - Reset(); #endif - - for (int i = 0; i < 3; i++) - { - for (int j = 0; j < 2; j++) - { - MemFunc9[i][j] = Gen_MemoryRoutine9(8 << i, j); - } - } - MemFunc7[0][0] = (void*)NDS::ARM7Read8; - MemFunc7[1][0] = (void*)NDS::ARM7Read16; - MemFunc7[2][0] = (void*)NDS::ARM7Read32; - MemFunc7[0][1] = (void*)NDS::ARM7Write8; - MemFunc7[1][1] = (void*)NDS::ARM7Write16; - MemFunc7[2][1] = (void*)NDS::ARM7Write32; - - for (int i = 0; i < 2; i++) - { - for (int j = 0; j < 2; j++) - { - MemFuncsSeq9[i][j] = Gen_MemoryRoutine9Seq(i, j); - MemFuncsSeq7[i][j] = Gen_MemoryRoutine7Seq(i, j); - } - } + SetCodePtr(0); for (int i = 0; i < 3; i++) { @@ -123,26 +229,26 @@ Compiler::Compiler() } /* - W0 - mode + W5 - mode W1 - reg num W3 - in/out value of reg */ { ReadBanked = GetRXPtr(); - ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2)); - CMP(W0, 0x11); + ADD(X2, RCPU, X1, 
ArithOption(X2, ST_LSL, 2)); + CMP(W5, 0x11); FixupBranch fiq = B(CC_EQ); SUBS(W1, W1, 13 - 8); - ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2)); + ADD(X2, RCPU, X1, ArithOption(X2, ST_LSL, 2)); FixupBranch notEverything = B(CC_LT); - CMP(W0, 0x12); + CMP(W5, 0x12); FixupBranch irq = B(CC_EQ); - CMP(W0, 0x13); + CMP(W5, 0x13); FixupBranch svc = B(CC_EQ); - CMP(W0, 0x17); + CMP(W5, 0x17); FixupBranch abt = B(CC_EQ); - CMP(W0, 0x1B); + CMP(W5, 0x1B); FixupBranch und = B(CC_EQ); SetJumpTarget(notEverything); RET(); @@ -166,19 +272,19 @@ Compiler::Compiler() { WriteBanked = GetRXPtr(); - ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2)); - CMP(W0, 0x11); + ADD(X2, RCPU, X1, ArithOption(X2, ST_LSL, 2)); + CMP(W5, 0x11); FixupBranch fiq = B(CC_EQ); SUBS(W1, W1, 13 - 8); - ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2)); + ADD(X2, RCPU, X1, ArithOption(X2, ST_LSL, 2)); FixupBranch notEverything = B(CC_LT); - CMP(W0, 0x12); + CMP(W5, 0x12); FixupBranch irq = B(CC_EQ); - CMP(W0, 0x13); + CMP(W5, 0x13); FixupBranch svc = B(CC_EQ); - CMP(W0, 0x17); + CMP(W5, 0x17); FixupBranch abt = B(CC_EQ); - CMP(W0, 0x1B); + CMP(W5, 0x1B); FixupBranch und = B(CC_EQ); SetJumpTarget(notEverything); MOVI2R(W4, 0); @@ -206,9 +312,71 @@ Compiler::Compiler() RET(); } - //FlushIcache(); + for (int num = 0; num < 2; num++) + { + for (int size = 0; size < 3; size++) + { + for (int reg = 0; reg < 8; reg++) + { + ARM64Reg rdMapped = (ARM64Reg)(W19 + reg); + PatchedStoreFuncs[num][size][reg] = GetRXPtr(); + if (num == 0) + { + MOV(X1, RCPU); + MOV(W2, rdMapped); + } + else + { + MOV(W1, rdMapped); + } + ABI_PushRegisters({30}); + switch ((8 << size) | num) + { + case 32: QuickCallFunction(X3, SlowWrite9<u32>); break; + case 33: QuickCallFunction(X3, SlowWrite7<u32>); break; + case 16: QuickCallFunction(X3, SlowWrite9<u16>); break; + case 17: QuickCallFunction(X3, SlowWrite7<u16>); break; + case 8: QuickCallFunction(X3, SlowWrite9<u8>); break; + case 9: QuickCallFunction(X3, SlowWrite7<u8>); break; + } + ABI_PopRegisters({30}); + RET(); + + for (int signextend = 0; signextend < 2; signextend++) + { + PatchedLoadFuncs[num][size][signextend][reg] = GetRXPtr(); + if (num == 0) + MOV(X1, RCPU); + ABI_PushRegisters({30}); + switch ((8 << size) | num) + { + case 32: QuickCallFunction(X3, SlowRead9<u32>); break; + case 33: QuickCallFunction(X3, SlowRead7<u32>); break; + case 16: QuickCallFunction(X3, SlowRead9<u16>); break; + case 17: QuickCallFunction(X3, SlowRead7<u16>); break; + case 8: QuickCallFunction(X3, SlowRead9<u8>); break; + case 9: QuickCallFunction(X3, SlowRead7<u8>); break; + } + ABI_PopRegisters({30}); + if (size == 32) + MOV(rdMapped, W0); + else if (signextend) + SBFX(rdMapped, W0, 0, 8 << size); + else + UBFX(rdMapped, W0, 0, 8 << size); + RET(); + } + } + } + } + + FlushIcache(); + + JitMemSecondarySize = 1024*1024*4; + + JitMemMainSize -= GetCodeOffset(); + JitMemMainSize -= JitMemSecondarySize; - JitMemUseableSize -= GetCodeOffset(); SetCodeBase((u8*)GetRWPtr(), (u8*)GetRXPtr()); } @@ -227,6 +395,16 @@ Compiler::~Compiler() #endif } +void Compiler::LoadCycles() +{ + LDR(INDEX_UNSIGNED, RCycles, RCPU, offsetof(ARM, Cycles)); +} + +void Compiler::SaveCycles() +{ + STR(INDEX_UNSIGNED, RCycles, RCPU, offsetof(ARM, Cycles)); +} + void Compiler::LoadReg(int reg, ARM64Reg nativeReg) { if (reg == 15) @@ -325,7 +503,7 @@ const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] = // CMN F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), // Mul - F(Mul), F(Mul), 
F(Mul_Long), F(Mul_Long), F(Mul_Long), F(Mul_Long), NULL, NULL, NULL, NULL, NULL, + F(Mul), F(Mul), F(Mul_Long), F(Mul_Long), F(Mul_Long), F(Mul_Long), F(Mul_Short), F(Mul_Short), F(Mul_Short), F(Mul_Short), F(Mul_Short), // ARMv5 exclusives F(Clz), NULL, NULL, NULL, NULL, @@ -356,7 +534,7 @@ const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] = // Branch F(BranchImm), F(BranchImm), F(BranchImm), F(BranchXchangeReg), F(BranchXchangeReg), // Special - NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, F(MSR), F(MSR), F(MRS), NULL, NULL, NULL, &Compiler::Nop }; #undef F @@ -404,29 +582,34 @@ bool Compiler::CanCompile(bool thumb, u16 kind) return (thumb ? T_Comp[kind] : A_Comp[kind]) != NULL; } -void Compiler::Comp_BranchSpecialBehaviour() +void Compiler::Comp_BranchSpecialBehaviour(bool taken) { - if (CurInstr.BranchFlags & branch_IdleBranch) + if (taken && CurInstr.BranchFlags & branch_IdleBranch) { MOVI2R(W0, 1); STRB(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, IdleLoop)); } - if (CurInstr.BranchFlags & branch_FollowCondNotTaken) + if ((CurInstr.BranchFlags & branch_FollowCondNotTaken && taken) + || (CurInstr.BranchFlags & branch_FollowCondTaken && !taken)) { - SaveCPSR(false); RegCache.PrepareExit(); - ADD(W0, RCycles, ConstantCycles); - ABI_PopRegisters(SavedRegs); - RET(); + + SUB(RCycles, RCycles, ConstantCycles); + QuickTailCall(X0, ARM_Ret); } } JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) { - if (JitMemUseableSize - GetCodeOffset() < 1024 * 16) + if (JitMemMainSize - GetCodeOffset() < 1024 * 16) + { + printf("JIT near memory full, resetting...\n"); + ResetBlockCache(); + } + if ((JitMemMainSize + JitMemSecondarySize) - OtherCodeRegion < 1024 * 8) { - printf("JIT memory full, resetting...\n"); + printf("JIT far memory full, resetting...\n"); ResetBlockCache(); } @@ -437,21 +620,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] CurCPU = cpu; ConstantCycles = 0; RegCache = RegisterCache<Compiler, ARM64Reg>(this, instrs, instrsCount, true); - - //printf("compiling block at %x\n", R15 - (Thumb ? 
2 : 4)); - const u32 ALL_CALLEE_SAVED = 0x7FF80000; - - SavedRegs = BitSet32((RegCache.GetPushRegs() | BitSet32(0x78000000)) & BitSet32(ALL_CALLEE_SAVED)); - - //if (Num == 1) - { - ABI_PushRegisters(SavedRegs); - - MOVP2R(RCPU, CurCPU); - MOVI2R(RCycles, 0); - - LoadCPSR(); - } + CPSRDirty = false; for (int i = 0; i < instrsCount; i++) { @@ -486,6 +655,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] if (comp == NULL) { + SaveCycles(); SaveCPSR(); RegCache.Flush(); } @@ -535,25 +705,18 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] (this->*comp)(); } - Comp_BranchSpecialBehaviour(); + Comp_BranchSpecialBehaviour(true); if (cond < 0xE) { - if (IrregularCycles) + if (IrregularCycles || (CurInstr.BranchFlags & branch_FollowCondTaken)) { FixupBranch skipNop = B(); SetJumpTarget(skipExecute); Comp_AddCycles_C(); - if (CurInstr.BranchFlags & branch_FollowCondTaken) - { - SaveCPSR(false); - RegCache.PrepareExit(); - ADD(W0, RCycles, ConstantCycles); - ABI_PopRegisters(SavedRegs); - RET(); - } + Comp_BranchSpecialBehaviour(false); SetJumpTarget(skipNop); } @@ -565,76 +728,74 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[] } if (comp == NULL) + { + LoadCycles(); LoadCPSR(); + } } RegCache.Flush(); - //if (Num == 1) - { - SaveCPSR(); - - ADD(W0, RCycles, ConstantCycles); - - ABI_PopRegisters(SavedRegs); - } - //else - // ADD(RCycles, RCycles, ConstantCycles); - - RET(); + SUB(RCycles, RCycles, ConstantCycles); + QuickTailCall(X0, ARM_Ret); FlushIcache(); - //printf("finished\n"); - return res; } void Compiler::Reset() { + LoadStorePatches.clear(); + SetCodePtr(0); + OtherCodeRegion = JitMemMainSize; const u32 brk_0 = 0xD4200000; - for (int i = 0; i < JitMemUseableSize / 4; i++) + for (int i = 0; i < (JitMemMainSize + JitMemSecondarySize) / 4; i++) *(((u32*)GetRWPtr()) + i) = brk_0; } -void Compiler::Comp_AddCycles_C(bool nonConst) +void Compiler::Comp_AddCycles_C(bool forceNonConstant) { s32 cycles = Num ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 1 : 3] : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles); - if (!nonConst && !CurInstr.Info.Branches()) + if (forceNonConstant) ConstantCycles += cycles; else - ADD(RCycles, RCycles, cycles); + SUB(RCycles, RCycles, cycles); } void Compiler::Comp_AddCycles_CI(u32 numI) { + IrregularCycles = true; + s32 cycles = (Num ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles)) + numI; - if (Thumb || CurInstr.Cond() >= 0xE) + if (Thumb || CurInstr.Cond() == 0xE) ConstantCycles += cycles; else - ADD(RCycles, RCycles, cycles); + SUB(RCycles, RCycles, cycles); } void Compiler::Comp_AddCycles_CI(u32 c, ARM64Reg numI, ArithOption shift) { + IrregularCycles = true; + s32 cycles = (Num ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] : ((R15 & 0x2) ? 
0 : CurInstr.CodeCycles)) + c; - ADD(RCycles, RCycles, numI, shift); + SUB(RCycles, RCycles, cycles); if (Thumb || CurInstr.Cond() >= 0xE) - ConstantCycles += c; + ConstantCycles += cycles; else - ADD(RCycles, RCycles, cycles); + SUB(RCycles, RCycles, cycles); } void Compiler::Comp_AddCycles_CDI() @@ -671,7 +832,7 @@ void Compiler::Comp_AddCycles_CDI() } if (!Thumb && CurInstr.Cond() < 0xE) - ADD(RCycles, RCycles, cycles); + SUB(RCycles, RCycles, cycles); else ConstantCycles += cycles; } @@ -715,7 +876,7 @@ void Compiler::Comp_AddCycles_CD() } if ((!Thumb && CurInstr.Cond() < 0xE) && IrregularCycles) - ADD(RCycles, RCycles, cycles); + SUB(RCycles, RCycles, cycles); else ConstantCycles += cycles; } diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.h b/src/ARMJIT_A64/ARMJIT_Compiler.h index 5c9ef41..e4ffc63 100644 --- a/src/ARMJIT_A64/ARMJIT_Compiler.h +++ b/src/ARMJIT_A64/ARMJIT_Compiler.h @@ -9,6 +9,8 @@ #include "../ARMJIT_Internal.h" #include "../ARMJIT_RegisterCache.h" +#include <unordered_map> + namespace ARMJIT { @@ -64,7 +66,14 @@ struct Op2 }; }; -class Compiler : Arm64Gen::ARM64XEmitter +struct LoadStorePatch +{ + void* PatchFunc; + s32 PatchOffset; + u32 PatchSize; +}; + +class Compiler : public Arm64Gen::ARM64XEmitter { public: typedef void (Compiler::*CompileFunc)(); @@ -72,6 +81,9 @@ public: Compiler(); ~Compiler(); + void PushRegs(bool saveHiRegs); + void PopRegs(bool saveHiRegs); + Arm64Gen::ARM64Reg MapReg(int reg) { assert(RegCache.Mapping[reg] != Arm64Gen::INVALID_REG); @@ -89,7 +101,7 @@ public: void Reset(); - void Comp_AddCycles_C(bool forceNonConst = false); + void Comp_AddCycles_C(bool forceNonConstant = false); void Comp_AddCycles_CI(u32 numI); void Comp_AddCycles_CI(u32 c, Arm64Gen::ARM64Reg numI, Arm64Gen::ArithOption shift); void Comp_AddCycles_CD(); @@ -103,6 +115,9 @@ public: void LoadCPSR(); void SaveCPSR(bool markClean = true); + void LoadCycles(); + void SaveCycles(); + void Nop() {} void A_Comp_ALUTriOp(); @@ -111,6 +126,7 @@ public: void A_Comp_Mul(); void A_Comp_Mul_Long(); + void A_Comp_Mul_Short(); void A_Comp_Clz(); @@ -122,6 +138,8 @@ public: void A_Comp_BranchImm(); void A_Comp_BranchXchangeReg(); + void A_Comp_MRS(); + void A_Comp_MSR(); void T_Comp_ShiftImm(); void T_Comp_AddSub_(); @@ -168,7 +186,7 @@ public: void Comp_RegShiftImm(int op, int amount, bool S, Op2& op2, Arm64Gen::ARM64Reg tmp = Arm64Gen::W0); void Comp_RegShiftReg(int op, bool S, Op2& op2, Arm64Gen::ARM64Reg rs); - void Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr); + bool Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr); enum { memop_Writeback = 1 << 0, @@ -179,16 +197,33 @@ public: }; void Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags); - void* Gen_MemoryRoutine9(int size, bool store); - - void* Gen_MemoryRoutine9Seq(bool store, bool preinc); - void* Gen_MemoryRoutine7Seq(bool store, bool preinc); - // 0 = switch mode, 1 = stay arm, 2 = stay thumb void* Gen_JumpTo9(int kind); void* Gen_JumpTo7(int kind); - void Comp_BranchSpecialBehaviour(); + void Comp_BranchSpecialBehaviour(bool taken); + + JitBlockEntry AddEntryOffset(u32 offset) + { + return (JitBlockEntry)(GetRXBase() + offset); + } + + u32 SubEntryOffset(JitBlockEntry entry) + { + return (u8*)entry - GetRXBase(); + } + + bool IsJITFault(u64 pc); + s64 RewriteMemAccess(u64 pc); + + void SwapCodeRegion() + { + ptrdiff_t offset = GetCodeOffset(); + SetCodePtrUnsafe(OtherCodeRegion); + OtherCodeRegion = offset; + } + + ptrdiff_t OtherCodeRegion; bool Exit; @@ -202,22 +237,20 @@ 
public: BitSet32 SavedRegs; - u32 JitMemUseableSize; + u32 JitMemSecondarySize; + u32 JitMemMainSize; void* ReadBanked, *WriteBanked; - // [size][store] - void* MemFunc9[3][2]; - void* MemFunc7[3][2]; - - // [store][pre increment] - void* MemFuncsSeq9[2][2]; - // "[code in main ram] - void* MemFuncsSeq7[2][2]; - void* JumpToFuncs9[3]; void* JumpToFuncs7[3]; + std::unordered_map<ptrdiff_t, LoadStorePatch> LoadStorePatches; + + // [Num][Size][Sign Extend][Output register] + void* PatchedLoadFuncs[2][3][2][8]; + void* PatchedStoreFuncs[2][3][8]; + RegisterCache<Compiler, Arm64Gen::ARM64Reg> RegCache; bool CPSRDirty = false; diff --git a/src/ARMJIT_A64/ARMJIT_Linkage.s b/src/ARMJIT_A64/ARMJIT_Linkage.s new file mode 100644 index 0000000..536a478 --- /dev/null +++ b/src/ARMJIT_A64/ARMJIT_Linkage.s @@ -0,0 +1,68 @@ +#include "../ARMJIT_x64/ARMJIT_Offsets.h" + +.text + +#define RCPSR W27 +#define RCycles W28 +#define RCPU X29 + +.p2align 4,,15 + +.global ARM_Dispatch +ARM_Dispatch: + stp x19, x20, [sp, #-96]! + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp x25, x26, [sp, #48] + stp x27, x28, [sp, #64] + stp x29, x30, [sp, #80] + + mov RCPU, x0 + ldr RCycles, [RCPU, ARM_Cycles_offset] + ldr RCPSR, [RCPU, ARM_CPSR_offset] + + br x1 + +.p2align 4,,15 + +.global ARM_Ret +ARM_Ret: + str RCycles, [RCPU, ARM_Cycles_offset] + str RCPSR, [RCPU, ARM_CPSR_offset] + + ldp x29, x30, [sp, #80] + ldp x27, x28, [sp, #64] + ldp x25, x26, [sp, #48] + ldp x23, x24, [sp, #32] + ldp x21, x22, [sp, #16] + ldp x19, x20, [sp], #96 + + ret + +.p2align 4,,15 + +.global ARM_RestoreContext +ARM_RestoreContext: + mov sp, x0 + + ldp x0, x1, [sp] + ldp x2, x3, [sp, #16] + ldp x4, x5, [sp, #32] + ldp x6, x7, [sp, #48] + ldp x8, x9, [sp, #64] + ldp x10, x11, [sp, #80] + ldp x12, x13, [sp, #96] + ldp x14, x15, [sp, #112] + ldp x16, x17, [sp, #128] + ldp x18, x19, [sp, #144] + ldp x20, x21, [sp, #160] + ldp x22, x23, [sp, #176] + ldp x24, x25, [sp, #192] + ldp x26, x27, [sp, #208] + ldp x28, x29, [sp, #224] + ldr x30, [sp, #240] + + ldp x17, x18, [sp, #248] + mov sp, x17 + + br x18
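For readers following the new AArch64 linkage: ARM_Dispatch saves the callee-saved registers, loads the emulated cycle counter and CPSR into the registers the JIT keeps pinned (RCycles = W28, RCPSR = W27, with RCPU = X29 pointing at the ARM object), then branches into the compiled block; blocks finish by tail-calling ARM_Ret, which writes those two values back and restores the stack. Below is a minimal C-level model of that contract; the struct and function names are invented for illustration and are not part of the commit.

#include <cstdint>

struct CpuState { int32_t Cycles; uint32_t CPSR; };        // stand-in for the ARM object
using BlockFn = void (*)(CpuState&, int32_t&, uint32_t&);   // stand-in for JitBlockEntry

void DispatchModel(CpuState& cpu, BlockFn block)
{
    int32_t cycles = cpu.Cycles;   // ldr RCycles, [RCPU, ARM_Cycles_offset]
    uint32_t cpsr  = cpu.CPSR;     // ldr RCPSR,  [RCPU, ARM_CPSR_offset]
    block(cpu, cycles, cpsr);      // br x1; the block updates both and tail-calls ARM_Ret
    cpu.Cycles = cycles;           // ARM_Ret: str RCycles, [RCPU, ARM_Cycles_offset]
    cpu.CPSR   = cpsr;             // ARM_Ret: str RCPSR,  [RCPU, ARM_CPSR_offset]
}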
\ No newline at end of file diff --git a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp index 6cf710b..b307d0e 100644 --- a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp @@ -2,286 +2,62 @@ #include "../Config.h" +#include "../ARMJIT_Memory.h" + using namespace Arm64Gen; namespace ARMJIT { -// W0 - address -// (if store) W1 - value to store -// W2 - code cycles -void* Compiler::Gen_MemoryRoutine9(int size, bool store) +bool Compiler::IsJITFault(u64 pc) { - AlignCode16(); - void* res = GetRXPtr(); - - u32 addressMask; - switch (size) - { - case 32: addressMask = ~3; break; - case 16: addressMask = ~1; break; - case 8: addressMask = ~0; break; - } - - LDR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARMv5, DTCMBase)); - LDR(INDEX_UNSIGNED, W4, RCPU, offsetof(ARMv5, DTCMSize)); - SUB(W3, W0, W3); - CMP(W3, W4); - FixupBranch insideDTCM = B(CC_LO); - - UBFX(W4, W0, 24, 8); - CMP(W4, 0x02); - FixupBranch outsideMainRAM = B(CC_NEQ); - ANDI2R(W3, W0, addressMask & (MAIN_RAM_SIZE - 1)); - MOVP2R(X4, NDS::MainRAM); - if (!store && size == 32) - { - LDR(W3, X3, X4); - ANDI2R(W0, W0, 3); - LSL(W0, W0, 3); - RORV(W0, W3, W0); - } - else if (store) - STRGeneric(size, W1, X3, X4); - else - LDRGeneric(size, false, W0, X3, X4); - RET(); - - SetJumpTarget(outsideMainRAM); - - LDR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARMv5, ITCMSize)); - CMP(W0, W3); - FixupBranch insideITCM = B(CC_LO); - - if (store) - { - if (size > 8) - ANDI2R(W0, W0, addressMask); - - switch (size) - { - case 32: QuickTailCall(X4, NDS::ARM9Write32); break; - case 16: QuickTailCall(X4, NDS::ARM9Write16); break; - case 8: QuickTailCall(X4, NDS::ARM9Write8); break; - } - } - else - { - if (size == 32) - ABI_PushRegisters({0, 30}); - if (size > 8) - ANDI2R(W0, W0, addressMask); - - switch (size) - { - case 32: QuickCallFunction(X4, NDS::ARM9Read32); break; - case 16: QuickTailCall (X4, NDS::ARM9Read16); break; - case 8: QuickTailCall (X4, NDS::ARM9Read8 ); break; - } - if (size == 32) - { - ABI_PopRegisters({1, 30}); - ANDI2R(W1, W1, 3); - LSL(W1, W1, 3); - RORV(W0, W0, W1); - RET(); - } - } - - SetJumpTarget(insideDTCM); - ANDI2R(W3, W3, 0x3FFF & addressMask); - ADDI2R(W3, W3, offsetof(ARMv5, DTCM), W4); - if (!store && size == 32) - { - ANDI2R(W4, W0, 3); - LDR(W0, RCPU, W3); - LSL(W4, W4, 3); - RORV(W0, W0, W4); - } - else if (store) - STRGeneric(size, W1, RCPU, W3); - else - LDRGeneric(size, false, W0, RCPU, W3); - - RET(); - - SetJumpTarget(insideITCM); - ANDI2R(W3, W0, 0x7FFF & addressMask); - if (store) - { - ADDI2R(W0, W3, ExeMemRegionOffsets[exeMem_ITCM], W4); - LSR(W5, W0, 9); - MOVP2R(X4, CodeRanges); - ADD(X4, X4, X5, ArithOption(X5, ST_LSL, 4)); - static_assert(sizeof(AddressRange) == 16); - LDRH(INDEX_UNSIGNED, W4, X4, offsetof(AddressRange, Blocks.Length)); - FixupBranch null = CBZ(W4); - ABI_PushRegisters({1, 3, 30}); - QuickCallFunction(X4, InvalidateByAddr); - ABI_PopRegisters({1, 3, 30}); - SetJumpTarget(null); - } - ADDI2R(W3, W3, offsetof(ARMv5, ITCM), W4); - if (!store && size == 32) - { - ANDI2R(W4, W0, 3); - LDR(W0, RCPU, W3); - LSL(W4, W4, 3); - RORV(W0, W0, W4); - } - else if (store) - STRGeneric(size, W1, RCPU, W3); - else - LDRGeneric(size, false, W0, RCPU, W3); - RET(); - - return res; + return pc >= (u64)GetRXBase() && pc - (u64)GetRXBase() < (JitMemMainSize + JitMemSecondarySize); } -/* - W0 - base address - X1 - stack space - W2 - values count -*/ -void* Compiler::Gen_MemoryRoutine9Seq(bool store, bool preinc) +s64 Compiler::RewriteMemAccess(u64 pc) { - 
AlignCode16(); - void* res = GetRXPtr(); - - void* loopStart = GetRXPtr(); - SUB(W2, W2, 1); - - if (preinc) - ADD(W0, W0, 4); + ptrdiff_t pcOffset = pc - (u64)GetRXBase(); - LDR(INDEX_UNSIGNED, W4, RCPU, offsetof(ARMv5, DTCMBase)); - LDR(INDEX_UNSIGNED, W5, RCPU, offsetof(ARMv5, DTCMSize)); - SUB(W4, W0, W4); - CMP(W4, W5); - FixupBranch insideDTCM = B(CC_LO); + auto it = LoadStorePatches.find(pcOffset); - LDR(INDEX_UNSIGNED, W4, RCPU, offsetof(ARMv5, ITCMSize)); - CMP(W0, W4); - FixupBranch insideITCM = B(CC_LO); - - ABI_PushRegisters({0, 1, 2, 30}); // TODO: move SP only once - if (store) + if (it != LoadStorePatches.end()) { - LDR(X1, X1, ArithOption(X2, true)); - QuickCallFunction(X4, NDS::ARM9Write32); + LoadStorePatch patch = it->second; - ABI_PopRegisters({0, 1, 2, 30}); - } - else - { - QuickCallFunction(X4, NDS::ARM9Read32); - MOV(W4, W0); + ptrdiff_t curCodeOffset = GetCodeOffset(); - ABI_PopRegisters({0, 1, 2, 30}); + SetCodePtrUnsafe(pcOffset + patch.PatchOffset); - STR(X4, X1, ArithOption(X2, true)); - } + BL(patch.PatchFunc); - if (!preinc) - ADD(W0, W0, 4); - CBNZ(W2, loopStart); - RET(); + for (int i = 0; i < patch.PatchSize / 4 - 1; i++) + HINT(HINT_NOP); - SetJumpTarget(insideDTCM); + FlushIcacheSection((u8*)pc + patch.PatchOffset, (u8*)GetRXPtr()); - ANDI2R(W4, W4, ~3 & 0x3FFF); - ADDI2R(X4, X4, offsetof(ARMv5, DTCM)); - if (store) - { - LDR(X5, X1, ArithOption(X2, true)); - STR(W5, RCPU, X4); - } - else - { - LDR(W5, RCPU, X4); - STR(X5, X1, ArithOption(X2, true)); - } + SetCodePtrUnsafe(curCodeOffset); - if (!preinc) - ADD(W0, W0, 4); - CBNZ(W2, loopStart); - RET(); - - SetJumpTarget(insideITCM); - - ANDI2R(W4, W0, ~3 & 0x7FFF); - - ADDI2R(W6, W4, offsetof(ARMv5, ITCM), W5); - if (store) - { - LDR(X5, X1, ArithOption(X2, true)); - STR(W5, RCPU, X6); - } - else - { - LDR(W5, RCPU, X6); - STR(X5, X1, ArithOption(X2, true)); - } + LoadStorePatches.erase(it); - if (store) - { - ADDI2R(W4, W4, ExeMemRegionOffsets[exeMem_ITCM], W5); - LSR(W6, W4, 9); - MOVP2R(X5, CodeRanges); - ADD(X5, X5, X6, ArithOption(X6, ST_LSL, 4)); - static_assert(sizeof(AddressRange) == 16); - LDRH(INDEX_UNSIGNED, W5, X5, offsetof(AddressRange, Blocks.Length)); - FixupBranch null = CBZ(W5); - ABI_PushRegisters({0, 1, 2, 4, 30}); - MOV(W0, W4); - QuickCallFunction(X5, InvalidateByAddr); - ABI_PopRegisters({0, 1, 2, 4, 30}); - SetJumpTarget(null); + return patch.PatchOffset; } - - if (!preinc) - ADD(W0, W0, 4); - CBNZ(W2, loopStart); - RET(); - return res; + printf("this is a JIT bug! 
%08x\n", __builtin_bswap32(*(u32*)pc)); + assert(false); } -void* Compiler::Gen_MemoryRoutine7Seq(bool store, bool preinc) +bool Compiler::Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr) { - AlignCode16(); - void* res = GetRXPtr(); + u32 localAddr = LocaliseCodeAddress(Num, addr); - void* loopStart = GetRXPtr(); - SUB(W2, W2, 1); - - if (preinc) - ADD(W0, W0, 4); - - ABI_PushRegisters({0, 1, 2, 30}); - if (store) + int invalidLiteralIdx = InvalidLiterals.Find(localAddr); + if (invalidLiteralIdx != -1) { - LDR(X1, X1, ArithOption(X2, true)); - QuickCallFunction(X4, NDS::ARM7Write32); - ABI_PopRegisters({0, 1, 2, 30}); + InvalidLiterals.Remove(invalidLiteralIdx); + return false; } - else - { - QuickCallFunction(X4, NDS::ARM7Read32); - MOV(W4, W0); - ABI_PopRegisters({0, 1, 2, 30}); - STR(X4, X1, ArithOption(X2, true)); - } - - if (!preinc) - ADD(W0, W0, 4); - CBNZ(W2, loopStart); - RET(); - return res; -} + Comp_AddCycles_CDI(); -void Compiler::Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr) -{ u32 val; // make sure arm7 bios is accessible u32 tmpR15 = CurCPU->R[15]; @@ -309,6 +85,8 @@ void Compiler::Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr) if (Thumb || CurInstr.Cond() == 0xE) RegCache.PutLiteral(rd, val); + + return true; } void Compiler::Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags) @@ -318,163 +96,209 @@ void Compiler::Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags) addressMask = ~3; if (size == 16) addressMask = ~1; + + if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && offset.IsImm && !(flags & (memop_Post|memop_Store|memop_Writeback))) + { + u32 addr = R15 + offset.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + + if (Comp_MemLoadLiteral(size, flags & memop_SignExtend, rd, addr)) + return; + } if (flags & memop_Store) Comp_AddCycles_CD(); else Comp_AddCycles_CDI(); - if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && offset.IsImm && !(flags & (memop_Post|memop_Store|memop_Writeback))) - { - u32 addr = R15 + offset.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); - u32 translatedAddr = Num == 0 ? TranslateAddr<0>(addr) : TranslateAddr<1>(addr); + ARM64Reg rdMapped = MapReg(rd); + ARM64Reg rnMapped = MapReg(rn); - if (!(CodeRanges[translatedAddr / 512].InvalidLiterals & (1 << ((translatedAddr & 0x1FF) / 16)))) - { - Comp_MemLoadLiteral(size, flags & memop_SignExtend, rd, addr); - return; - } + if (Thumb && rn == 15) + { + ANDI2R(W3, rnMapped, ~2); + rnMapped = W3; } + ARM64Reg finalAddr = W0; + if (flags & memop_Post) { - ARM64Reg rdMapped = MapReg(rd); - ARM64Reg rnMapped = MapReg(rn); - - bool inlinePreparation = Num == 1; - u32 constLocalROR32 = 4; + finalAddr = rnMapped; + MOV(W0, rnMapped); + } - void* memFunc = Num == 0 - ? MemFunc9[size >> 4][!!(flags & memop_Store)] - : MemFunc7[size >> 4][!!((flags & memop_Store))]; + bool addrIsStatic = Config::JIT_LiteralOptimisations + && RegCache.IsLiteral(rn) && offset.IsImm && !(flags & (memop_Writeback|memop_Post)); + u32 staticAddress; + if (addrIsStatic) + staticAddress = RegCache.LiteralValues[rn] + offset.Imm * ((flags & memop_SubtractOffset) ? 
-1 : 1); - if (Config::JIT_LiteralOptimisations && (rd != 15 || (flags & memop_Store)) && offset.IsImm && RegCache.IsLiteral(rn)) + if (!offset.IsImm) + Comp_RegShiftImm(offset.Reg.ShiftType, offset.Reg.ShiftAmount, false, offset, W2); + // offset might has become an immediate + if (offset.IsImm) + { + if (offset.Imm) + { + if (flags & memop_SubtractOffset) + SUB(finalAddr, rnMapped, offset.Imm); + else + ADD(finalAddr, rnMapped, offset.Imm); + } + else if (finalAddr != rnMapped) + MOV(finalAddr, rnMapped); + } + else + { + if (offset.Reg.ShiftType == ST_ROR) { - u32 addr = RegCache.LiteralValues[rn] + offset.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + ROR_(W0, offset.Reg.Rm, offset.Reg.ShiftAmount); + offset = Op2(W0); + } - NDS::MemRegion region; - region.Mem = NULL; - if (Num == 0) - { - ARMv5* cpu5 = (ARMv5*)CurCPU; + if (flags & memop_SubtractOffset) + SUB(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption()); + else + ADD(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption()); + } - // stupid dtcm... - if (addr >= cpu5->DTCMBase && addr < (cpu5->DTCMBase + cpu5->DTCMSize)) - { - region.Mem = cpu5->DTCM; - region.Mask = 0x3FFF; - } - else - { - NDS::ARM9GetMemRegion(addr, flags & memop_Store, ®ion); - } - } - else - NDS::ARM7GetMemRegion(addr, flags & memop_Store, ®ion); + if (!(flags & memop_Post) && (flags & memop_Writeback)) + MOV(rnMapped, W0); - if (region.Mem != NULL) - { - void* ptr = ®ion.Mem[addr & addressMask & region.Mask]; + u32 expectedTarget = Num == 0 + ? ARMJIT_Memory::ClassifyAddress9(addrIsStatic ? staticAddress : CurInstr.DataRegion) + : ARMJIT_Memory::ClassifyAddress7(addrIsStatic ? staticAddress : CurInstr.DataRegion); - MOVP2R(X0, ptr); - if (flags & memop_Store) - STRGeneric(size, INDEX_UNSIGNED, rdMapped, X0, 0); - else - { - LDRGeneric(size, flags & memop_SignExtend, INDEX_UNSIGNED, rdMapped, X0, 0); - if (size == 32 && addr & ~0x3) - ROR_(rdMapped, rdMapped, (addr & 0x3) << 3); - } - return; - } + if (Config::JIT_FastMemory && ((!Thumb && CurInstr.Cond() != 0xE) || ARMJIT_Memory::IsMappable(expectedTarget))) + { + ptrdiff_t memopStart = GetCodeOffset(); + LoadStorePatch patch; - void* specialFunc = GetFuncForAddr(CurCPU, addr, flags & memop_Store, size); - if (specialFunc) - { - memFunc = specialFunc; - inlinePreparation = true; - constLocalROR32 = addr & 0x3; - } - } + patch.PatchFunc = flags & memop_Store + ? PatchedStoreFuncs[Num][__builtin_ctz(size) - 3][rdMapped - W19] + : PatchedLoadFuncs[Num][__builtin_ctz(size) - 3][!!(flags & memop_SignExtend)][rdMapped - W19]; + assert(rdMapped - W19 >= 0 && rdMapped - W19 < 8); - ARM64Reg finalAddr = W0; - if (flags & memop_Post) - { - finalAddr = rnMapped; - MOV(W0, rnMapped); - } + MOVP2R(X7, Num == 0 ? ARMJIT_Memory::FastMem9Start : ARMJIT_Memory::FastMem7Start); + // take a chance at fastmem + if (size > 8) + ANDI2R(W1, W0, addressMask); + + ptrdiff_t loadStorePosition = GetCodeOffset(); if (flags & memop_Store) - MOV(W1, rdMapped); - - if (!offset.IsImm) - Comp_RegShiftImm(offset.Reg.ShiftType, offset.Reg.ShiftAmount, false, offset, W2); - // offset might become an immediate - if (offset.IsImm) { - if (flags & memop_SubtractOffset) - SUB(finalAddr, rnMapped, offset.Imm); - else - ADD(finalAddr, rnMapped, offset.Imm); + STRGeneric(size, rdMapped, size > 8 ? X1 : X0, X7); } else { - if (offset.Reg.ShiftType == ST_ROR) + LDRGeneric(size, flags & memop_SignExtend, rdMapped, size > 8 ? 
X1 : X0, X7); + if (size == 32) { - ROR_(W0, offset.Reg.Rm, offset.Reg.ShiftAmount); - offset = Op2(W0); + UBFIZ(W0, W0, 3, 2); + RORV(rdMapped, rdMapped, W0); } - - if (flags & memop_SubtractOffset) - SUB(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption()); - else - ADD(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption()); } - if (!(flags & memop_Post) && (flags & memop_Writeback)) - MOV(rnMapped, W0); + patch.PatchOffset = memopStart - loadStorePosition; + patch.PatchSize = GetCodeOffset() - memopStart; + LoadStorePatches[loadStorePosition] = patch; + } + else + { + void* func = NULL; + if (addrIsStatic) + func = ARMJIT_Memory::GetFuncForAddr(CurCPU, staticAddress, flags & memop_Store, size); - if (inlinePreparation) + if (func) { - if (size == 32 && !(flags & memop_Store) && constLocalROR32 == 4) - ANDI2R(rdMapped, W0, 3); - if (size > 8) - ANDI2R(W0, W0, addressMask); + if (flags & memop_Store) + MOV(W1, rdMapped); + QuickCallFunction(X2, (void (*)())func); + + if (!(flags & memop_Store)) + { + if (size == 32) + { + if (staticAddress & 0x3) + ROR_(rdMapped, W0, (staticAddress & 0x3) << 3); + else + MOV(rdMapped, W0); + } + else + { + if (flags & memop_SignExtend) + SBFX(rdMapped, W0, 0, size); + else + UBFX(rdMapped, W0, 0, size); + } + } } - QuickCallFunction(X2, memFunc); - if (!(flags & memop_Store)) + else { - if (inlinePreparation && !(flags & memop_Store) && size == 32) + if (Num == 0) { - if (constLocalROR32 == 4) + MOV(X1, RCPU); + if (flags & memop_Store) { - LSL(rdMapped, rdMapped, 3); - RORV(rdMapped, W0, rdMapped); + MOV(W2, rdMapped); + switch (size) + { + case 32: QuickCallFunction(X3, SlowWrite9<u32>); break; + case 16: QuickCallFunction(X3, SlowWrite9<u16>); break; + case 8: QuickCallFunction(X3, SlowWrite9<u8>); break; + } } - else if (constLocalROR32 > 0) - ROR_(rdMapped, W0, constLocalROR32 << 3); else - MOV(rdMapped, W0); + { + switch (size) + { + case 32: QuickCallFunction(X3, SlowRead9<u32>); break; + case 16: QuickCallFunction(X3, SlowRead9<u16>); break; + case 8: QuickCallFunction(X3, SlowRead9<u8>); break; + } + } } - else if (flags & memop_SignExtend) + else { - if (size == 16) - SXTH(rdMapped, W0); - else if (size == 8) - SXTB(rdMapped, W0); + if (flags & memop_Store) + { + MOV(W1, rdMapped); + switch (size) + { + case 32: QuickCallFunction(X3, SlowWrite7<u32>); break; + case 16: QuickCallFunction(X3, SlowWrite7<u16>); break; + case 8: QuickCallFunction(X3, SlowWrite7<u8>); break; + } + } else - assert("What's wrong with you?"); + { + switch (size) + { + case 32: QuickCallFunction(X3, SlowRead7<u32>); break; + case 16: QuickCallFunction(X3, SlowRead7<u16>); break; + case 8: QuickCallFunction(X3, SlowRead7<u8>); break; + } + } } - else - MOV(rdMapped, W0); - - if (CurInstr.Info.Branches()) + + if (!(flags & memop_Store)) { - if (size < 32) - printf("LDR size < 32 branching?\n"); - Comp_JumpTo(rdMapped, Num == 0, false); + if (size == 32) + MOV(rdMapped, W0); + else if (flags & memop_SignExtend) + SBFX(rdMapped, W0, 0, size); + else + UBFX(rdMapped, W0, 0, size); } } } + + if (CurInstr.Info.Branches()) + { + if (size < 32) + printf("LDR size < 32 branching?\n"); + Comp_JumpTo(rdMapped, Num == 0, false); + } } void Compiler::A_Comp_MemWB() @@ -589,19 +413,11 @@ void Compiler::T_Comp_MemImmHalf() void Compiler::T_Comp_LoadPCRel() { - u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2); + u32 offset = ((CurInstr.Instr & 0xFF) << 2); + u32 addr = (R15 & ~0x2) + offset; - if (Config::JIT_LiteralOptimisations) - { - Comp_MemLoadLiteral(32, false, 
CurInstr.T_Reg(8), addr); - Comp_AddCycles_CDI(); - } - else - { - bool negative = addr < R15; - u32 abs = negative ? R15 - addr : addr - R15; - Comp_MemAccess(CurInstr.T_Reg(8), 15, Op2(abs), 32, negative ? memop_SubtractOffset : 0); - } + if (!Config::JIT_LiteralOptimisations || !Comp_MemLoadLiteral(32, false, CurInstr.T_Reg(8), addr)) + Comp_MemAccess(CurInstr.T_Reg(8), 15, Op2(offset), 32, 0); } void Compiler::T_Comp_MemSPRel() @@ -621,15 +437,138 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc if (regsCount == 0) return 0; // actually not the right behaviour TODO: fix me - SUB(SP, SP, ((regsCount + 1) & ~1) * 8); - if (store) + if (regsCount == 1 && !usermode && RegCache.LoadedRegs & (1 << *regs.begin())) { + int flags = 0; + if (store) + flags |= memop_Store; + if (decrement) + flags |= memop_SubtractOffset; + Op2 offset = preinc ? Op2(4) : Op2(0); + + Comp_MemAccess(*regs.begin(), rn, offset, 32, flags); + + return decrement ? -4 : 4; + } + + if (store) Comp_AddCycles_CD(); + else + Comp_AddCycles_CDI(); - if (usermode && (regs & BitSet16(0x7f00))) - UBFX(W0, RCPSR, 0, 5); + int expectedTarget = Num == 0 + ? ARMJIT_Memory::ClassifyAddress9(CurInstr.DataRegion) + : ARMJIT_Memory::ClassifyAddress7(CurInstr.DataRegion); + + bool compileFastPath = Config::JIT_FastMemory + && store && !usermode && (CurInstr.Cond() < 0xE || ARMJIT_Memory::IsMappable(expectedTarget)); + + if (decrement) + { + SUB(W0, MapReg(rn), regsCount * 4); + ANDI2R(W0, W0, ~3); + preinc ^= true; + } + else + { + ANDI2R(W0, MapReg(rn), ~3); + } + + LoadStorePatch patch; + if (compileFastPath) + { + ptrdiff_t fastPathStart = GetCodeOffset(); + ptrdiff_t firstLoadStoreOffset; + + bool firstLoadStore = true; + + MOVP2R(X1, Num == 0 ? ARMJIT_Memory::FastMem9Start : ARMJIT_Memory::FastMem7Start); + ADD(X1, X1, X0); + + u32 offset = preinc ? 
4 : 0; + BitSet16::Iterator it = regs.begin(); + + if (regsCount & 1) + { + int reg = *it; + it++; + + ARM64Reg first = W3; + if (RegCache.LoadedRegs & (1 << reg)) + first = MapReg(reg); + else if (store) + LoadReg(reg, first); + + if (firstLoadStore) + { + firstLoadStoreOffset = GetCodeOffset(); + firstLoadStore = false; + } + + if (store) + STR(INDEX_UNSIGNED, first, X1, offset); + else + LDR(INDEX_UNSIGNED, first, X1, offset); + + if (!(RegCache.LoadedRegs & (1 << reg)) && !store) + SaveReg(reg, first); + + offset += 4; + } + + while (it != regs.end()) + { + int reg = *it; + it++; + int nextReg = *it; + it++; - int i = regsCount - 1; + ARM64Reg first = W3, second = W4; + if (RegCache.LoadedRegs & (1 << reg)) + first = MapReg(reg); + else if (store) + LoadReg(reg, first); + if (RegCache.LoadedRegs & (1 << nextReg)) + second = MapReg(nextReg); + else if (store) + LoadReg(nextReg, second); + + if (firstLoadStore) + { + firstLoadStoreOffset = GetCodeOffset(); + firstLoadStore = false; + } + + if (store) + STP(INDEX_SIGNED, first, second, X1, offset); + else + LDP(INDEX_SIGNED, first, second, X1, offset); + + if (!(RegCache.LoadedRegs & (1 << reg)) && !store) + SaveReg(reg, first); + if (!(RegCache.LoadedRegs & (1 << nextReg)) && !store) + SaveReg(nextReg, second); + + offset += 8; + } + + patch.PatchSize = GetCodeOffset() - fastPathStart; + patch.PatchOffset = fastPathStart - firstLoadStoreOffset; + SwapCodeRegion(); + patch.PatchFunc = GetRXPtr(); + + LoadStorePatches[firstLoadStoreOffset] = patch; + + ABI_PushRegisters({30}); + } + + int i = 0; + + SUB(SP, SP, ((regsCount + 1) & ~1) * 8); + if (store) + { + if (usermode && (regs & BitSet16(0x7f00))) + UBFX(W5, RCPSR, 0, 5); BitSet16::Iterator it = regs.begin(); while (it != regs.end()) @@ -641,7 +580,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc if (usermode && reg >= 8 && reg < 15) { - if (RegCache.Mapping[reg] != INVALID_REG) + if (RegCache.LoadedRegs & (1 << reg)) MOV(W3, MapReg(reg)); else LoadReg(reg, W3); @@ -651,55 +590,67 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } else if (!usermode && nextReg != regs.end()) { - ARM64Reg first = W3; - ARM64Reg second = W4; + ARM64Reg first = W3, second = W4; - if (RegCache.Mapping[reg] != INVALID_REG) + if (RegCache.LoadedRegs & (1 << reg)) first = MapReg(reg); else LoadReg(reg, W3); - if (RegCache.Mapping[*nextReg] != INVALID_REG) + if (RegCache.LoadedRegs & (1 << *nextReg)) second = MapReg(*nextReg); else LoadReg(*nextReg, W4); - STP(INDEX_SIGNED, EncodeRegTo64(second), EncodeRegTo64(first), SP, i * 8 - 8); + STP(INDEX_SIGNED, EncodeRegTo64(first), EncodeRegTo64(second), SP, i * 8); - i--; + i++; it++; } - else if (RegCache.Mapping[reg] != INVALID_REG) + else if (RegCache.LoadedRegs & (1 << reg)) + { STR(INDEX_UNSIGNED, MapReg(reg), SP, i * 8); + } else { LoadReg(reg, W3); STR(INDEX_UNSIGNED, W3, SP, i * 8); } - i--; + i++; it++; } } - if (decrement) - { - SUB(W0, MapReg(rn), regsCount * 4); - preinc ^= true; - } - else - MOV(W0, MapReg(rn)); + ADD(X1, SP, 0); MOVI2R(W2, regsCount); - BL(Num ? 
MemFuncsSeq7[store][preinc] : MemFuncsSeq9[store][preinc]); + if (Num == 0) + { + MOV(X3, RCPU); + switch (preinc * 2 | store) + { + case 0: QuickCallFunction(X4, SlowBlockTransfer9<false, false>); break; + case 1: QuickCallFunction(X4, SlowBlockTransfer9<false, true>); break; + case 2: QuickCallFunction(X4, SlowBlockTransfer9<true, false>); break; + case 3: QuickCallFunction(X4, SlowBlockTransfer9<true, true>); break; + } + } + else + { + switch (preinc * 2 | store) + { + case 0: QuickCallFunction(X4, SlowBlockTransfer7<false, false>); break; + case 1: QuickCallFunction(X4, SlowBlockTransfer7<false, true>); break; + case 2: QuickCallFunction(X4, SlowBlockTransfer7<true, false>); break; + case 3: QuickCallFunction(X4, SlowBlockTransfer7<true, true>); break; + } + } if (!store) { - Comp_AddCycles_CDI(); - if (usermode && !regs[15] && (regs & BitSet16(0x7f00))) - UBFX(W0, RCPSR, 0, 5); + UBFX(W5, RCPSR, 0, 5); - int i = regsCount - 1; BitSet16::Iterator it = regs.begin(); while (it != regs.end()) { @@ -714,11 +665,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc MOVI2R(W1, reg - 8); BL(WriteBanked); FixupBranch alreadyWritten = CBNZ(W4); - if (RegCache.Mapping[reg] != INVALID_REG) - { + if (RegCache.LoadedRegs & (1 << reg)) MOV(MapReg(reg), W3); - RegCache.DirtyRegs |= 1 << reg; - } else SaveReg(reg, W3); SetJumpTarget(alreadyWritten); @@ -727,20 +675,12 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc { ARM64Reg first = W3, second = W4; - if (RegCache.Mapping[reg] != INVALID_REG) - { + if (RegCache.LoadedRegs & (1 << reg)) first = MapReg(reg); - if (reg != 15) - RegCache.DirtyRegs |= 1 << reg; - } - if (RegCache.Mapping[*nextReg] != INVALID_REG) - { + if (RegCache.LoadedRegs & (1 << *nextReg)) second = MapReg(*nextReg); - if (*nextReg != 15) - RegCache.DirtyRegs |= 1 << *nextReg; - } - LDP(INDEX_SIGNED, EncodeRegTo64(second), EncodeRegTo64(first), SP, i * 8 - 8); + LDP(INDEX_SIGNED, EncodeRegTo64(first), EncodeRegTo64(second), SP, i * 8); if (first == W3) SaveReg(reg, W3); @@ -748,15 +688,12 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc SaveReg(*nextReg, W4); it++; - i--; + i++; } - else if (RegCache.Mapping[reg] != INVALID_REG) + else if (RegCache.LoadedRegs & (1 << reg)) { ARM64Reg mapped = MapReg(reg); LDR(INDEX_UNSIGNED, mapped, SP, i * 8); - - if (reg != 15) - RegCache.DirtyRegs |= 1 << reg; } else { @@ -765,11 +702,20 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } it++; - i--; + i++; } } ADD(SP, SP, ((regsCount + 1) & ~1) * 8); + if (compileFastPath) + { + ABI_PopRegisters({30}); + RET(); + + FlushIcacheSection((u8*)patch.PatchFunc, (u8*)GetRXPtr()); + SwapCodeRegion(); + } + if (!store && regs[15]) { ARM64Reg mapped = MapReg(15); diff --git a/src/ARMJIT_Compiler.h b/src/ARMJIT_Compiler.h new file mode 100644 index 0000000..513c103 --- /dev/null +++ b/src/ARMJIT_Compiler.h @@ -0,0 +1,12 @@ +#if defined(__x86_64__) +#include "ARMJIT_x64/ARMJIT_Compiler.h" +#elif defined(__aarch64__) +#include "ARMJIT_A64/ARMJIT_Compiler.h" +#else +#error "The current target platform doesn't have a JIT backend" +#endif + +namespace ARMJIT +{ +extern Compiler* JITCompiler; +}
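The new src/ARMJIT_Compiler.h above is just a selector: it pulls in whichever backend matches the host architecture and exposes the one global Compiler instance, so platform-independent JIT code never names a backend directly. A hedged usage sketch follows; the helper function is hypothetical and not part of the commit, while JITCompiler and CompileBlock are the names declared above and in the backend headers.

#include "ARMJIT_Compiler.h"

namespace ARMJIT
{
// hypothetical helper: the same call compiles a block whether the selected
// backend came from ARMJIT_x64/ or ARMJIT_A64/
JitBlockEntry CompileWithActiveBackend(ARM* cpu, bool thumb, FetchedInstr instrs[], int count)
{
    return JITCompiler->CompileBlock(cpu, thumb, instrs, count);
}
}

Adding a third backend would amount to another #elif branch in this header plus a Compiler class with the same public surface.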
\ No newline at end of file diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h index 4e45760..19684c4 100644 --- a/src/ARMJIT_Internal.h +++ b/src/ARMJIT_Internal.h @@ -3,8 +3,11 @@ #include "types.h" #include <stdint.h> +#include <string.h> +#include <assert.h> #include "ARMJIT.h" +#include "ARMJIT_Memory.h" // here lands everything which doesn't fit into ARMJIT.h // where it would be included by pretty much everything @@ -160,8 +163,8 @@ public: Data.SetLength(numAddresses * 2 + numLiterals); } - u32 PseudoPhysicalAddr; - + u32 StartAddr; + u32 StartAddrLocal; u32 InstrHash, LiteralHash; u8 Num; u16 NumAddresses; @@ -175,28 +178,8 @@ public: { return &Data[NumAddresses]; } u32* Literals() { return &Data[NumAddresses * 2]; } - u32* Links() - { return &Data[NumAddresses * 2 + NumLiterals]; } - - u32 NumLinks() - { return Data.Length - NumAddresses * 2 - NumLiterals; } - - void AddLink(u32 link) - { - Data.Add(link); - } - - void ResetLinks() - { - Data.SetLength(NumAddresses * 2 + NumLiterals); - } private: - /* - 0..<NumInstrs - the instructions of the block - NumInstrs..<(NumLinks + NumInstrs) - pseudo physical addresses where the block is located - (atleast one, the pseudo physical address of the block) - */ TinyVector<u32> Data; }; @@ -207,45 +190,32 @@ struct __attribute__((packed)) AddressRange u32 Code; }; -extern AddressRange CodeRanges[ExeMemSpaceSize / 512]; typedef void (*InterpreterFunc)(ARM* cpu); extern InterpreterFunc InterpretARM[]; extern InterpreterFunc InterpretTHUMB[]; -extern u8 MemoryStatus9[0x800000]; -extern u8 MemoryStatus7[0x800000]; - extern TinyVector<u32> InvalidLiterals; -void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size); - -template <u32 Num> -void LinkBlock(ARM* cpu, u32 codeOffset); +extern AddressRange* const CodeMemRegions[ARMJIT_Memory::memregions_Count]; -enum +inline bool PageContainsCode(AddressRange* range) { - memregion_Other = 0, - memregion_ITCM, - memregion_DTCM, - memregion_BIOS9, - memregion_MainRAM, - memregion_SWRAM9, - memregion_SWRAM7, - memregion_IO9, - memregion_VRAM, - memregion_BIOS7, - memregion_WRAM7, - memregion_IO7, - memregion_Wifi, - memregion_VWRAM, -}; + for (int i = 0; i < 8; i++) + { + if (range[i].Blocks.Length > 0) + return true; + } + return false; +} + +u32 LocaliseCodeAddress(u32 num, u32 addr); -int ClassifyAddress9(u32 addr); -int ClassifyAddress7(u32 addr); +template <u32 Num> +void LinkBlock(ARM* cpu, u32 codeOffset); -template <typename T> T SlowRead9(ARMv5* cpu, u32 addr); -template <typename T> void SlowWrite9(ARMv5* cpu, u32 addr, T val); +template <typename T> T SlowRead9(u32 addr, ARMv5* cpu); +template <typename T> void SlowWrite9(u32 addr, ARMv5* cpu, T val); template <typename T> T SlowRead7(u32 addr); template <typename T> void SlowWrite7(u32 addr, T val); diff --git a/src/ARMJIT_Memory.cpp b/src/ARMJIT_Memory.cpp new file mode 100644 index 0000000..162827d --- /dev/null +++ b/src/ARMJIT_Memory.cpp @@ -0,0 +1,822 @@ +#ifdef __SWITCH__ +#include "switch/compat_switch.h" +#endif + +#include "ARMJIT_Memory.h" + +#include "ARMJIT_Internal.h" +#include "ARMJIT_Compiler.h" + +#include "GPU.h" +#include "GPU3D.h" +#include "Wifi.h" +#include "NDSCart.h" +#include "SPU.h" + +#include <malloc.h> + +/* + We're handling fastmem here. + + Basically we're repurposing a big piece of virtual memory + and map the memory regions as they're structured on the DS + in it. + + On most systems you have a single piece of main ram, + maybe some video ram and faster cache RAM and that's about it. 
+ Here we have not only a lot more different memory regions, + but also two address spaces. Not only that but they all have + mirrors (the worst case is 16kb SWRAM which is mirrored 1024x). + + We handle this by only mapping those regions which are actually + used and by praying the games don't go wild. + + Beware, this file is full of platform specific code. + +*/ + +namespace ARMJIT_Memory +{ +#ifdef __aarch64__ +struct FaultDescription +{ + u64 IntegerRegisters[33]; + u64 FaultAddr; + + u32 GetEmulatedAddr() + { + // now this is podracing + return (u32)IntegerRegisters[0]; + } + u64 RealAddr() + { + return FaultAddr; + } + + u64 GetPC() + { + return IntegerRegisters[32]; + } + + void RestoreAndRepeat(s64 offset); +}; +#else +struct FaultDescription +{ + u64 GetPC() + { + return 0; + } + + u32 GetEmulatedAddr() + { + return 0; + } + u64 RealAddr() + { + return 0; + } + + void RestoreAndRepeat(s64 offset); +}; +#endif + +void FaultHandler(FaultDescription* faultDesc); +} + + +#ifdef __aarch64__ + +extern "C" void ARM_RestoreContext(u64* registers) __attribute__((noreturn)); + +#endif + +#ifdef __SWITCH__ +// with LTO the symbols seem to be not properly overriden +// if they're somewhere else + +extern "C" +{ +extern char __start__; +extern char __rodata_start; + +alignas(16) u8 __nx_exception_stack[0x8000]; +u64 __nx_exception_stack_size = 0x8000; + +void __libnx_exception_handler(ThreadExceptionDump* ctx) +{ + ARMJIT_Memory::FaultDescription desc; + memcpy(desc.IntegerRegisters, &ctx->cpu_gprs[0].x, 8*29); + desc.IntegerRegisters[29] = ctx->fp.x; + desc.IntegerRegisters[30] = ctx->lr.x; + desc.IntegerRegisters[31] = ctx->sp.x; + desc.IntegerRegisters[32] = ctx->pc.x; + + ARMJIT_Memory::FaultHandler(&desc); + + if (ctx->pc.x >= (u64)&__start__ && ctx->pc.x < (u64)&__rodata_start) + { + printf("non JIT fault in .text at 0x%x (type %d) (trying to access 0x%x?)\n", + ctx->pc.x - (u64)&__start__, ctx->error_desc, ctx->far.x); + } + else + { + printf("non JIT fault somewhere in deep (address) space at %x (type %d)\n", ctx->pc.x, ctx->error_desc); + } +} + +} +#endif + +namespace ARMJIT_Memory +{ + +#ifdef __aarch64__ +void FaultDescription::RestoreAndRepeat(s64 offset) +{ + IntegerRegisters[32] += offset; + + ARM_RestoreContext(IntegerRegisters); +} +#else +void FaultDescription::RestoreAndRepeat(s64 offset) +{ + +} +#endif + +void* FastMem9Start, *FastMem7Start; + +const u32 MemoryTotalSize = + NDS::MainRAMSize + + NDS::SharedWRAMSize + + NDS::ARM7WRAMSize + + DTCMPhysicalSize; + +const u32 MemBlockMainRAMOffset = 0; +const u32 MemBlockSWRAMOffset = NDS::MainRAMSize; +const u32 MemBlockARM7WRAMOffset = NDS::MainRAMSize + NDS::SharedWRAMSize; +const u32 MemBlockDTCMOffset = NDS::MainRAMSize + NDS::SharedWRAMSize + NDS::ARM7WRAMSize; + +const u32 OffsetsPerRegion[memregions_Count] = +{ + UINT32_MAX, + UINT32_MAX, + MemBlockDTCMOffset, + UINT32_MAX, + MemBlockMainRAMOffset, + MemBlockSWRAMOffset, + UINT32_MAX, + UINT32_MAX, + UINT32_MAX, + MemBlockARM7WRAMOffset, + UINT32_MAX, + UINT32_MAX, + UINT32_MAX, +}; + +enum +{ + memstate_Unmapped, + memstate_MappedRW, + // on switch this is unmapped as well + memstate_MappedProtected, +}; + +u8 MappingStatus9[1 << (32-12)]; +u8 MappingStatus7[1 << (32-12)]; + +#ifdef __SWITCH__ +u8* MemoryBase; +u8* MemoryBaseCodeMem; +#else +u8* MemoryBase; +#endif + +bool MapIntoRange(u32 addr, u32 num, u32 offset, u32 size) +{ + u8* dst = (u8*)(num == 0 ? 
FastMem9Start : FastMem7Start) + addr; +#ifdef __SWITCH__ + Result r = (svcMapProcessMemory(dst, envGetOwnProcessHandle(), + (u64)(MemoryBaseCodeMem + offset), size)); + return R_SUCCEEDED(r); +#endif +} + +bool UnmapFromRange(u32 addr, u32 num, u32 offset, u32 size) +{ + u8* dst = (u8*)(num == 0 ? FastMem9Start : FastMem7Start) + addr; +#ifdef __SWITCH__ + Result r = svcUnmapProcessMemory(dst, envGetOwnProcessHandle(), + (u64)(MemoryBaseCodeMem + offset), size); + printf("%x\n", r); + return R_SUCCEEDED(r); +#endif +} + +struct Mapping +{ + u32 Addr; + u32 Size, LocalOffset; + u32 Num; + + void Unmap(int region) + { + bool skipDTCM = Num == 0 && region != memregion_DTCM; + u8* statuses = Num == 0 ? MappingStatus9 : MappingStatus7; + u32 offset = 0; + while (offset < Size) + { + if (skipDTCM && Addr + offset == NDS::ARM9->DTCMBase) + { + offset += NDS::ARM9->DTCMSize; + printf("%x skip\n", NDS::ARM9->DTCMSize); + } + else + { + u32 segmentOffset = offset; + u8 status = statuses[(Addr + offset) >> 12]; + while (statuses[(Addr + offset) >> 12] == status + && offset < Size + && (!skipDTCM || Addr + offset != NDS::ARM9->DTCMBase)) + { + assert(statuses[(Addr + offset) >> 12] != memstate_Unmapped); + statuses[(Addr + offset) >> 12] = memstate_Unmapped; + offset += 0x1000; + } + + if (status == memstate_MappedRW) + { + u32 segmentSize = offset - segmentOffset; + printf("unmapping %x %x %x %x\n", Addr + segmentOffset, Num, segmentOffset + LocalOffset + OffsetsPerRegion[region], segmentSize); + bool success = UnmapFromRange(Addr + segmentOffset, Num, segmentOffset + LocalOffset + OffsetsPerRegion[region], segmentSize); + assert(success); + } + } + } + } +}; +ARMJIT::TinyVector<Mapping> Mappings[memregions_Count]; + +void SetCodeProtection(int region, u32 offset, bool protect) +{ + offset &= ~0xFFF; + printf("set code protection %d %x %d\n", region, offset, protect); + + for (int i = 0; i < Mappings[region].Length; i++) + { + Mapping& mapping = Mappings[region][i]; + + u32 effectiveAddr = mapping.Addr + (offset - mapping.LocalOffset); + if (mapping.Num == 0 + && region != memregion_DTCM + && effectiveAddr >= NDS::ARM9->DTCMBase + && effectiveAddr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) + continue; + + u8* states = (u8*)(mapping.Num == 0 ? MappingStatus9 : MappingStatus7); + + printf("%d %x %d\n", states[effectiveAddr >> 12], effectiveAddr, mapping.Num); + assert(states[effectiveAddr >> 12] == (protect ? memstate_MappedRW : memstate_MappedProtected)); + states[effectiveAddr >> 12] = protect ? 
memstate_MappedProtected : memstate_MappedRW; + + bool success; + if (protect) + success = UnmapFromRange(effectiveAddr, mapping.Num, OffsetsPerRegion[region] + offset, 0x1000); + else + success = MapIntoRange(effectiveAddr, mapping.Num, OffsetsPerRegion[region] + offset, 0x1000); + assert(success); + } +} + +void RemapDTCM(u32 newBase, u32 newSize) +{ + // this first part could be made more efficient + // by unmapping DTCM first and then map the holes + u32 oldDTCMBase = NDS::ARM9->DTCMBase; + u32 oldDTCBEnd = oldDTCMBase + NDS::ARM9->DTCMSize; + + u32 newEnd = newBase + newSize; + + printf("remapping DTCM %x %x %x %x\n", newBase, newEnd, oldDTCMBase, oldDTCBEnd); + // unmap all regions containing the old or the current DTCM mapping + for (int region = 0; region < memregions_Count; region++) + { + if (region == memregion_DTCM) + continue; + + for (int i = 0; i < Mappings[region].Length;) + { + Mapping& mapping = Mappings[region][i]; + + u32 start = mapping.Addr; + u32 end = mapping.Addr + mapping.Size; + + printf("mapping %d %x %x %x %x\n", region, mapping.Addr, mapping.Size, mapping.Num, mapping.LocalOffset); + + bool oldOverlap = NDS::ARM9->DTCMSize > 0 && ((oldDTCMBase >= start && oldDTCMBase < end) || (oldDTCBEnd >= start && oldDTCBEnd < end)); + bool newOverlap = newSize > 0 && ((newBase >= start && newBase < end) || (newEnd >= start && newEnd < end)); + + if (mapping.Num == 0 && (oldOverlap || newOverlap)) + { + mapping.Unmap(region); + Mappings[region].Remove(i); + } + else + { + i++; + } + } + } + + for (int i = 0; i < Mappings[memregion_DTCM].Length; i++) + { + Mappings[memregion_DTCM][i].Unmap(memregion_DTCM); + } + Mappings[memregion_DTCM].Clear(); +} + +void RemapSWRAM() +{ + printf("remapping SWRAM\n"); + for (int i = 0; i < Mappings[memregion_SWRAM].Length; i++) + { + Mappings[memregion_SWRAM][i].Unmap(memregion_SWRAM); + } + Mappings[memregion_SWRAM].Clear(); + for (int i = 0; i < Mappings[memregion_WRAM7].Length; i++) + { + Mappings[memregion_WRAM7][i].Unmap(memregion_WRAM7); + } + Mappings[memregion_WRAM7].Clear(); +} + +bool MapAtAddress(u32 addr) +{ + u32 num = NDS::CurCPU; + + int region = num == 0 + ? ClassifyAddress9(addr) + : ClassifyAddress7(addr); + + if (!IsMappable(region)) + return false; + + u32 mappingStart, mappingSize, memoryOffset, memorySize; + bool isMapped = GetRegionMapping(region, num, mappingStart, mappingSize, memoryOffset, memorySize); + + if (!isMapped) + return false; + + // this calculation even works with DTCM + // which doesn't have to be aligned to it's own size + u32 mirrorStart = (addr - mappingStart) / memorySize * memorySize + mappingStart; + + u8* states = num == 0 ? 
MappingStatus9 : MappingStatus7; + printf("trying to create mapping %08x %d %x %d %x\n", addr, num, memorySize, region, memoryOffset); + bool isExecutable = ARMJIT::CodeMemRegions[region]; + + ARMJIT::AddressRange* range = ARMJIT::CodeMemRegions[region] + memoryOffset; + + // this overcomplicated piece of code basically just finds whole pieces of code memory + // which can be mapped + u32 offset = 0; + bool skipDTCM = num == 0 && region != memregion_DTCM; + while (offset < memorySize) + { + if (skipDTCM && mirrorStart + offset == NDS::ARM9->DTCMBase) + { + offset += NDS::ARM9->DTCMSize; + } + else + { + u32 sectionOffset = offset; + bool hasCode = isExecutable && ARMJIT::PageContainsCode(&range[offset / 512]); + while ((!isExecutable || ARMJIT::PageContainsCode(&range[offset / 512]) == hasCode) + && offset < memorySize + && (!skipDTCM || mirrorStart + offset != NDS::ARM9->DTCMBase)) + { + assert(states[(mirrorStart + offset) >> 12] == memstate_Unmapped); + states[(mirrorStart + offset) >> 12] = hasCode ? memstate_MappedProtected : memstate_MappedRW; + offset += 0x1000; + } + + u32 sectionSize = offset - sectionOffset; + + if (!hasCode) + { + printf("trying to map %x (size: %x) from %x\n", mirrorStart + sectionOffset, sectionSize, sectionOffset + memoryOffset + OffsetsPerRegion[region]); + bool succeded = MapIntoRange(mirrorStart + sectionOffset, num, sectionOffset + memoryOffset + OffsetsPerRegion[region], sectionSize); + assert(succeded); + } + } + } + + Mapping mapping{mirrorStart, memorySize, memoryOffset, num}; + Mappings[region].Add(mapping); + + printf("mapped mirror at %08x-%08x\n", mirrorStart, mirrorStart + memorySize - 1); + + return true; +} + +void FaultHandler(FaultDescription* faultDesc) +{ + if (ARMJIT::JITCompiler->IsJITFault(faultDesc->GetPC())) + { + bool rewriteToSlowPath = true; + + u32 addr = faultDesc->GetEmulatedAddr(); + + if ((NDS::CurCPU == 0 ? MappingStatus9 : MappingStatus7)[addr >> 12] == memstate_Unmapped) + rewriteToSlowPath = !MapAtAddress(faultDesc->GetEmulatedAddr()); + + s64 offset = 0; + if (rewriteToSlowPath) + { + offset = ARMJIT::JITCompiler->RewriteMemAccess(faultDesc->GetPC()); + } + faultDesc->RestoreAndRepeat(offset); + } +} + +void Init() +{ +#if defined(__SWITCH__) + MemoryBase = (u8*)memalign(0x1000, MemoryTotalSize); + MemoryBaseCodeMem = (u8*)virtmemReserve(MemoryTotalSize); + + bool succeded = R_SUCCEEDED(svcMapProcessCodeMemory(envGetOwnProcessHandle(), (u64)MemoryBaseCodeMem, + (u64)MemoryBase, MemoryTotalSize)); + assert(succeded); + succeded = R_SUCCEEDED(svcSetProcessMemoryPermission(envGetOwnProcessHandle(), (u64)MemoryBaseCodeMem, + MemoryTotalSize, Perm_Rw)); + assert(succeded); + + // 8 GB of address space, just don't ask... 
+ FastMem9Start = virtmemReserve(0x100000000); + assert(FastMem9Start); + FastMem7Start = virtmemReserve(0x100000000); + assert(FastMem7Start); + + NDS::MainRAM = MemoryBaseCodeMem + MemBlockMainRAMOffset; + NDS::SharedWRAM = MemoryBaseCodeMem + MemBlockSWRAMOffset; + NDS::ARM7WRAM = MemoryBaseCodeMem + MemBlockARM7WRAMOffset; + NDS::ARM9->DTCM = MemoryBaseCodeMem + MemBlockDTCMOffset; +#else + MemoryBase = new u8[MemoryTotalSize]; + + NDS::MainRAM = MemoryBase + MemBlockMainRAMOffset; + NDS::SharedWRAM = MemoryBase + MemBlockSWRAMOffset; + NDS::ARM7WRAM = MemoryBase + MemBlockARM7WRAMOffset; + NDS::ARM9->DTCM = MemoryBase + MemBlockDTCMOffset; +#endif +} + +void DeInit() +{ +#if defined(__SWITCH__) + virtmemFree(FastMem9Start, 0x100000000); + virtmemFree(FastMem7Start, 0x100000000); + + svcUnmapProcessCodeMemory(envGetOwnProcessHandle(), (u64)MemoryBaseCodeMem, (u64)MemoryBase, MemoryTotalSize); + virtmemFree(MemoryBaseCodeMem, MemoryTotalSize); + free(MemoryBase); +#else + delete[] MemoryBase; +#endif +} + +void Reset() +{ + for (int region = 0; region < memregions_Count; region++) + { + for (int i = 0; i < Mappings[region].Length; i++) + Mappings[region][i].Unmap(region); + Mappings[region].Clear(); + } + + for (int i = 0; i < sizeof(MappingStatus9); i++) + { + assert(MappingStatus9[i] == memstate_Unmapped); + assert(MappingStatus7[i] == memstate_Unmapped); + } + + printf("done resetting jit mem\n"); +} + +bool IsMappable(int region) +{ + return OffsetsPerRegion[region] != UINT32_MAX; +} + +bool GetRegionMapping(int region, u32 num, u32& mappingStart, u32& mappingSize, u32& memoryOffset, u32& memorySize) +{ + memoryOffset = 0; + switch (region) + { + case memregion_ITCM: + if (num == 0) + { + mappingStart = 0; + mappingSize = NDS::ARM9->ITCMSize; + memorySize = ITCMPhysicalSize; + return true; + } + return false; + case memregion_DTCM: + if (num == 0) + { + mappingStart = NDS::ARM9->DTCMBase; + mappingSize = NDS::ARM9->DTCMSize; + memorySize = DTCMPhysicalSize; + return true; + } + return false; + case memregion_BIOS9: + if (num == 0) + { + mappingStart = 0xFFFF0000; + mappingSize = 0x10000; + memorySize = 0x1000; + return true; + } + return false; + case memregion_MainRAM: + mappingStart = 0x2000000; + mappingSize = 0x1000000; + memorySize = NDS::MainRAMSize; + return true; + case memregion_SWRAM: + mappingStart = 0x3000000; + if (num == 0 && NDS::SWRAM_ARM9.Mem) + { + mappingSize = 0x1000000; + memoryOffset = NDS::SWRAM_ARM9.Mem - NDS::SharedWRAM; + memorySize = NDS::SWRAM_ARM9.Mask + 1; + return true; + } + else if (num == 1 && NDS::SWRAM_ARM7.Mem) + { + mappingSize = 0x800000; + memoryOffset = NDS::SWRAM_ARM7.Mem - NDS::SharedWRAM; + memorySize = NDS::SWRAM_ARM7.Mask + 1; + return true; + } + return false; + case memregion_VRAM: + if (num == 0) + { + // this is a gross simplification + // mostly to make code on vram working + // it doesn't take any of the actual VRAM mappings into account + mappingStart = 0x6000000; + mappingSize = 0x1000000; + memorySize = 0x100000; + return true; + } + return false; + case memregion_BIOS7: + if (num == 1) + { + mappingStart = 0; + mappingSize = 0x4000; + memorySize = 0x4000; + return true; + } + return false; + case memregion_WRAM7: + if (num == 1) + { + if (NDS::SWRAM_ARM7.Mem) + { + mappingStart = 0x3800000; + mappingSize = 0x800000; + } + else + { + mappingStart = 0x3000000; + mappingSize = 0x1000000; + } + memorySize = NDS::ARM7WRAMSize; + return true; + } + return false; + case memregion_VWRAM: + if (num == 1) + { + mappingStart = 0x6000000; 
+ mappingSize = 0x1000000; + memorySize = 0x20000; + return true; + } + return false; + default: + // for the JIT we don't are about the rest + return false; + } +} + +int ClassifyAddress9(u32 addr) +{ + if (addr < NDS::ARM9->ITCMSize) + return memregion_ITCM; + else if (addr >= NDS::ARM9->DTCMBase && addr < (NDS::ARM9->DTCMBase + NDS::ARM9->DTCMSize)) + return memregion_DTCM; + else if ((addr & 0xFFFFF000) == 0xFFFF0000) + return memregion_BIOS9; + else + { + switch (addr & 0xFF000000) + { + case 0x02000000: + return memregion_MainRAM; + case 0x03000000: + if (NDS::SWRAM_ARM9.Mem) + return memregion_SWRAM; + else + return memregion_Other; + case 0x04000000: + return memregion_IO9; + case 0x06000000: + return memregion_VRAM; + } + } + return memregion_Other; +} + +int ClassifyAddress7(u32 addr) +{ + if (addr < 0x00004000) + return memregion_BIOS7; + else + { + switch (addr & 0xFF800000) + { + case 0x02000000: + case 0x02800000: + return memregion_MainRAM; + case 0x03000000: + if (NDS::SWRAM_ARM7.Mem) + return memregion_SWRAM; + else + return memregion_WRAM7; + case 0x03800000: + return memregion_WRAM7; + case 0x04000000: + return memregion_IO7; + case 0x04800000: + return memregion_Wifi; + case 0x06000000: + case 0x06800000: + return memregion_VWRAM; + } + } + return memregion_Other; +} + +void WifiWrite32(u32 addr, u32 val) +{ + Wifi::Write(addr, val & 0xFFFF); + Wifi::Write(addr + 2, val >> 16); +} + +u32 WifiRead32(u32 addr) +{ + return Wifi::Read(addr) | (Wifi::Read(addr + 2) << 16); +} + +template <typename T> +void VRAMWrite(u32 addr, T val) +{ + switch (addr & 0x00E00000) + { + case 0x00000000: GPU::WriteVRAM_ABG<T>(addr, val); return; + case 0x00200000: GPU::WriteVRAM_BBG<T>(addr, val); return; + case 0x00400000: GPU::WriteVRAM_AOBJ<T>(addr, val); return; + case 0x00600000: GPU::WriteVRAM_BOBJ<T>(addr, val); return; + default: GPU::WriteVRAM_LCDC<T>(addr, val); return; + } +} +template <typename T> +T VRAMRead(u32 addr) +{ + switch (addr & 0x00E00000) + { + case 0x00000000: return GPU::ReadVRAM_ABG<T>(addr); + case 0x00200000: return GPU::ReadVRAM_BBG<T>(addr); + case 0x00400000: return GPU::ReadVRAM_AOBJ<T>(addr); + case 0x00600000: return GPU::ReadVRAM_BOBJ<T>(addr); + default: return GPU::ReadVRAM_LCDC<T>(addr); + } +} + +void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) +{ + if (cpu->Num == 0) + { + switch (addr & 0xFF000000) + { + case 0x04000000: + if (!store && size == 32 && addr == 0x04100010 && NDS::ExMemCnt[0] & (1<<11)) + return (void*)NDSCart::ReadROMData; + + /* + unfortunately we can't map GPU2D this way + since it's hidden inside an object + + though GPU3D registers are accessed much more intensive + */ + if (addr >= 0x04000320 && addr < 0x040006A4) + { + switch (size | store) + { + case 8: return (void*)GPU3D::Read8; + case 9: return (void*)GPU3D::Write8; + case 16: return (void*)GPU3D::Read16; + case 17: return (void*)GPU3D::Write16; + case 32: return (void*)GPU3D::Read32; + case 33: return (void*)GPU3D::Write32; + } + } + + switch (size | store) + { + case 8: return (void*)NDS::ARM9IORead8; + case 9: return (void*)NDS::ARM9IOWrite8; + case 16: return (void*)NDS::ARM9IORead16; + case 17: return (void*)NDS::ARM9IOWrite16; + case 32: return (void*)NDS::ARM9IORead32; + case 33: return (void*)NDS::ARM9IOWrite32; + } + break; + case 0x06000000: + switch (size | store) + { + case 8: return (void*)VRAMRead<u8>; + case 9: return NULL; + case 16: return (void*)VRAMRead<u16>; + case 17: return (void*)VRAMWrite<u16>; + case 32: return (void*)VRAMRead<u32>; + case 
33: return (void*)VRAMWrite<u32>; + } + break; + } + } + else + { + switch (addr & 0xFF800000) + { + case 0x04000000: + if (addr >= 0x04000400 && addr < 0x04000520) + { + switch (size | store) + { + case 8: return (void*)SPU::Read8; + case 9: return (void*)SPU::Write8; + case 16: return (void*)SPU::Read16; + case 17: return (void*)SPU::Write16; + case 32: return (void*)SPU::Read32; + case 33: return (void*)SPU::Write32; + } + } + + switch (size | store) + { + case 8: return (void*)NDS::ARM7IORead8; + case 9: return (void*)NDS::ARM7IOWrite8; + case 16: return (void*)NDS::ARM7IORead16; + case 17: return (void*)NDS::ARM7IOWrite16; + case 32: return (void*)NDS::ARM7IORead32; + case 33: return (void*)NDS::ARM7IOWrite32; + } + break; + case 0x04800000: + if (addr < 0x04810000 && size >= 16) + { + switch (size | store) + { + case 16: return (void*)Wifi::Read; + case 17: return (void*)Wifi::Write; + case 32: return (void*)WifiRead32; + case 33: return (void*)WifiWrite32; + } + } + break; + case 0x06000000: + case 0x06800000: + switch (size | store) + { + case 8: return (void*)GPU::ReadVRAM_ARM7<u8>; + case 9: return (void*)GPU::WriteVRAM_ARM7<u8>; + case 16: return (void*)GPU::ReadVRAM_ARM7<u16>; + case 17: return (void*)GPU::WriteVRAM_ARM7<u16>; + case 32: return (void*)GPU::ReadVRAM_ARM7<u32>; + case 33: return (void*)GPU::WriteVRAM_ARM7<u32>; + } + } + } + return NULL; +} + +}
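To summarise the fastmem flow implemented above: compiled code addresses guest memory as a plain offset from FastMem9Start/FastMem7Start, regions are mapped lazily the first time an access faults (MapAtAddress), and accesses that hit unmappable or code-protected pages are rewritten in place to call the SlowRead/SlowWrite helpers (RewriteMemAccess) before execution resumes through ARM_RestoreContext. A host-side sketch of what a 32-bit ARM9 fast-path load boils down to is given below; fastMemBase stands for FastMem9Start and the function itself is illustrative, not emitted code.

#include <cstdint>
#include <cstring>

uint32_t FastmemLoad32(uint8_t* fastMemBase, uint32_t guestAddr)
{
    // the emitted sequence masks the address and loads base + address; if the
    // backing 4KB page is not mapped yet, the load faults and FaultHandler()
    // either maps the mirror or patches the access over to the slow path
    uint32_t val;
    std::memcpy(&val, fastMemBase + (guestAddr & ~3u), sizeof(val));

    // misaligned ARM9 word loads rotate the result right by (addr & 3) * 8,
    // matching the UBFIZ/RORV pair the A64 backend emits
    unsigned rot = (guestAddr & 3) * 8;
    return rot ? (val >> rot) | (val << (32 - rot)) : val;
}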
\ No newline at end of file diff --git a/src/ARMJIT_Memory.h b/src/ARMJIT_Memory.h new file mode 100644 index 0000000..1a59d98 --- /dev/null +++ b/src/ARMJIT_Memory.h @@ -0,0 +1,53 @@ +#ifndef ARMJIT_MEMORY +#define ARMJIT_MEMORY + +#include "types.h" + +#include "ARM.h" + +namespace ARMJIT_Memory +{ + +extern void* FastMem9Start; +extern void* FastMem7Start; + +void Init(); +void DeInit(); + +void Reset(); + +enum +{ + memregion_Other = 0, + memregion_ITCM, + memregion_DTCM, + memregion_BIOS9, + memregion_MainRAM, + memregion_SWRAM, + memregion_IO9, + memregion_VRAM, + memregion_BIOS7, + memregion_WRAM7, + memregion_IO7, + memregion_Wifi, + memregion_VWRAM, + memregions_Count +}; + +int ClassifyAddress9(u32 addr); +int ClassifyAddress7(u32 addr); + +bool GetRegionMapping(int region, u32 num, u32& mappingStart, u32& mappingSize, u32& memoryOffset, u32& memorySize); + +bool IsMappable(int region); + +void RemapDTCM(u32 newBase, u32 newSize); +void RemapSWRAM(); + +void SetCodeProtection(int region, u32 offset, bool protect); + +void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size); + +} + +#endif
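The header above is what the backends consult when deciding whether to emit the fast path at all. A small sketch of that check, mirroring the condition used in the A64 load/store compiler; the function name is illustrative and the Config::JIT_FastMemory gate is omitted for brevity.

#include "ARMJIT_Memory.h"

bool ShouldTryFastPath(u32 num, u32 dataRegion, bool conditional)
{
    int region = num == 0 ? ARMJIT_Memory::ClassifyAddress9(dataRegion)
                          : ARMJIT_Memory::ClassifyAddress7(dataRegion);
    // conditional instructions may take the fast path even when the region is
    // not mappable, because a faulting access can still be patched over to the
    // slow-path helpers afterwards
    return conditional || ARMJIT_Memory::IsMappable(region);
}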
\ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index fd3fb70..34c1c91 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -301,24 +301,6 @@ Compiler::Compiler() RET(); } - { - CPSRDirty = true; - BranchStub[0] = GetWritableCodePtr(); - SaveCPSR(); - MOV(64, R(ABI_PARAM1), R(RCPU)); - CALL((u8*)ARMJIT::LinkBlock<0>); - LoadCPSR(); - JMP((u8*)ARM_Ret, true); - - CPSRDirty = true; - BranchStub[1] = GetWritableCodePtr(); - SaveCPSR(); - MOV(64, R(ABI_PARAM1), R(RCPU)); - CALL((u8*)ARMJIT::LinkBlock<1>); - LoadCPSR(); - JMP((u8*)ARM_Ret, true); - } - // move the region forward to prevent overwriting the generated functions CodeMemSize -= GetWritableCodePtr() - ResetStart; ResetStart = GetWritableCodePtr(); @@ -520,6 +502,11 @@ void Compiler::Reset() FarCode = FarStart; } +bool Compiler::IsJITFault(u64 addr) +{ + return addr >= (u64)CodeMemory && addr < (u64)CodeMemory + sizeof(CodeMemory); +} + void Compiler::Comp_SpecialBranchBehaviour(bool taken) { if (taken && CurInstr.BranchFlags & branch_IdleBranch) @@ -531,32 +518,11 @@ void Compiler::Comp_SpecialBranchBehaviour(bool taken) RegCache.PrepareExit(); SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm32(ConstantCycles)); - - if (Config::JIT_BrancheOptimisations == 2 && !(CurInstr.BranchFlags & branch_IdleBranch) - && (!taken || (CurInstr.BranchFlags & branch_StaticTarget))) - { - FixupBranch ret = J_CC(CC_S); - CMP(32, MDisp(RCPU, offsetof(ARM, StopExecution)), Imm8(0)); - FixupBranch ret2 = J_CC(CC_NZ); - - u8* rewritePart = GetWritableCodePtr(); - NOP(5); - - MOV(32, R(ABI_PARAM2), Imm32(rewritePart - ResetStart)); - JMP((u8*)BranchStub[Num], true); - - SetJumpTarget(ret); - SetJumpTarget(ret2); - JMP((u8*)ARM_Ret, true); - } - else - { - JMP((u8*)&ARM_Ret, true); - } + JMP((u8*)&ARM_Ret, true); } } -JitBlockEntry Compiler::CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) +JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) { if (NearSize - (NearCode - NearStart) < 1024 * 32) // guess... 
{ @@ -575,7 +541,7 @@ JitBlockEntry Compiler::CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, F CodeRegion = instrs[0].Addr >> 24; CurCPU = cpu; // CPSR might have been modified in a previous block - CPSRDirty = Config::JIT_BrancheOptimisations == 2; + CPSRDirty = false; JitBlockEntry res = (JitBlockEntry)GetWritableCodePtr(); @@ -685,31 +651,7 @@ JitBlockEntry Compiler::CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, F RegCache.Flush(); SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm32(ConstantCycles)); - - if (Config::JIT_BrancheOptimisations == 2 - && !(instrs[instrsCount - 1].BranchFlags & branch_IdleBranch) - && (!instrs[instrsCount - 1].Info.Branches() - || instrs[instrsCount - 1].BranchFlags & branch_FollowCondNotTaken - || (instrs[instrsCount - 1].BranchFlags & branch_FollowCondTaken && instrs[instrsCount - 1].BranchFlags & branch_StaticTarget))) - { - FixupBranch ret = J_CC(CC_S); - CMP(32, MDisp(RCPU, offsetof(ARM, StopExecution)), Imm8(0)); - FixupBranch ret2 = J_CC(CC_NZ); - - u8* rewritePart = GetWritableCodePtr(); - NOP(5); - - MOV(32, R(ABI_PARAM2), Imm32(rewritePart - ResetStart)); - JMP((u8*)BranchStub[Num], true); - - SetJumpTarget(ret); - SetJumpTarget(ret2); - JMP((u8*)ARM_Ret, true); - } - else - { - JMP((u8*)ARM_Ret, true); - } + JMP((u8*)ARM_Ret, true); /*FILE* codeout = fopen("codeout", "a"); fprintf(codeout, "beginning block argargarg__ %x!!!", instrs[0].Addr); @@ -720,22 +662,6 @@ JitBlockEntry Compiler::CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, F return res; } -void Compiler::LinkBlock(u32 offset, JitBlockEntry entry) -{ - u8* curPtr = GetWritableCodePtr(); - SetCodePtr(ResetStart + offset); - JMP((u8*)entry, true); - SetCodePtr(curPtr); -} - -void Compiler::UnlinkBlock(u32 offset) -{ - u8* curPtr = GetWritableCodePtr(); - SetCodePtr(ResetStart + offset); - NOP(5); - SetCodePtr(curPtr); -} - void Compiler::Comp_AddCycles_C(bool forceNonConstant) { s32 cycles = Num ? diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index f2fc301..09ac257 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -52,10 +52,7 @@ public: void Reset(); - void LinkBlock(u32 offset, JitBlockEntry entry); - void UnlinkBlock(u32 offset); - - JitBlockEntry CompileBlock(u32 translatedAddr, ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); + JitBlockEntry CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); void LoadReg(int reg, Gen::X64Reg nativeReg); void SaveReg(int reg, Gen::X64Reg nativeReg); @@ -202,6 +199,10 @@ public: SetCodePtr(FarCode); } + bool IsJITFault(u64 addr); + + s32 RewriteMemAccess(u64 pc); + u8* FarCode; u8* NearCode; u32 FarSize; @@ -216,8 +217,6 @@ public: bool Exit; bool IrregularCycles; - void* BranchStub[2]; - void* ReadBanked; void* WriteBanked; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index cf0bd23..0bf2f83 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -15,6 +15,11 @@ int squeezePointer(T* ptr) return truncated; } +s32 Compiler::RewriteMemAccess(u64 pc) +{ + return 0; +} + /* According to DeSmuME and my own research, approx. 99% (seriously, that's an empirical number) of all memory load and store instructions always access addresses in the same region as @@ -27,14 +32,15 @@ int squeezePointer(T* ptr) bool Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) { - u32 translatedAddr = Num == 0 ? 
TranslateAddr9(addr) : TranslateAddr7(addr); + return false; + //u32 translatedAddr = Num == 0 ? TranslateAddr9(addr) : TranslateAddr7(addr); - int invalidLiteralIdx = InvalidLiterals.Find(translatedAddr); + /*int invalidLiteralIdx = InvalidLiterals.Find(translatedAddr); if (invalidLiteralIdx != -1) { InvalidLiterals.Remove(invalidLiteralIdx); return false; - } + }*/ u32 val; // make sure arm7 bios is accessible @@ -95,7 +101,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz staticAddress = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); OpArg rdMapped = MapReg(rd); - if (!addrIsStatic) + if (true) { OpArg rnMapped = MapReg(rn); if (Thumb && rn == 15) @@ -145,7 +151,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz MOV(32, rnMapped, R(finalAddr)); } - int expectedTarget = Num == 0 + /*int expectedTarget = Num == 0 ? ClassifyAddress9(addrIsStatic ? staticAddress : CurInstr.DataRegion) : ClassifyAddress7(addrIsStatic ? staticAddress : CurInstr.DataRegion); if (CurInstr.Cond() < 0xE) @@ -184,8 +190,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz if (addrIsStatic && compileSlowPath) MOV(32, R(RSCRATCH3), Imm32(staticAddress)); - - if (compileFastPath) +*/ + /*if (compileFastPath) { FixupBranch slowPath; if (compileSlowPath) @@ -357,15 +363,16 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz SetJumpTarget(slowPath); } } - - if (compileSlowPath) +*/ + if (true) { PushRegs(false); if (Num == 0) { - MOV(32, R(ABI_PARAM2), R(RSCRATCH3)); - MOV(64, R(ABI_PARAM1), R(RCPU)); + MOV(64, R(ABI_PARAM2), R(RCPU)); + if (ABI_PARAM1 != RSCRATCH3) + MOV(32, R(ABI_PARAM1), R(RSCRATCH3)); if (flags & memop_Store) { MOV(32, R(ABI_PARAM3), rdMapped); @@ -423,13 +430,13 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); } } - +/* if (compileFastPath && compileSlowPath) { FixupBranch ret = J(true); SwitchToNearCode(); SetJumpTarget(ret); - } + }*/ if (!(flags & memop_Store) && rd == 15) { @@ -458,7 +465,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc u32 stackAlloc = ((regsCount + 1) & ~1) * 8; #endif u32 allocOffset = stackAlloc - regsCount * 8; - +/* int expectedTarget = Num == 0 ? 
ClassifyAddress9(CurInstr.DataRegion) : ClassifyAddress7(CurInstr.DataRegion); @@ -479,7 +486,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc default: break; } - +*/ if (!store) Comp_AddCycles_CDI(); else @@ -492,7 +499,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } else MOV(32, R(RSCRATCH4), MapReg(rn)); - +/* if (compileFastPath) { assert(!usermode); @@ -570,7 +577,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc SwitchToFarCode(); SetJumpTarget(slowPath); - } + }*/ if (!store) { @@ -696,13 +703,13 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc PopRegs(false); } - +/* if (compileFastPath) { FixupBranch ret = J(true); SwitchToNearCode(); SetJumpTarget(ret); - } + }*/ if (!store && regs[15]) { diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index b50e821..ccec951 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -206,15 +206,14 @@ enum { T_ReadR14 = 1 << 13, T_WriteR14 = 1 << 14, - T_PopPC = 1 << 15, - - T_SetNZ = 1 << 16, - T_SetCV = 1 << 17, - T_SetMaybeC = 1 << 18, - T_ReadC = 1 << 19, - T_SetC = 1 << 20, + T_SetNZ = 1 << 15, + T_SetCV = 1 << 16, + T_SetMaybeC = 1 << 17, + T_ReadC = 1 << 18, + T_SetC = 1 << 19, - T_WriteMem = 1 << 21, + T_WriteMem = 1 << 20, + T_LoadMem = 1 << 21, }; const u32 T_LSL_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_LSL_IMM); @@ -256,31 +255,31 @@ const u32 T_ADD_PCREL = T_Write8 | tk(tk_ADD_PCREL); const u32 T_ADD_SPREL = T_Write8 | T_ReadR13 | tk(tk_ADD_SPREL); const u32 T_ADD_SP = T_WriteR13 | T_ReadR13 | tk(tk_ADD_SP); -const u32 T_LDR_PCREL = T_Write8 | tk(tk_LDR_PCREL); +const u32 T_LDR_PCREL = T_Write8 | T_LoadMem | tk(tk_LDR_PCREL); const u32 T_STR_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STR_REG); const u32 T_STRB_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STRB_REG); -const u32 T_LDR_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDR_REG); -const u32 T_LDRB_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRB_REG); +const u32 T_LDR_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDR_REG); +const u32 T_LDRB_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDRB_REG); const u32 T_STRH_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STRH_REG); -const u32 T_LDRSB_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSB_REG); -const u32 T_LDRH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRH_REG); -const u32 T_LDRSH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSH_REG); +const u32 T_LDRSB_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDRSB_REG); +const u32 T_LDRH_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDRH_REG); +const u32 T_LDRSH_REG = T_Write0 | T_Read3 | T_Read6 | T_LoadMem | tk(tk_LDRSH_REG); const u32 T_STR_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STR_IMM); -const u32 T_LDR_IMM = T_Write0 | T_Read3 | tk(tk_LDR_IMM); +const u32 T_LDR_IMM = T_Write0 | T_Read3 | T_LoadMem | tk(tk_LDR_IMM); const u32 T_STRB_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STRB_IMM); -const u32 T_LDRB_IMM = T_Write0 | T_Read3 | tk(tk_LDRB_IMM); +const u32 T_LDRB_IMM = T_Write0 | T_Read3 | T_LoadMem | tk(tk_LDRB_IMM); const u32 T_STRH_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STRH_IMM); -const u32 T_LDRH_IMM = T_Write0 | T_Read3 | tk(tk_LDRH_IMM); +const u32 T_LDRH_IMM = T_Write0 | T_Read3 | T_LoadMem | tk(tk_LDRH_IMM); const u32 T_STR_SPREL = T_Read8 | T_ReadR13 | T_WriteMem | tk(tk_STR_SPREL); -const u32 T_LDR_SPREL = T_Write8 | 
T_ReadR13 | tk(tk_LDR_SPREL); +const u32 T_LDR_SPREL = T_Write8 | T_ReadR13 | T_LoadMem | tk(tk_LDR_SPREL); const u32 T_PUSH = T_ReadR13 | T_WriteR13 | T_WriteMem | tk(tk_PUSH); -const u32 T_POP = T_PopPC | T_ReadR13 | T_WriteR13 | tk(tk_POP); +const u32 T_POP = T_ReadR13 | T_WriteR13 | T_LoadMem | tk(tk_POP); -const u32 T_LDMIA = T_Read8 | T_Write8 | tk(tk_LDMIA); +const u32 T_LDMIA = T_Read8 | T_Write8 | T_LoadMem | tk(tk_LDMIA); const u32 T_STMIA = T_Read8 | T_Write8 | T_WriteMem | tk(tk_STMIA); const u32 T_BCOND = T_BranchAlways | tk(tk_BCOND); @@ -347,7 +346,7 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & T_BranchAlways) res.DstRegs |= (1 << 15); - if (data & T_PopPC && instr & (1 << 8)) + if (res.Kind == tk_POP && instr & (1 << 8)) res.DstRegs |= 1 << 15; if (data & T_SetNZ) @@ -364,11 +363,18 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & T_WriteMem) res.SpecialKind = special_WriteMem; - if (res.Kind == ARMInstrInfo::tk_LDR_PCREL) + if (data & T_LoadMem) { - if (!Config::JIT_LiteralOptimisations) - res.SrcRegs |= 1 << 15; - res.SpecialKind = special_LoadLiteral; + if (res.Kind == tk_LDR_PCREL) + { + if (!Config::JIT_LiteralOptimisations) + res.SrcRegs |= 1 << 15; + res.SpecialKind = special_LoadLiteral; + } + else + { + res.SpecialKind = special_LoadMem; + } } if (res.Kind == tk_LDMIA || res.Kind == tk_POP) @@ -401,11 +407,17 @@ Info Decode(bool thumb, u32 num, u32 instr) else if ((instr >> 28) == 0xF) data = ak(ak_Nop); - if (data & A_UnkOnARM7 && num != 0) + if (data & A_UnkOnARM7 && num == 1) data = A_UNK; res.Kind = (data >> 22) & 0x1FF; + if (res.Kind >= ak_SMLAxy && res.Kind <= ak_SMULxy && num == 1) + { + data = ak(ak_Nop); + res.Kind = ak_Nop; + } + if (res.Kind == ak_MCR) { u32 cn = (instr >> 16) & 0xF; @@ -490,8 +502,13 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & A_WriteMem) res.SpecialKind = special_WriteMem; - if ((data & A_LoadMem) && res.SrcRegs == (1 << 15)) - res.SpecialKind = special_LoadLiteral; + if (data & A_LoadMem) + { + if (res.SrcRegs == (1 << 15)) + res.SpecialKind = special_LoadLiteral; + else + res.SpecialKind = special_LoadMem; + } if (res.Kind == ak_LDM) { diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index 6ab4929..a702435 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -232,6 +232,7 @@ enum { special_NotSpecialAtAll = 0, special_WriteMem, + special_LoadMem, special_WaitForInterrupt, special_LoadLiteral }; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f35b3e9..84bbc2b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -55,9 +55,11 @@ if (ENABLE_JIT) enable_language(ASM) target_sources(core PRIVATE - ARMJIT.cpp ARM_InstrInfo.cpp + ARMJIT.cpp + ARMJIT_Memory.cpp + dolphin/CommonFuncs.cpp ) @@ -85,6 +87,8 @@ if (ENABLE_JIT) ARMJIT_A64/ARMJIT_ALU.cpp ARMJIT_A64/ARMJIT_LoadStore.cpp ARMJIT_A64/ARMJIT_Branch.cpp + + ARMJIT_A64/ARMJIT_Linkage.s ) endif() endif() diff --git a/src/CP15.cpp b/src/CP15.cpp index 225847e..3d64259 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -22,6 +22,7 @@ #include "DSi.h" #include "ARM.h" #include "ARMJIT.h" +#include "ARMJIT_Memory.h" // access timing for cached regions @@ -42,8 +43,8 @@ void ARMv5::CP15Reset() DTCMSetting = 0; ITCMSetting = 0; - memset(ITCM, 0, 0x8000); - memset(DTCM, 0, 0x4000); + memset(ITCM, 0, ITCMPhysicalSize); + memset(DTCM, 0, DTCMPhysicalSize); ITCMSize = 0; DTCMBase = 0xFFFFFFFF; @@ -75,8 +76,8 @@ void ARMv5::CP15DoSavestate(Savestate* file) file->Var32(&DTCMSetting); file->Var32(&ITCMSetting); - file->VarArray(ITCM, 
0x8000); - file->VarArray(DTCM, 0x4000); + file->VarArray(ITCM, ITCMPhysicalSize); + file->VarArray(DTCM, DTCMPhysicalSize); file->Var32(&PU_CodeCacheable); file->Var32(&PU_DataCacheable); @@ -98,36 +99,30 @@ void ARMv5::CP15DoSavestate(Savestate* file) void ARMv5::UpdateDTCMSetting() { -#ifdef JIT_ENABLED - u32 oldDTCMBase = DTCMBase; - u32 oldDTCMSize = DTCMSize; -#endif + u32 newDTCMBase; + u32 newDTCMSize; if (CP15Control & (1<<16)) { - DTCMBase = DTCMSetting & 0xFFFFF000; - DTCMSize = 0x200 << ((DTCMSetting >> 1) & 0x1F); + newDTCMBase = DTCMSetting & 0xFFFFF000; + newDTCMSize = 0x200 << ((DTCMSetting >> 1) & 0x1F); //printf("DTCM [%08X] enabled at %08X, size %X\n", DTCMSetting, DTCMBase, DTCMSize); } else { - DTCMBase = 0xFFFFFFFF; - DTCMSize = 0; + newDTCMBase = 0xFFFFFFFF; + newDTCMSize = 0; //printf("DTCM disabled\n"); } -#ifdef JIT_ENABLED - if (oldDTCMBase != DTCMBase || oldDTCMSize != DTCMSize) + if (newDTCMBase != DTCMBase || newDTCMSize != DTCMSize) { - ARMJIT::UpdateMemoryStatus9(oldDTCMBase, oldDTCMBase + oldDTCMSize); - ARMJIT::UpdateMemoryStatus9(DTCMBase, DTCMBase + DTCMSize); + ARMJIT_Memory::RemapDTCM(newDTCMBase, newDTCMSize); + DTCMBase = newDTCMBase; + DTCMSize = newDTCMSize; } -#endif } void ARMv5::UpdateITCMSetting() { -#ifdef JIT_ENABLED - u32 oldITCMSize = ITCMSize; -#endif if (CP15Control & (1<<18)) { ITCMSize = 0x200 << ((ITCMSetting >> 1) & 0x1F); @@ -138,10 +133,6 @@ void ARMv5::UpdateITCMSetting() ITCMSize = 0; //printf("ITCM disabled\n"); } -#ifdef JIT_ENABLED - if (oldITCMSize != ITCMSize) - ARMJIT::UpdateMemoryStatus9(0, std::max(oldITCMSize, ITCMSize)); -#endif } @@ -581,12 +572,15 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x750: ICacheInvalidateAll(); + //Halt(255); return; case 0x751: ICacheInvalidateByAddr(val); + //Halt(255); return; case 0x752: printf("CP15: ICACHE INVALIDATE WEIRD. 
%08X\n", val); + //Halt(255); return; @@ -723,7 +717,7 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) if (addr < ITCMSize) { CodeCycles = 1; - return *(u32*)&ITCM[addr & 0x7FFF]; + return *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; } CodeCycles = RegionCodeCycles; @@ -750,13 +744,13 @@ void ARMv5::DataRead8(u32 addr, u32* val) if (addr < ITCMSize) { DataCycles = 1; - *val = *(u8*)&ITCM[addr & 0x7FFF]; + *val = *(u8*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *val = *(u8*)&DTCM[(addr - DTCMBase) & 0x3FFF]; + *val = *(u8*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)]; return; } @@ -773,13 +767,13 @@ void ARMv5::DataRead16(u32 addr, u32* val) if (addr < ITCMSize) { DataCycles = 1; - *val = *(u16*)&ITCM[addr & 0x7FFF]; + *val = *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *val = *(u16*)&DTCM[(addr - DTCMBase) & 0x3FFF]; + *val = *(u16*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)]; return; } @@ -796,13 +790,13 @@ void ARMv5::DataRead32(u32 addr, u32* val) if (addr < ITCMSize) { DataCycles = 1; - *val = *(u32*)&ITCM[addr & 0x7FFF]; + *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *val = *(u32*)&DTCM[(addr - DTCMBase) & 0x3FFF]; + *val = *(u32*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)]; return; } @@ -817,13 +811,13 @@ void ARMv5::DataRead32S(u32 addr, u32* val) if (addr < ITCMSize) { DataCycles += 1; - *val = *(u32*)&ITCM[addr & 0x7FFF]; + *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles += 1; - *val = *(u32*)&DTCM[(addr - DTCMBase) & 0x3FFF]; + *val = *(u32*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)]; return; } @@ -838,16 +832,16 @@ void ARMv5::DataWrite8(u32 addr, u8 val) if (addr < ITCMSize) { DataCycles = 1; - *(u8*)&ITCM[addr & 0x7FFF] = val; + *(u8*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); #endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *(u8*)&DTCM[(addr - DTCMBase) & 0x3FFF] = val; + *(u8*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)] = val; return; } @@ -864,16 +858,16 @@ void ARMv5::DataWrite16(u32 addr, u16 val) if (addr < ITCMSize) { DataCycles = 1; - *(u16*)&ITCM[addr & 0x7FFF] = val; + *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); #endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *(u16*)&DTCM[(addr - DTCMBase) & 0x3FFF] = val; + *(u16*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)] = val; return; } @@ -890,16 +884,16 @@ void ARMv5::DataWrite32(u32 addr, u32 val) if (addr < ITCMSize) { DataCycles = 1; - *(u32*)&ITCM[addr & 0x7FFF] = val; + *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); #endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles = 1; - *(u32*)&DTCM[(addr - DTCMBase) & 0x3FFF] = val; + *(u32*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)] = val; return; } @@ -914,16 +908,16 @@ void 
ARMv5::DataWrite32S(u32 addr, u32 val) if (addr < ITCMSize) { DataCycles += 1; - *(u32*)&ITCM[addr & 0x7FFF] = val; + *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; #ifdef JIT_ENABLED - ARMJIT::InvalidateITCMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); #endif return; } if (addr >= DTCMBase && addr < (DTCMBase + DTCMSize)) { DataCycles += 1; - *(u32*)&DTCM[(addr - DTCMBase) & 0x3FFF] = val; + *(u32*)&DTCM[(addr - DTCMBase) & (DTCMPhysicalSize - 1)] = val; return; } diff --git a/src/Config.cpp b/src/Config.cpp index 22e9c11..edf84f2 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -47,8 +47,9 @@ int JIT_LiteralOptimisations = true; #ifdef JIT_ENABLED int JIT_Enable = false; int JIT_MaxBlockSize = 32; -int JIT_BrancheOptimisations = 2; +int JIT_BrancheOptimisations = true; int JIT_LiteralOptimisations = true; +int JIT_FastMemory = true; #endif ConfigEntry ConfigFile[] = @@ -72,8 +73,9 @@ ConfigEntry ConfigFile[] = #ifdef JIT_ENABLED {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 32, NULL, 0}, - {"JIT_BranchOptimisations", 0, &JIT_BrancheOptimisations, 2, NULL, 0}, + {"JIT_BranchOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0}, {"JIT_LiteralOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0}, + {"JIT_FastMem", 0, &JIT_FastMemory, 1, NULL, 0}, #endif {"", -1, NULL, 0, NULL, 0} diff --git a/src/Config.h b/src/Config.h index 31fa67a..7b19a4b 100644 --- a/src/Config.h +++ b/src/Config.h @@ -63,6 +63,7 @@ extern int JIT_Enable; extern int JIT_MaxBlockSize; extern int JIT_BrancheOptimisations; extern int JIT_LiteralOptimisations; +extern int JIT_FastMemory; #endif } diff --git a/src/NDS.cpp b/src/NDS.cpp index 657241f..3d65482 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -33,6 +33,7 @@ #include "AREngine.h" #include "Platform.h" #include "ARMJIT.h" +#include "ARMJIT_Memory.h" #include "DSi.h" #include "DSi_SPI_TSC.h" @@ -94,17 +95,17 @@ u32 CPUStop; u8 ARM9BIOS[0x1000]; u8 ARM7BIOS[0x4000]; -u8 MainRAM[0x1000000]; +u8* MainRAM; u32 MainRAMMask; -u8 SharedWRAM[0x8000]; +u8* SharedWRAM; u8 WRAMCnt; -u8* SWRAM_ARM9; -u8* SWRAM_ARM7; -u32 SWRAM_ARM9Mask; -u32 SWRAM_ARM7Mask; -u8 ARM7WRAM[0x10000]; +// putting them together so they're always next to each other +MemRegion SWRAM_ARM9; +MemRegion SWRAM_ARM7; + +u8* ARM7WRAM; u16 ExMemCnt[2]; @@ -171,6 +172,10 @@ bool Init() #ifdef JIT_ENABLED ARMJIT::Init(); +#else + MainRAM = new u8[MainRAMSize]; + ARM7WRAM = new u8[ARM7WRAMSize]; + SharedWRAM = new u8[SharedWRAMSize]; #endif DMAs[0] = new DMA(0, 0); @@ -485,6 +490,10 @@ void Reset() printf("ARM7 BIOS loaded\n"); fclose(f); } + +#ifdef JIT_ENABLED + ARMJIT::Reset(); +#endif if (ConsoleType == 1) { @@ -510,7 +519,7 @@ void Reset() InitTimings(); - memset(MainRAM, 0, 0x1000000); + memset(MainRAM, 0, MainRAMMask + 1); memset(SharedWRAM, 0, 0x8000); memset(ARM7WRAM, 0, 0x10000); @@ -587,10 +596,6 @@ void Reset() } AREngine::Reset(); - -#ifdef JIT_ENABLED - ARMJIT::Reset(); -#endif } void Stop() @@ -705,7 +710,7 @@ bool DoSavestate(Savestate* file) file->VarArray(MainRAM, 0x400000); file->VarArray(SharedWRAM, 0x8000); - file->VarArray(ARM7WRAM, 0x10000); + file->VarArray(ARM7WRAM, ARM7WRAMSize); file->VarArray(ExMemCnt, 2*sizeof(u16)); file->VarArray(ROMSeed0, 2*8); @@ -1128,43 +1133,40 @@ void MapSharedWRAM(u8 val) if (val == WRAMCnt) return; + ARMJIT_Memory::RemapSWRAM(); + WRAMCnt = val; switch (WRAMCnt & 0x3) { case 0: - SWRAM_ARM9 = &SharedWRAM[0]; - SWRAM_ARM9Mask = 0x7FFF; - SWRAM_ARM7 = NULL; - 
SWRAM_ARM7Mask = 0; + SWRAM_ARM9.Mem = &SharedWRAM[0]; + SWRAM_ARM9.Mask = 0x7FFF; + SWRAM_ARM7.Mem = NULL; + SWRAM_ARM7.Mask = 0; break; case 1: - SWRAM_ARM9 = &SharedWRAM[0x4000]; - SWRAM_ARM9Mask = 0x3FFF; - SWRAM_ARM7 = &SharedWRAM[0]; - SWRAM_ARM7Mask = 0x3FFF; + SWRAM_ARM9.Mem = &SharedWRAM[0x4000]; + SWRAM_ARM9.Mask = 0x3FFF; + SWRAM_ARM7.Mem = &SharedWRAM[0]; + SWRAM_ARM7.Mask = 0x3FFF; break; case 2: - SWRAM_ARM9 = &SharedWRAM[0]; - SWRAM_ARM9Mask = 0x3FFF; - SWRAM_ARM7 = &SharedWRAM[0x4000]; - SWRAM_ARM7Mask = 0x3FFF; + SWRAM_ARM9.Mem = &SharedWRAM[0]; + SWRAM_ARM9.Mask = 0x3FFF; + SWRAM_ARM7.Mem = &SharedWRAM[0x4000]; + SWRAM_ARM7.Mask = 0x3FFF; break; case 3: - SWRAM_ARM9 = NULL; - SWRAM_ARM9Mask = 0; - SWRAM_ARM7 = &SharedWRAM[0]; - SWRAM_ARM7Mask = 0x7FFF; + SWRAM_ARM9.Mem = NULL; + SWRAM_ARM9.Mask = 0; + SWRAM_ARM7.Mem = &SharedWRAM[0]; + SWRAM_ARM7.Mask = 0x7FFF; break; } - -#ifdef JIT_ENABLED - ARMJIT::UpdateMemoryStatus9(0x3000000, 0x3000000 + 0x1000000); - ARMJIT::UpdateMemoryStatus7(0x3000000, 0x3000000 + 0x1000000); -#endif } @@ -1835,12 +1837,12 @@ u8 ARM9Read8(u32 addr) switch (addr & 0xFF000000) { case 0x02000000: - return *(u8*)&MainRAM[addr & MainRAMMask]; + return *(u8*)&MainRAM[addr & (MainRAMSize - 1)]; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { - return *(u8*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask]; + return *(u8*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask]; } else { @@ -1900,12 +1902,12 @@ u16 ARM9Read16(u32 addr) switch (addr & 0xFF000000) { case 0x02000000: - return *(u16*)&MainRAM[addr & MainRAMMask]; + return *(u16*)&MainRAM[addr & (MainRAMSize - 1)]; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { - return *(u16*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask]; + return *(u16*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask]; } else { @@ -1968,9 +1970,9 @@ u32 ARM9Read32(u32 addr) return *(u32*)&MainRAM[addr & MainRAMMask]; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { - return *(u32*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask]; + return *(u32*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask]; } else { @@ -2026,7 +2028,7 @@ void ARM9Write8(u32 addr, u8 val) { case 0x02000000: #ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u8*)&MainRAM[addr & MainRAMMask] = val; #ifdef JIT_ENABLED @@ -2035,12 +2037,12 @@ void ARM9Write8(u32 addr, u8 val) return; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { #ifdef JIT_ENABLED - ARMJIT::InvalidateSWRAM9IfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(addr); #endif - *(u8*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; + *(u8*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask] = val; } return; @@ -2085,7 +2087,7 @@ void ARM9Write16(u32 addr, u16 val) { case 0x02000000: #ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u16*)&MainRAM[addr & MainRAMMask] = val; #ifdef JIT_ENABLED @@ -2094,12 +2096,12 @@ void ARM9Write16(u32 addr, u16 val) return; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { #ifdef JIT_ENABLED - ARMJIT::InvalidateSWRAM9IfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(addr); #endif - *(u16*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; + *(u16*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask] = val; } return; @@ -2113,18 +2115,16 @@ void ARM9Write16(u32 addr, u16 val) return; case 0x06000000: +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<0, 
ARMJIT_Memory::memregion_VRAM>(addr); +#endif switch (addr & 0x00E00000) { case 0x00000000: GPU::WriteVRAM_ABG<u16>(addr, val); return; case 0x00200000: GPU::WriteVRAM_BBG<u16>(addr, val); return; case 0x00400000: GPU::WriteVRAM_AOBJ<u16>(addr, val); return; case 0x00600000: GPU::WriteVRAM_BOBJ<u16>(addr, val); return; - default: -#ifdef JIT_ENABLED - ARMJIT::InvalidateLCDCIfNecessary(addr); -#endif - GPU::WriteVRAM_LCDC<u16>(addr, val); - return; + default: GPU::WriteVRAM_LCDC<u16>(addr, val); return; } case 0x07000000: @@ -2165,7 +2165,7 @@ void ARM9Write32(u32 addr, u32 val) { case 0x02000000: #ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u32*)&MainRAM[addr & MainRAMMask] = val; #ifdef JIT_ENABLED @@ -2174,12 +2174,12 @@ void ARM9Write32(u32 addr, u32 val) return ; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { #ifdef JIT_ENABLED - ARMJIT::InvalidateSWRAM9IfNecessary(addr); + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_SWRAM>(addr); #endif - *(u32*)&SWRAM_ARM9[addr & SWRAM_ARM9Mask] = val; + *(u32*)&SWRAM_ARM9.Mem[addr & SWRAM_ARM9.Mask] = val; } return; @@ -2193,18 +2193,16 @@ void ARM9Write32(u32 addr, u32 val) return; case 0x06000000: +#ifdef JIT_ENABLED + ARMJIT::CheckAndInvalidate<0, ARMJIT_Memory::memregion_VRAM>(addr); +#endif switch (addr & 0x00E00000) { case 0x00000000: GPU::WriteVRAM_ABG<u32>(addr, val); return; case 0x00200000: GPU::WriteVRAM_BBG<u32>(addr, val); return; case 0x00400000: GPU::WriteVRAM_AOBJ<u32>(addr, val); return; case 0x00600000: GPU::WriteVRAM_BOBJ<u32>(addr, val); return; - default: -#ifdef JIT_ENABLED - ARMJIT::InvalidateLCDCIfNecessary(addr); -#endif - GPU::WriteVRAM_LCDC<u32>(addr, val); - return; + default: GPU::WriteVRAM_LCDC<u32>(addr, val); return; } case 0x07000000: @@ -2250,10 +2248,10 @@ bool ARM9GetMemRegion(u32 addr, bool write, MemRegion* region) return true; case 0x03000000: - if (SWRAM_ARM9) + if (SWRAM_ARM9.Mem) { - region->Mem = SWRAM_ARM9; - region->Mask = SWRAM_ARM9Mask; + region->Mem = SWRAM_ARM9.Mem; + region->Mask = SWRAM_ARM9.Mask; return true; } break; @@ -2292,17 +2290,17 @@ u8 ARM7Read8(u32 addr) return *(u8*)&MainRAM[addr & MainRAMMask]; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { - return *(u8*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask]; + return *(u8*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask]; } else { - return *(u8*)&ARM7WRAM[addr & 0xFFFF]; + return *(u8*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; } case 0x03800000: - return *(u8*)&ARM7WRAM[addr & 0xFFFF]; + return *(u8*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; case 0x04000000: return ARM7IORead8(addr); @@ -2352,17 +2350,17 @@ u16 ARM7Read16(u32 addr) return *(u16*)&MainRAM[addr & MainRAMMask]; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { - return *(u16*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask]; + return *(u16*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask]; } else { - return *(u16*)&ARM7WRAM[addr & 0xFFFF]; + return *(u16*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; } case 0x03800000: - return *(u16*)&ARM7WRAM[addr & 0xFFFF]; + return *(u16*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; case 0x04000000: return ARM7IORead16(addr); @@ -2419,17 +2417,17 @@ u32 ARM7Read32(u32 addr) return *(u32*)&MainRAM[addr & MainRAMMask]; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { - return *(u32*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask]; + return *(u32*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask]; } else { - return *(u32*)&ARM7WRAM[addr & 0xFFFF]; + return *(u32*)&ARM7WRAM[addr & 
(ARM7WRAMSize - 1)]; } case 0x03800000: - return *(u32*)&ARM7WRAM[addr & 0xFFFF]; + return *(u32*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)]; case 0x04000000: return ARM7IORead32(addr); @@ -2474,7 +2472,7 @@ void ARM7Write8(u32 addr, u8 val) case 0x02000000: case 0x02800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u8*)&MainRAM[addr & MainRAMMask] = val; #ifdef JIT_ENABLED @@ -2483,28 +2481,28 @@ void ARM7Write8(u32 addr, u8 val) return; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { #ifdef JIT_ENABLED - ARMJIT::InvalidateSWRAM7IfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(addr); #endif - *(u8*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; + *(u8*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask] = val; return; } else { #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); #endif - *(u8*)&ARM7WRAM[addr & 0xFFFF] = val; + *(u8*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; } case 0x03800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); #endif - *(u8*)&ARM7WRAM[addr & 0xFFFF] = val; + *(u8*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; case 0x04000000: @@ -2514,7 +2512,7 @@ void ARM7Write8(u32 addr, u8 val) case 0x06000000: case 0x06800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WVRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_VWRAM>(addr); #endif GPU::WriteVRAM_ARM7<u8>(addr, val); return; @@ -2551,7 +2549,7 @@ void ARM7Write16(u32 addr, u16 val) case 0x02000000: case 0x02800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u16*)&MainRAM[addr & MainRAMMask] = val; #ifdef JIT_ENABLED @@ -2560,28 +2558,28 @@ void ARM7Write16(u32 addr, u16 val) return; case 0x03000000: - if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { #ifdef JIT_ENABLED - ARMJIT::InvalidateSWRAM7IfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(addr); #endif - *(u16*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; + *(u16*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask] = val; return; } else { #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); #endif - *(u16*)&ARM7WRAM[addr & 0xFFFF] = val; + *(u16*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; } case 0x03800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); #endif - *(u16*)&ARM7WRAM[addr & 0xFFFF] = val; + *(u16*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; case 0x04000000: @@ -2599,7 +2597,7 @@ void ARM7Write16(u32 addr, u16 val) case 0x06000000: case 0x06800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WVRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_VWRAM>(addr); #endif GPU::WriteVRAM_ARM7<u16>(addr, val); return; @@ -2638,7 +2636,7 @@ void ARM7Write32(u32 addr, u32 val) case 0x02000000: case 0x02800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateMainRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_MainRAM>(addr); #endif *(u32*)&MainRAM[addr & MainRAMMask] = val; #ifdef JIT_ENABLED @@ -2647,28 +2645,28 @@ void ARM7Write32(u32 addr, u32 val) return; case 0x03000000: 
- if (SWRAM_ARM7) + if (SWRAM_ARM7.Mem) { #ifdef JIT_ENABLED - ARMJIT::InvalidateSWRAM7IfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_SWRAM>(addr); #endif - *(u32*)&SWRAM_ARM7[addr & SWRAM_ARM7Mask] = val; + *(u32*)&SWRAM_ARM7.Mem[addr & SWRAM_ARM7.Mask] = val; return; } else { #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); #endif - *(u32*)&ARM7WRAM[addr & 0xFFFF] = val; + *(u32*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; } case 0x03800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_WRAM7>(addr); #endif - *(u32*)&ARM7WRAM[addr & 0xFFFF] = val; + *(u32*)&ARM7WRAM[addr & (ARM7WRAMSize - 1)] = val; return; case 0x04000000: @@ -2687,7 +2685,7 @@ void ARM7Write32(u32 addr, u32 val) case 0x06000000: case 0x06800000: #ifdef JIT_ENABLED - ARMJIT::InvalidateARM7WVRAMIfNecessary(addr); + ARMJIT::CheckAndInvalidate<1, ARMJIT_Memory::memregion_VWRAM>(addr); #endif GPU::WriteVRAM_ARM7<u32>(addr, val); return; @@ -2736,17 +2734,17 @@ bool ARM7GetMemRegion(u32 addr, bool write, MemRegion* region) // then access all the WRAM as one contiguous block starting at 0x037F8000 // this case needs a bit of a hack to cover // it's not really worth bothering anyway - if (!SWRAM_ARM7) + if (!SWRAM_ARM7.Mem) { region->Mem = ARM7WRAM; - region->Mask = 0xFFFF; + region->Mask = ARM7WRAMSize-1; return true; } break; case 0x03800000: region->Mem = ARM7WRAM; - region->Mask = 0xFFFF; + region->Mask = ARM7WRAMSize-1; return true; } @@ -134,6 +134,7 @@ typedef struct } MemRegion; extern int ConsoleType; +extern int CurCPU; extern u8 ARM9MemTimings[0x40000][4]; extern u8 ARM7MemTimings[0x20000][4]; @@ -161,20 +162,20 @@ extern u8 ARM9BIOS[0x1000]; extern u8 ARM7BIOS[0x4000]; extern u16 ARM7BIOSProt; -extern u8 MainRAM[0x1000000]; +extern u8* MainRAM; extern u32 MainRAMMask; -extern u8 SharedWRAM[0x8000]; -extern u8* SWRAM_ARM9; -extern u8* SWRAM_ARM7; -extern u32 SWRAM_ARM9Mask; -extern u32 SWRAM_ARM7Mask; - -extern u8 ARM7WRAM[0x10000]; +const u32 SharedWRAMSize = 0x8000; +extern u8* SharedWRAM; +extern MemRegion SWRAM_ARM9; +extern MemRegion SWRAM_ARM7; extern u32 KeyInput; +const u32 ARM7WRAMSize = 0x10000; +extern u8* ARM7WRAM; + bool Init(); void DeInit(); void Reset(); |
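A few notes on the hunks above, each with a small illustrative sketch; none of the code below is part of this commit.

First, the commit log mentions fastmem, which is where the two new hooks Compiler::IsJITFault and Compiler::RewriteMemAccess (both still stubs here) are headed: guest loads/stores get emitted as direct host accesses, and the ones that miss the mapped region fault, so a host signal handler has to check whether the faulting instruction belongs to JIT-emitted code before patching it over to the slow path. The sketch below shows that glue on Linux/x86_64; the handler, its registration, and the free-function re-declarations are assumptions for illustration, only the two hook names come from this diff.

#include <csignal>
#include <ucontext.h>        // ucontext_t / REG_RIP (glibc, x86_64)
#include "types.h"           // melonDS u64/s32 typedefs

// Really methods of the x64 Compiler (see the header hunk above); re-declared
// as free functions here only to keep the sketch self-contained.
bool IsJITFault(u64 addr);
s32 RewriteMemAccess(u64 pc);

static void FaultHandler(int sig, siginfo_t*, void* rawCtx)
{
    ucontext_t* ctx = (ucontext_t*)rawCtx;
    u64 pc = (u64)ctx->uc_mcontext.gregs[REG_RIP];

    if (IsJITFault(pc))
    {
        // patch the faulting fastmem access over to the slow path and resume;
        // the exact contract of RewriteMemAccess is not pinned down yet in this commit
        ctx->uc_mcontext.gregs[REG_RIP] += RewriteMemAccess(pc);
        return;
    }

    // not a JIT fault: restore default handling so the crash stays visible
    signal(sig, SIG_DFL);
    raise(sig);
}

// registration, e.g. during JIT init:
//   struct sigaction sa = {};
//   sa.sa_sigaction = FaultHandler;
//   sa.sa_flags = SA_SIGINFO;
//   sigaction(SIGSEGV, &sa, nullptr);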
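On the ARM_InstrInfo.cpp hunks: loads now carry an explicit T_LoadMem/A_LoadMem flag, and anything that is not a PC-relative literal load is reported as special_LoadMem. Condensed into a standalone helper for clarity (assuming the flag enum from ARM_InstrInfo.cpp and the tk_*/special_* enums from ARM_InstrInfo.h; the comments on intent are interpretation, not text from the commit):

// Condensed restatement of the new Thumb-side SpecialKind selection.
int ClassifySpecialKind(u32 data, int kind)
{
    if (data & T_WriteMem)
        return special_WriteMem;

    if (data & T_LoadMem)
    {
        // PC-relative literal loads keep their own kind so the JIT can try to
        // fold the constant (the full decoder additionally marks R15 as a
        // source when literal optimisations are disabled); every other load
        // is now a plain, potentially faulting memory access.
        return kind == tk_LDR_PCREL ? special_LoadLiteral : special_LoadMem;
    }

    return special_NotSpecialAtAll;
}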
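On the CP15.cpp hunks: the hard-coded 0x7FFF/0x3FFF TCM index masks become (ITCMPhysicalSize - 1) and (DTCMPhysicalSize - 1). The idiom relies on the physical TCM sizes being powers of two, which they are (the old memsets in the same file already used 0x8000 and 0x4000); the asserts below just make the equivalence explicit.

#include <cstdint>

// Values as implied by the old literals this diff replaces (32 KB ITCM, 16 KB DTCM).
constexpr uint32_t ITCMPhysicalSize = 0x8000;
constexpr uint32_t DTCMPhysicalSize = 0x4000;

static_assert((ITCMPhysicalSize & (ITCMPhysicalSize - 1)) == 0, "size-minus-one masking needs a power of two");
static_assert(ITCMPhysicalSize - 1 == 0x7FFF, "same mask the old ITCM accessors hard-coded");
static_assert(DTCMPhysicalSize - 1 == 0x3FFF, "same mask the old DTCM accessors hard-coded");

// so e.g.  ITCM[addr & (ITCMPhysicalSize - 1)]  always stays inside the 32 KB backing
// buffer, regardless of where inside the (possibly larger, mirrored) ITCM window addr falls.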
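On the shared-WRAM refactor in NDS.cpp/NDS.h: the four SWRAM_ARM9/SWRAM_ARM7 pointer and mask globals collapse into two MemRegion structs, so pointer and mask always travel (and get remapped) together. Against the new declarations an access helper reads like this, ReadSWRAM9_16 being an illustrative name rather than a function from this commit:

#include "NDS.h"   // MemRegion SWRAM_ARM9 / SWRAM_ARM7, as declared in this diff

// Illustrative only: a 16-bit ARM9 read through the struct-based SWRAM mapping.
u16 ReadSWRAM9_16(u32 addr)
{
    if (NDS::SWRAM_ARM9.Mem)                                            // bank currently mapped to the ARM9?
        return *(u16*)&NDS::SWRAM_ARM9.Mem[addr & NDS::SWRAM_ARM9.Mask];

    return 0;  // the real ARM9Read16 falls through to its "unknown address" handling here
}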
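Lastly, every RAM/VRAM write path above now calls ARMJIT::CheckAndInvalidate<cpu, region>(addr) instead of one of the old per-region Invalidate...IfNecessary functions. The template itself is not part of this page (it lives in ARMJIT.h/ARMJIT.cpp), so the following is only a guess at its general shape, included to show why the region became a template parameter; every name in it except CheckAndInvalidate and the call signature is invented:

#include "types.h"

namespace ARMJIT
{

// Hypothetical bookkeeping, NOT the real implementation:
extern bool RegionMayContainCode[16];             // invented per-region summary flag
void InvalidateBlocksAt(int region, u32 addr);    // invented slow-path invalidation

// With the CPU number and region as compile-time constants, the common case
// (writing to memory that holds no compiled code) can stay a single indexed
// load and branch at every call site.
template <u32 CpuNum, int Region>
void CheckAndInvalidate(u32 addr)
{
    if (RegionMayContainCode[Region])
        InvalidateBlocksAt(Region, addr);
}

}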