From 550e6b86d2dc09960c5a74270bc49d3f0e895699 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 30 Jun 2019 13:35:03 +0200 Subject: JIT: compilation of word load and store --- src/ARMJIT.cpp | 4 +- src/ARMJIT.h | 3 +- src/ARMJIT_RegCache.h | 2 +- src/ARMJIT_x64/ARMJIT_ALU.cpp | 4 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 111 +++++-- src/ARMJIT_x64/ARMJIT_Compiler.h | 19 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 600 ++++++++++++++++++++++++++++++++++++ src/ARM_InstrInfo.h | 8 +- src/CMakeLists.txt | 1 + src/dolphin/x64ABI.h | 3 +- 10 files changed, 712 insertions(+), 43 deletions(-) (limited to 'src') diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 74e154b..4da781c 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -40,8 +40,7 @@ static ptrdiff_t JIT_MEM[2][32] = { /* 2X*/ DUP2(offsetof(BlockCache, MainRAM)), /* 3X*/ offsetof(BlockCache, SWRAM), offsetof(BlockCache, ARM7_WRAM), - /* 4X*/ -1, - offsetof(BlockCache, ARM7_WIRAM), + /* 4X*/ DUP2(-1), /* 5X*/ DUP2(-1), /* 6X*/ DUP2(offsetof(BlockCache, ARM7_WVRAM)), /* contrary to Gbatek, melonDS and itself, DeSmuME doesn't mirror the 64 MB region at 0x6800000 */ @@ -183,7 +182,6 @@ void ResetBlocks() memset(cache.ARM9_ITCM, 0, sizeof(cache.ARM9_ITCM)); memset(cache.ARM9_LCDC, 0, sizeof(cache.ARM9_LCDC)); memset(cache.ARM7_BIOS, 0, sizeof(cache.ARM7_BIOS)); - memset(cache.ARM7_WIRAM, 0, sizeof(cache.ARM7_WIRAM)); memset(cache.ARM7_WRAM, 0, sizeof(cache.ARM7_WRAM)); memset(cache.ARM7_WVRAM, 0, sizeof(cache.ARM7_WVRAM)); } diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 2ca29e8..45bb4ed 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -63,14 +63,13 @@ struct BlockCache { CompiledBlock* AddrMapping[2][0x4000] = {0}; - CompiledBlock MainRAM[16*1024*1024/2]; + CompiledBlock MainRAM[4*1024*1024/2]; CompiledBlock SWRAM[0x8000/2]; // Shared working RAM CompiledBlock ARM9_ITCM[0x8000/2]; CompiledBlock ARM9_LCDC[0xA4000/2]; CompiledBlock ARM9_BIOS[0x8000/2]; CompiledBlock ARM7_BIOS[0x4000/2]; CompiledBlock ARM7_WRAM[0x10000/2]; // 
dedicated ARM7 WRAM - CompiledBlock ARM7_WIRAM[0x10000/2]; // Wifi CompiledBlock ARM7_WVRAM[0x40000/2]; // VRAM allocated as Working RAM }; diff --git a/src/ARMJIT_RegCache.h b/src/ARMJIT_RegCache.h index e18d50f..ea9fb30 100644 --- a/src/ARMJIT_RegCache.h +++ b/src/ARMJIT_RegCache.h @@ -30,7 +30,7 @@ public: assert(Mapping[reg] != -1); if (DirtyRegs & (1 << reg)) - Compiler->UnloadReg(reg, Mapping[reg]); + Compiler->SaveReg(reg, Mapping[reg]); DirtyRegs &= ~(1 << reg); LoadedRegs &= ~(1 << reg); diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index dc82af7..6294e1d 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -255,8 +255,8 @@ OpArg Compiler::Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, b if (S) { XOR(32, R(RSCRATCH2), R(RSCRATCH2)); - BT(32, R(RCPSR), Imm8(29)); - SETcc(CC_C, R(RSCRATCH2)); + TEST(32, R(RCPSR), Imm32(1 << 29)); + SETcc(CC_NZ, R(RSCRATCH2)); } MOV(32, R(RSCRATCH), rm); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index f51d4d9..9096397 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -9,13 +9,43 @@ using namespace Gen; namespace ARMJIT { template <> -const X64Reg RegCache::NativeRegAllocOrder[] = {RBX, RSI, RDI, R12, R13}; +const X64Reg RegCache::NativeRegAllocOrder[] = +{ +#ifdef _WIN32 + RBX, RSI, RDI, R12, R13 +#else + RBX, R12, R13 +#endif +}; template <> -const int RegCache::NativeRegsAvailable = 5; +const int RegCache::NativeRegsAvailable = +#ifdef _WIN32 + 5 +#else + 3 +#endif +; Compiler::Compiler() { - AllocCodeSpace(1024 * 1024 * 4); + AllocCodeSpace(1024 * 1024 * 16); + + for (int i = 0; i < 15; i++) + { + ReadMemFuncs9[i] = Gen_MemoryRoutine9(false, 32, 0x1000000 * i); + WriteMemFuncs9[i] = Gen_MemoryRoutine9(true, 32, 0x1000000 * i); + for (int j = 0; j < 2; j++) + { + ReadMemFuncs7[j][i] = Gen_MemoryRoutine7(false, 32, j, 0x1000000 * i); + WriteMemFuncs7[j][i] = 
Gen_MemoryRoutine7(true, 32, j, 0x1000000 * i); + } + } + ReadMemFuncs9[15] = Gen_MemoryRoutine9(false, 32, 0xFF000000); + WriteMemFuncs9[15] = Gen_MemoryRoutine9(true, 32, 0xFF000000); + ReadMemFuncs7[0][15] = ReadMemFuncs7[1][15] = Gen_MemoryRoutine7(false, 32, false, 0xFF000000); /* the ARM7 tables are declared [2][16]: first index = main-RAM-code variant, second = 16 MB region, so the last region is [j][15], not [15][j] */ + WriteMemFuncs7[0][15] = WriteMemFuncs7[1][15] = Gen_MemoryRoutine7(true, 32, false, 0xFF000000); + + ResetStart = GetWritableCodePtr(); /* block code emitted later starts here; CompileBlock rewinds to this point so the memory routines generated above survive a cache reset */ } void Compiler::LoadCPSR() @@ -42,7 +72,7 @@ void Compiler::LoadReg(int reg, X64Reg nativeReg) MOV(32, R(nativeReg), Imm32(R15)); } -void Compiler::UnloadReg(int reg, X64Reg nativeReg) +void Compiler::SaveReg(int reg, X64Reg nativeReg) { MOV(32, MDisp(RCPU, offsetof(ARM, R[reg])), R(nativeReg)); } @@ -52,7 +82,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs if (IsAlmostFull()) { ResetBlocks(); - ResetCodePtr(); + SetCodePtr((u8*)ResetStart); } CompiledBlock res = (CompiledBlock)GetWritableCodePtr(); @@ -61,8 +91,9 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs Thumb = cpu->CPSR & 0x20; Num = cpu->Num; R15 = cpu->R[15]; + CodeRegion = cpu->CodeRegion; - ABI_PushRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED}, 8, 0); + ABI_PushRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS}, 8, 16); MOV(64, R(RCPU), ImmPtr(cpu)); XOR(32, R(RCycles), R(RCycles)); @@ -142,9 +173,9 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs else { // could have used a LUT, but then where would be the fun? - BT(32, R(RCPSR), Imm8(28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1)))); + TEST(32, R(RCPSR), Imm32(1u << (28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1))))); /* unsigned shift: the flag index is 31 for MI/PL, which would overflow a signed 1 */ - skipExecute = J_CC(cond & 1 ? 
CC_NZ : CC_Z); } } @@ -187,7 +218,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs LEA(32, RAX, MDisp(RCycles, ConstantCycles)); - ABI_PopRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED}, 8, 0); + ABI_PopRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS}, 8, 16); RET(); return res; @@ -243,23 +274,38 @@ CompileFunc Compiler::GetCompFunc(int kind) A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, // CMN A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, + // Mul + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + // ARMv5 stuff + NULL, NULL, NULL, NULL, NULL, + // STR + A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, + // STRB + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + // LDR + A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, + // LDRB + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + // STRH + NULL, NULL, NULL, NULL, + // LDRD + NULL, NULL, NULL, NULL, + // STRD + NULL, NULL, NULL, NULL, + // LDRH + NULL, NULL, NULL, NULL, + // LDRSB NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + // LDRSH + NULL, NULL, NULL, NULL, + // swap + NULL, NULL, + // LDM/STM + NULL, NULL, + // Branch + NULL, NULL, NULL, NULL, NULL, + // system stuff + NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; const CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { @@ -278,10 +324,17 @@ CompileFunc Compiler::GetCompFunc(int kind) T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, // pc/sp relative NULL, NULL, NULL, - // mem... - NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, + // LDR pcrel + NULL, + // LDR/STR reg offset + T_Comp_MemReg, NULL, T_Comp_MemReg, NULL, + // LDR/STR sign extended, half + NULL, NULL, NULL, NULL, + // LDR/STR imm offset + T_Comp_MemImm, T_Comp_MemImm, NULL, NULL, + // LDR/STR half imm offset + NULL, NULL, + // branch, etc. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 9b454f4..7ab9b25 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -29,7 +29,7 @@ public: CompiledBlock CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount); void LoadReg(int reg, Gen::X64Reg nativeReg); - void UnloadReg(int reg, Gen::X64Reg nativeReg); + void SaveReg(int reg, Gen::X64Reg nativeReg); private: CompileFunc GetCompFunc(int kind); @@ -51,12 +51,17 @@ private: void A_Comp_MovOp(); void A_Comp_CmpOp(); + void A_Comp_MemWB(); + void T_Comp_ShiftImm(); void T_Comp_AddSub_(); void T_Comp_ALU_Imm8(); void T_Comp_ALU(); void T_Comp_ALU_HiReg(); + void T_Comp_MemReg(); + void T_Comp_MemImm(); + void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); void 
Comp_ArithTriOpReverse(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), @@ -65,10 +70,14 @@ private: void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed); + void* Gen_MemoryRoutine9(bool store, int size, u32 region); + void* Gen_MemoryRoutine7(bool store, int size, bool mainRAMCode, u32 region); + Gen::OpArg Comp_RegShiftImm(int op, int amount, Gen::OpArg rm, bool S, bool& carryUsed); Gen::OpArg Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed); Gen::OpArg A_Comp_GetALUOp2(bool S, bool& carryUsed); + Gen::OpArg A_Comp_GetMemWBOffset(); void LoadCPSR(); void SaveCPSR(); @@ -82,6 +91,8 @@ private: return Gen::R(RegCache.Mapping[reg]); } + void* ResetStart; + bool CPSRDirty = false; FetchedInstr CurrentInstr; @@ -91,10 +102,16 @@ private: bool Thumb; u32 Num; u32 R15; + u32 CodeRegion; u32 ConstantCycles; }; +extern void* ReadMemFuncs9[16]; +extern void* ReadMemFuncs7[2][16]; +extern void* WriteMemFuncs9[16]; +extern void* WriteMemFuncs7[2][16]; + } #endif \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index e69de29..d534269 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -0,0 +1,600 @@ +#include "ARMJIT_Compiler.h" + +#include "../GPU.h" +#include "../Wifi.h" + +namespace NDS +{ +#define MAIN_RAM_SIZE 0x400000 +extern u8* SWRAM_ARM9; +extern u32 SWRAM_ARM9Mask; +extern u8* SWRAM_ARM7; +extern u32 SWRAM_ARM7Mask; +extern u8 ARM7WRAM[]; +extern u16 ARM7BIOSProt; +} + +using namespace Gen; + +namespace ARMJIT +{ + +void* ReadMemFuncs9[16]; +void* ReadMemFuncs7[2][16]; +void* WriteMemFuncs9[16]; +void* WriteMemFuncs7[2][16]; + +template <typename T> +int squeezePointer(T* ptr) /* narrows a host pointer to a 32-bit value usable as an x86 displacement; the assert below fails if the pointer does not survive sign-extension back to 64 bits */ +{ + int truncated = (int)((u64)ptr); + assert((T*)((u64)truncated) == ptr); + return truncated; +} + +u32 ReadVRAM9(u32 addr) /* dispatches an ARM9 VRAM read to the GPU handler selected by address bits 21-23 */ +{ + switch (addr & 0x00E00000) + { + case 0x00000000: return GPU::ReadVRAM_ABG(addr); + case 0x00200000: return 
GPU::ReadVRAM_BBG(addr); + case 0x00400000: return GPU::ReadVRAM_AOBJ(addr); + case 0x00600000: return GPU::ReadVRAM_BOBJ(addr); + default: return GPU::ReadVRAM_LCDC(addr); + } +} + +void WriteVRAM9(u32 addr, u32 val) /* dispatches an ARM9 VRAM write to the GPU handler selected by address bits 21-23 */ +{ + switch (addr & 0x00E00000) + { + case 0x00000000: GPU::WriteVRAM_ABG(addr, val); return; + case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; + case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; + case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; + default: GPU::WriteVRAM_LCDC(addr, val); return; + } +} + +/* + R11 - data to write (store only) + RSCRATCH2 - address + RSCRATCH3 - code cycles; a load leaves its (rotated) result in RSCRATCH, and R10/RSCRATCH3 are clobbered by the cycle computation +*/ +void* Compiler::Gen_MemoryRoutine9(bool store, int size, u32 region) +{ + AlignCode4(); + void* res = (void*)GetWritableCodePtr(); + + if (!store) + { + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + AND(32, R(RSCRATCH), Imm8(0x3)); + SHL(32, R(RSCRATCH), Imm8(3)); + // enter the shadow realm! + MOV(32, MDisp(RSP, 8), R(RSCRATCH)); /* stash the rotate amount, (addr & 3) * 8, in the stack slot just above the return address; it is reloaded into ECX for the final ROR */ + } + + // cycle counting! + // this is AddCycles_CDI + MOV(32, R(R10), R(RSCRATCH2)); + SHR(32, R(R10), Imm8(12)); + MOVZX(32, 8, R10, MComplex(RCPU, R10, SCALE_1, offsetof(ARMv5, MemTimings) + 2)); /* NOTE(review): the addr>>12 index is applied with SCALE_1; if MemTimings holds several bytes per 4 KB page this does not select entry 2 of that page - confirm against the ARMv5 declaration */ + LEA(32, RSCRATCH, MComplex(RSCRATCH3, R10, SCALE_1, -6)); + CMP(32, R(R10), R(RSCRATCH3)); + CMOVcc(32, RSCRATCH3, R(R10), CC_G); + CMP(32, R(RSCRATCH), R(RSCRATCH3)); + CMOVcc(32, RSCRATCH3, R(RSCRATCH), CC_G); + ADD(32, R(RCycles), R(RSCRATCH3)); /* adds max(code, data, code + data - 6) */ + + if (!store) + XOR(32, R(RSCRATCH), R(RSCRATCH)); + AND(32, R(RSCRATCH2), Imm32(~3)); + + { + MOV(32, R(RSCRATCH3), R(RSCRATCH2)); + SUB(32, R(RSCRATCH2), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); + CMP(32, R(RSCRATCH2), MDisp(RCPU, offsetof(ARMv5, DTCMSize))); + FixupBranch outsideDTCM = J_CC(CC_AE); + AND(32, R(RSCRATCH2), Imm32(0x3FFF)); + if (!store) + { + MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, DTCM))); + MOV(32, R(ECX), MDisp(RSP, 8)); + ROR_(32, R(RSCRATCH), R(ECX)); + } + else + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_1, 
offsetof(ARMv5, DTCM)), R(R11)); + RET(); + SetJumpTarget(outsideDTCM); + MOV(32, R(RSCRATCH2), R(RSCRATCH3)); + } + + switch (region) + { + case 0x00000000: + case 0x01000000: + { + CMP(32, R(RSCRATCH2), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); + FixupBranch insideITCM = J_CC(CC_B); + RET(); + SetJumpTarget(insideITCM); + AND(32, R(RSCRATCH2), Imm32(0x7FFF)); + if (!store) + MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, ITCM))); + else + { + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, ITCM)), R(R11)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM9_ITCM)), Imm32(0)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), Imm32(0)); + } + } + break; + case 0x02000000: + AND(32, R(RSCRATCH2), Imm32(MAIN_RAM_SIZE - 1)); + if (!store) + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM))); + else + { + MOV(32, MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM)), R(R11)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM)), Imm32(0)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM) + 8), Imm32(0)); + } + break; + case 0x03000000: + { + MOV(64, R(RSCRATCH3), M(&NDS::SWRAM_ARM9)); + TEST(64, R(RSCRATCH3), R(RSCRATCH3)); + FixupBranch notMapped = J_CC(CC_Z); + AND(32, R(RSCRATCH2), M(&NDS::SWRAM_ARM9Mask)); + if (!store) + MOV(32, R(RSCRATCH), MRegSum(RSCRATCH2, RSCRATCH3)); + else + { + MOV(32, MRegSum(RSCRATCH2, RSCRATCH3), R(R11)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM)), Imm32(0)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM) + 8), Imm32(0)); + } + SetJumpTarget(notMapped); + } + break; + case 0x04000000: + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + if (!store) + { + ABI_PushRegistersAndAdjustStack({}, 8, 0); + ABI_CallFunction(NDS::ARM9IORead32); + ABI_PopRegistersAndAdjustStack({}, 8, 0); + } + else + { + MOV(32, R(ABI_PARAM2), R(R11)); + JMP((u8*)NDS::ARM9IOWrite32, true); + } + break; + case 
0x05000000: + { + MOV(32, R(RSCRATCH), Imm32(1<<1)); + MOV(32, R(RSCRATCH3), Imm32(1<<9)); + TEST(32, R(RSCRATCH2), Imm32(0x400)); + CMOVcc(32, RSCRATCH, R(RSCRATCH3), CC_NZ); + TEST(16, R(RSCRATCH), M(&NDS::PowerControl9)); + FixupBranch available = J_CC(CC_NZ); + RET(); + SetJumpTarget(available); + AND(32, R(RSCRATCH2), Imm32(0x7FF)); + if (!store) + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(GPU::Palette))); + else + MOV(32, MDisp(RSCRATCH2, squeezePointer(GPU::Palette)), R(R11)); + } + break; + case 0x06000000: + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + if (!store) + { + ABI_PushRegistersAndAdjustStack({}, 8); + ABI_CallFunction(ReadVRAM9); + ABI_PopRegistersAndAdjustStack({}, 8); + } + else + { + MOV(32, R(ABI_PARAM2), R(R11)); + JMP((u8*)WriteVRAM9, true); + } + break; + case 0x07000000: + { + MOV(32, R(RSCRATCH), Imm32(1<<1)); + MOV(32, R(RSCRATCH3), Imm32(1<<9)); + TEST(32, R(RSCRATCH2), Imm32(0x400)); + CMOVcc(32, RSCRATCH, R(RSCRATCH3), CC_NZ); + TEST(16, R(RSCRATCH), M(&NDS::PowerControl9)); + FixupBranch available = J_CC(CC_NZ); + RET(); + SetJumpTarget(available); + AND(32, R(RSCRATCH2), Imm32(0x7FF)); + if (!store) + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(GPU::OAM))); + else + MOV(32, MDisp(RSCRATCH2, squeezePointer(GPU::OAM)), R(R11)); + } + break; + case 0x08000000: + case 0x09000000: + case 0x0A000000: + if (!store) + MOV(32, R(RSCRATCH), Imm32(0xFFFFFFFF)); + break; + case 0xFF000000: + if (!store) + { + AND(32, R(RSCRATCH2), Imm32(0xFFF)); + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::ARM9BIOS))); + } + break; + default: + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + if (!store) + { + ABI_PushRegistersAndAdjustStack({}, 8, 0); + ABI_CallFunction(NDS::ARM9Read32); + ABI_PopRegistersAndAdjustStack({}, 8, 0); + } + else + { + MOV(32, R(ABI_PARAM2), R(R11)); + JMP((u8*)NDS::ARM9Write32, true); + } + break; + } + + if (!store) + { + MOV(32, R(ECX), MDisp(RSP, 8)); + ROR_(32, R(RSCRATCH), R(ECX)); + } + + RET(); + + 
return res; +} + +void* Compiler::Gen_MemoryRoutine7(bool store, int size, bool mainRAMCode, u32 region) /* emits the ARM7 word load/store helper for one 16 MB region; register contract as for Gen_MemoryRoutine9 (R11 = store data, RSCRATCH2 = address, RSCRATCH3 = code cycles, load result in RSCRATCH); mainRAMCode selects the timing variant used when the executing code is in main RAM */ +{ + AlignCode4(); + void* res = GetWritableCodePtr(); + + if (!store) + { + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + AND(32, R(RSCRATCH), Imm8(0x3)); + SHL(32, R(RSCRATCH), Imm8(3)); + // enter the shadow realm! + MOV(32, MDisp(RSP, 8), R(RSCRATCH)); + } + + // AddCycles_CDI + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + SHR(32, R(RSCRATCH), Imm8(15)); + MOVZX(32, 8, RSCRATCH, MDisp(RSCRATCH, squeezePointer(NDS::ARM7MemTimings + 2))); /* NOTE(review): ARM7MemTimings is indexed as a 2-D array elsewhere in this patch ([CodeCycles][2]); if so, "+ 2" steps two rows rather than two bytes and the addr>>15 index is not scaled by the row size - confirm against the declaration */ + if ((region == 0x02000000 && mainRAMCode) || (region != 0x02000000 && !mainRAMCode)) + { + if (!store && region != 0x02000000) + LEA(32, RSCRATCH3, MComplex(RSCRATCH, RSCRATCH3, SCALE_1, 1)); + ADD(32, R(RCycles), R(RSCRATCH3)); /* NOTE(review): when the LEA above is not emitted only the code cycles are added and the data cycles in RSCRATCH are dropped - confirm against AddCycles_CDI/AddCycles_CD */ + } + else + { + if (!store) + ADD(32, R(region == 0x02000000 ? RSCRATCH3 : RSCRATCH), Imm8(1)); /* bump the code-cycle (RSCRATCH3) or data-cycle (RSCRATCH) count; RSCRATCH2 is the access address and must not be modified here */ + LEA(32, R10, MComplex(RSCRATCH, RSCRATCH3, SCALE_1, -3)); + CMP(32, R(RSCRATCH3), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(RSCRATCH3), CC_G); + CMP(32, R(R10), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(R10), CC_G); + ADD(32, R(RCycles), R(RSCRATCH)); + } + + if (!store) + XOR(32, R(RSCRATCH), R(RSCRATCH)); + AND(32, R(RSCRATCH2), Imm32(~3)); + + switch (region) + { + case 0x00000000: + if (!store) { + CMP(32, R(RSCRATCH2), Imm32(0x4000)); + FixupBranch outsideBIOS1 = J_CC(CC_AE); + + MOV(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARM, R[15]))); + CMP(32, R(RSCRATCH), Imm32(0x4000)); + FixupBranch outsideBIOS2 = J_CC(CC_AE); + MOVZX(32, 16, RSCRATCH3, M(&NDS::ARM7BIOSProt)); /* ARM7BIOSProt is declared u16: zero-extend it instead of reading two unrelated bytes with a 32-bit load */ + CMP(32, R(RSCRATCH2), R(RSCRATCH3)); + FixupBranch notDenied1 = J_CC(CC_AE); + CMP(32, R(RSCRATCH), R(RSCRATCH3)); + FixupBranch notDenied2 = J_CC(CC_B); + SetJumpTarget(outsideBIOS2); + MOV(32, R(RSCRATCH), Imm32(0xFFFFFFFF)); + RET(); + + SetJumpTarget(notDenied1); + SetJumpTarget(notDenied2); + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::ARM7BIOS))); + MOV(32, R(ECX), MDisp(RSP, 8)); + ROR_(32, R(RSCRATCH), R(ECX)); + RET(); + + 
SetJumpTarget(outsideBIOS1); + } + break; + case 0x02000000: + AND(32, R(RSCRATCH2), Imm32(MAIN_RAM_SIZE - 1)); + if (!store) + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM))); + else + { + MOV(32, MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM)), R(R11)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM)), Imm32(0)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM) + 8), Imm32(0)); + } + break; + case 0x03000000: + { + TEST(32, R(RSCRATCH2), Imm32(0x800000)); + FixupBranch region = J_CC(CC_NZ); + MOV(64, R(RSCRATCH), M(&NDS::SWRAM_ARM7)); + TEST(64, R(RSCRATCH), R(RSCRATCH)); + FixupBranch notMapped = J_CC(CC_Z); + AND(32, R(RSCRATCH2), M(&NDS::SWRAM_ARM7Mask)); + if (!store) + { + MOV(32, R(RSCRATCH), MRegSum(RSCRATCH, RSCRATCH2)); + MOV(32, R(ECX), MDisp(RSP, 8)); + ROR_(32, R(RSCRATCH), R(ECX)); + } + else + { + MOV(32, MRegSum(RSCRATCH, RSCRATCH2), R(R11)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM)), Imm32(0)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM) + 8), Imm32(0)); + } + RET(); + SetJumpTarget(region); + SetJumpTarget(notMapped); + AND(32, R(RSCRATCH2), Imm32(0xFFFF)); + if (!store) + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::ARM7WRAM))); + else + { + MOV(32, MDisp(RSCRATCH2, squeezePointer(NDS::ARM7WRAM)), R(R11)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM7_WRAM)), Imm32(0)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM7_WRAM) + 8), Imm32(0)); + } + } + break; + case 0x04000000: + { + TEST(32, R(RSCRATCH2), Imm32(0x800000)); + FixupBranch region = J_CC(CC_NZ); + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + if (!store) + { + ABI_PushRegistersAndAdjustStack({}, 8); + ABI_CallFunction(NDS::ARM7IORead32); + ABI_PopRegistersAndAdjustStack({}, 8); + + MOV(32, R(ECX), MDisp(RSP, 8)); + ROR_(32, R(RSCRATCH), R(ECX)); + RET(); + } + else + { + MOV(32, R(ABI_PARAM2), R(R11)); + JMP((u8*)NDS::ARM7IOWrite32, true); 
+ } + SetJumpTarget(region); + + if (!store) + { + ABI_PushRegistersAndAdjustStack({RSCRATCH2}, 8); + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + ABI_CallFunction(Wifi::Read); + ABI_PopRegistersAndAdjustStack({RSCRATCH2}, 8); + + ADD(32, R(RSCRATCH2), Imm8(2)); + ABI_PushRegistersAndAdjustStack({EAX}, 8); + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + ABI_CallFunction(Wifi::Read); + MOV(32, R(RSCRATCH2), R(EAX)); + SHL(32, R(RSCRATCH2), Imm8(16)); + ABI_PopRegistersAndAdjustStack({EAX}, 8); + OR(32, R(EAX), R(RSCRATCH2)); + } + else + { + ABI_PushRegistersAndAdjustStack({RSCRATCH2, R11}, 8); + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + MOVZX(32, 16, ABI_PARAM2, R(R11)); + ABI_CallFunction(Wifi::Write); + ABI_PopRegistersAndAdjustStack({RSCRATCH2, R11}, 8); + SHR(32, R(R11), Imm8(16)); + ADD(32, R(RSCRATCH2), Imm8(2)); + ABI_PushRegistersAndAdjustStack({RSCRATCH2, R11}, 8); + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + MOVZX(32, 16, ABI_PARAM2, R(R11)); + ABI_CallFunction(Wifi::Write); + ABI_PopRegistersAndAdjustStack({RSCRATCH2, R11}, 8); + } + } + break; + case 0x06000000: + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + if (!store) + { + ABI_PushRegistersAndAdjustStack({}, 8); + ABI_CallFunction(GPU::ReadVRAM_ARM7); + ABI_PopRegistersAndAdjustStack({}, 8); + } + else + { + AND(32, R(ABI_PARAM1), Imm32(0x40000 - 1)); + MOV(64, MScaled(ABI_PARAM1, SCALE_4, squeezePointer(cache.ARM7_WVRAM)), Imm32(0)); + MOV(64, MScaled(ABI_PARAM1, SCALE_4, squeezePointer(cache.ARM7_WVRAM) + 8), Imm32(0)); + MOV(32, R(ABI_PARAM2), R(R11)); + JMP((u8*)GPU::WriteVRAM_ARM7, true); + } + break; + case 0x08000000: + case 0x09000000: + case 0x0A000000: + if (!store) + MOV(32, R(RSCRATCH), Imm32(0xFFFFFFFF)); + break; + /*default: + ABI_PushRegistersAndAdjustStack({}, 8, 0); + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + ABI_CallFunction(NDS::ARM7Read32); + ABI_PopRegistersAndAdjustStack({}, 8, 0); + break;*/ + } + + if (!store) + { + MOV(32, R(ECX), MDisp(RSP, 8)); + ROR_(32, R(RSCRATCH), R(ECX)); + } + + RET(); + + 
return res; +} + +OpArg Compiler::A_Comp_GetMemWBOffset() +{ + if (!(CurrentInstr.Instr & (1 << 25))) + return Imm32(CurrentInstr.Instr & 0xFFF); + else + { + int op = (CurrentInstr.Instr >> 5) & 0x3; + int amount = (CurrentInstr.Instr >> 7) & 0x1F; + OpArg rm = MapReg(CurrentInstr.A_Reg(0)); + bool carryUsed; + return Comp_RegShiftImm(op, amount, rm, false, carryUsed); + } +} + +void Compiler::A_Comp_MemWB() +{ + OpArg rn = MapReg(CurrentInstr.A_Reg(16)); + OpArg rd = MapReg(CurrentInstr.A_Reg(12)); + bool load = CurrentInstr.Instr & (1 << 20); + + MOV(32, R(RSCRATCH2), rn); + if (CurrentInstr.Instr & (1 << 24)) + { + OpArg offset = A_Comp_GetMemWBOffset(); + if (CurrentInstr.Instr & (1 << 23)) + ADD(32, R(RSCRATCH2), offset); + else + SUB(32, R(RSCRATCH2), offset); + + if (CurrentInstr.Instr & (1 << 21)) + MOV(32, rn, R(RSCRATCH2)); + } + + u32 cycles = Num ? NDS::ARM7MemTimings[CurrentInstr.CodeCycles][2] : CurrentInstr.CodeCycles; + MOV(32, R(RSCRATCH3), Imm32(cycles)); + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + SHR(32, R(RSCRATCH), Imm8(24)); + AND(32, R(RSCRATCH), Imm8(0xF)); + void** funcArray; + if (load) + funcArray = Num ? ReadMemFuncs7[CodeRegion == 0x02] : ReadMemFuncs9; + else + { + funcArray = Num ? WriteMemFuncs7[CodeRegion == 0x02] : WriteMemFuncs9; + MOV(32, R(R11), rd); + } + CALLptr(MScaled(RSCRATCH, SCALE_8, squeezePointer(funcArray))); + + if (load) + MOV(32, R(RSCRATCH2), R(RSCRATCH)); + + if (!(CurrentInstr.Instr & (1 << 24))) + { + OpArg offset = A_Comp_GetMemWBOffset(); + + if (CurrentInstr.Instr & (1 << 23)) + ADD(32, rn, offset); + else + SUB(32, rn, offset); + } + + if (load) + MOV(32, rd, R(RSCRATCH2)); +} + +void Compiler::T_Comp_MemReg() +{ + OpArg rd = MapReg(CurrentInstr.T_Reg(0)); + OpArg rb = MapReg(CurrentInstr.T_Reg(3)); + OpArg ro = MapReg(CurrentInstr.T_Reg(6)); + + int op = (CurrentInstr.Instr >> 10) & 0x3; + bool load = op & 0x2; + + MOV(32, R(RSCRATCH2), rb); + ADD(32, R(RSCRATCH2), ro); + + u32 cycles = Num ? 
NDS::ARM7MemTimings[CurrentInstr.CodeCycles][0] : (R15 & 0x2 ? 0 : CurrentInstr.CodeCycles); + MOV(32, R(RSCRATCH3), Imm32(cycles)); + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + SHR(32, R(RSCRATCH), Imm8(24)); + AND(32, R(RSCRATCH), Imm8(0xF)); + void** funcArray; + if (load) + funcArray = Num ? ReadMemFuncs7[CodeRegion == 0x02] : ReadMemFuncs9; + else + { + funcArray = Num ? WriteMemFuncs7[CodeRegion == 0x02] : WriteMemFuncs9; + MOV(32, R(R11), rd); + } + CALLptr(MScaled(RSCRATCH, SCALE_8, squeezePointer(funcArray))); + + if (load) + MOV(32, rd, R(RSCRATCH)); +} + +void Compiler::T_Comp_MemImm() +{ + // TODO: clean up!!! + OpArg rd = MapReg(CurrentInstr.T_Reg(0)); + OpArg rb = MapReg(CurrentInstr.T_Reg(3)); + + int op = (CurrentInstr.Instr >> 11) & 0x3; + u32 offset = ((CurrentInstr.Instr >> 6) & 0x1F) * 4; + bool load = op & 0x1; + + LEA(32, RSCRATCH2, MDisp(rb.GetSimpleReg(), offset)); + u32 cycles = Num ? NDS::ARM7MemTimings[CurrentInstr.CodeCycles][0] : (R15 & 0x2 ? 0 : CurrentInstr.CodeCycles); + MOV(32, R(RSCRATCH3), Imm32(cycles)); + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + SHR(32, R(RSCRATCH), Imm8(24)); + AND(32, R(RSCRATCH), Imm8(0xF)); + void** funcArray; + if (load) + funcArray = Num ? 
WriteMemFuncs7[CodeRegion == 0x02] : WriteMemFuncs9; + MOV(32, R(R11), rd); + } + CALLptr(MScaled(RSCRATCH, SCALE_8, squeezePointer(funcArray))); + + if (load) + MOV(32, rd, R(RSCRATCH)); +} + +} \ No newline at end of file diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index e717664..dcd938b 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -83,10 +83,10 @@ enum ak_ALU(BIC), ak_ALU(MVN), - ak_ALU(TST), - ak_ALU(TEQ), - ak_ALU(CMP), - ak_ALU(CMN), + ak_Test(TST), + ak_Test(TEQ), + ak_Test(CMP), + ak_Test(CMN), ak_MUL, ak_MLA, diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 0faa57a..ae04ffb 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -53,6 +53,7 @@ add_library(core STATIC ARMJIT.cpp ARMJIT_x64/ARMJIT_Compiler.cpp ARMJIT_x64/ARMJIT_ALU.cpp + ARMJIT_x64/ARMJIT_LoadStore.cpp dolphin/CommonFuncs.cpp dolphin/x64ABI.cpp diff --git a/src/dolphin/x64ABI.h b/src/dolphin/x64ABI.h index 997782e..94336d0 100644 --- a/src/dolphin/x64ABI.h +++ b/src/dolphin/x64ABI.h @@ -37,7 +37,8 @@ // xmm0-xmm15 use the upper 16 bits in the functions that push/pop registers. #define ABI_ALL_CALLER_SAVED \ - (BitSet32{RAX, RCX, RDX, R8, R9, R10, R11}) + (BitSet32{RAX, RCX, RDX, R8, R9, R10, R11, XMM0 + 16, XMM1 + 16, XMM2 + 16, XMM3 + 16, \ + XMM4 + 16, XMM5 + 16}) #else // 64-bit Unix / OS X #define ABI_PARAM1 RDI -- cgit v1.2.3