diff options
author | RSDuck <rsduck@users.noreply.github.com> | 2019-07-10 00:57:59 +0200 |
---|---|---|
committer | RSDuck <rsduck@users.noreply.github.com> | 2020-04-26 13:02:57 +0200 |
commit | ff9721111441e69b4a276a34c757476b625213c6 (patch) | |
tree | 5de28ab1c7eef4699c9dd3278957576daf9d9074 | |
parent | 2c44bf927c230efbbd1b27920de062ddcc631fcf (diff) |
jit: thumb block transfer working
also pc and sp relative loads and some refactoring
-rw-r--r-- | src/ARMJIT_RegisterCache.h (renamed from src/ARMJIT_RegCache.h) | 6 | ||||
-rw-r--r-- | src/ARMJIT_x64/ARMJIT_Compiler.cpp | 82 | ||||
-rw-r--r-- | src/ARMJIT_x64/ARMJIT_Compiler.h | 19 | ||||
-rw-r--r-- | src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 515 | ||||
-rw-r--r-- | src/ARM_InstrInfo.cpp | 46 |
5 files changed, 549 insertions, 119 deletions
diff --git a/src/ARMJIT_RegCache.h b/src/ARMJIT_RegisterCache.h index 556d27b..04c1eda 100644 --- a/src/ARMJIT_RegCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -12,13 +12,13 @@ namespace ARMJIT { template <typename T, typename Reg> -class RegCache +class RegisterCache { public: - RegCache() + RegisterCache() {} - RegCache(T* compiler, FetchedInstr instrs[], int instrsCount) + RegisterCache(T* compiler, FetchedInstr instrs[], int instrsCount) : Compiler(compiler), Instrs(instrs), InstrsCount(instrsCount) { for (int i = 0; i < 16; i++) diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index b7358a2..4fe0c70 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -9,20 +9,20 @@ using namespace Gen; namespace ARMJIT { template <> -const X64Reg RegCache<Compiler, X64Reg>::NativeRegAllocOrder[] = +const X64Reg RegisterCache<Compiler, X64Reg>::NativeRegAllocOrder[] = { #ifdef _WIN32 - RBX, RSI, RDI, R12, R13 + RBX, RSI, RDI, R12, R13, R14 #else - RBX, R12, R13 + RBX, R12, R13, R14 // this is sad #endif }; template <> -const int RegCache<Compiler, X64Reg>::NativeRegsAvailable = +const int RegisterCache<Compiler, X64Reg>::NativeRegsAvailable = #ifdef _WIN32 - 5 + 6 #else - 3 + 4 #endif ; @@ -39,10 +39,47 @@ Compiler::Compiler() MemoryFuncs7[i][j][1] = Gen_MemoryRoutine7(j, true, 8 << i); } } + for (int i = 0; i < 2; i++) + for (int j = 0; j < 2; j++) + { + MemoryFuncsSeq9[i][j] = Gen_MemoryRoutineSeq9(i, j); + MemoryFuncsSeq7[i][j][0] = Gen_MemoryRoutineSeq7(i, j, false); + MemoryFuncsSeq7[i][j][1] = Gen_MemoryRoutineSeq7(i, j, true); + } ResetStart = GetWritableCodePtr(); } +void* Compiler::Gen_ChangeCPSRRoutine() +{ + void* res = (void*)GetWritableCodePtr(); + + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + CMP(32, R(RSCRATCH), Imm8(0x11)); + FixupBranch fiq = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x12)); + FixupBranch irq = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x13)); + FixupBranch svc = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x17)); + FixupBranch abt = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x1B)); + FixupBranch und = J_CC(CC_E); + + SetJumpTarget(fiq); + + SetJumpTarget(irq); + + SetJumpTarget(svc); + + SetJumpTarget(abt); + + SetJumpTarget(und); + + return res; +} + DataRegion Compiler::ClassifyAddress(u32 addr) { if (Num == 0 && addr >= ((ARMv5*)CurCPU)->DTCMBase && addr < ((ARMv5*)CurCPU)->DTCMBase) @@ -106,12 +143,11 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs ABI_PushRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS}, 8, 16); MOV(64, R(RCPU), ImmPtr(cpu)); - XOR(32, R(RCycles), R(RCycles)); LoadCPSR(); // TODO: this is ugly as a whole, do better - RegCache = ARMJIT::RegCache<Compiler, X64Reg>(this, instrs, instrsCount); + RegCache = RegisterCache<Compiler, X64Reg>(this, instrs, instrsCount); for (int i = 0; i < instrsCount; i++) { @@ -242,7 +278,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs RegCache.Flush(); SaveCPSR(); - LEA(32, RAX, MDisp(RCycles, ConstantCycles)); + MOV(32, R(RAX), Imm32(ConstantCycles)); ABI_PopRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS}, 8, 16); RET(); @@ -306,18 +342,20 @@ CompileFunc Compiler::GetCompFunc(int kind) NULL, NULL, NULL, NULL, NULL, // STR A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, + //NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, // STRB + //NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, // LDR + //NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, // LDRB + //NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, // STRH A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, - // LDRD - NULL, NULL, NULL, NULL, - // STRD - NULL, NULL, NULL, NULL, + // LDRD, STRD never used by anything so they stay interpreted (by anything I mean the 5 games I checked) + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, // LDRH A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, // LDRSB @@ -360,10 +398,14 @@ CompileFunc Compiler::GetCompFunc(int kind) T_Comp_MemImm, T_Comp_MemImm, T_Comp_MemImm, T_Comp_MemImm, // LDR/STR half imm offset T_Comp_MemImmHalf, T_Comp_MemImmHalf, - // branch, etc. - NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL + // LDR/STR sp rel + NULL, NULL, + // PUSH/POP + NULL, NULL, + // LDMIA, STMIA + NULL, NULL, + NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL }; return Thumb ? T_Comp[kind] : A_Comp[kind]; @@ -376,7 +418,7 @@ void Compiler::Comp_AddCycles_C() : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles); if (CurInstr.Cond() < 0xE) - ADD(32, R(RCycles), Imm8(cycles)); + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } @@ -388,13 +430,15 @@ void Compiler::Comp_AddCycles_CI(u32 i) : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles)) + i; if (CurInstr.Cond() < 0xE) - ADD(32, R(RCycles), Imm8(cycles)); + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); else ConstantCycles += cycles; } void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) { + // potentieller Bug: falls ein Register das noch gecacht ist, beim Modeswitch gespeichert + // wird der alte Wert gespeichert SaveCPSR(); MOV(64, R(ABI_PARAM1), R(RCPU)); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 9395a29..a751737 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -4,7 +4,7 @@ #include "../dolphin/x64Emitter.h" #include "../ARMJIT.h" -#include "../ARMJIT_RegCache.h" +#include "../ARMJIT_RegisterCache.h" #include <tuple> @@ -12,7 +12,6 @@ namespace ARMJIT { const Gen::X64Reg RCPU = Gen::RBP; -const Gen::X64Reg RCycles = Gen::R14; const Gen::X64Reg RCPSR = Gen::R15; const Gen::X64Reg RSCRATCH = Gen::EAX; @@ -72,6 +71,7 @@ private: void A_Comp_MemWB(); void A_Comp_MemHalf(); + void A_Comp_LDM_STM(); void T_Comp_ShiftImm(); void T_Comp_AddSub_(); @@ -86,8 +86,13 @@ private: void T_Comp_MemImm(); void T_Comp_MemRegHalf(); void T_Comp_MemImmHalf(); + void T_Comp_LoadPCRel(); + void T_Comp_MemSPRel(); + void T_Comp_PUSH_POP(); + void T_Comp_LDMIA_STMIA(); void Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size); + s32 Comp_MemAccessBlock(Gen::OpArg rb, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); @@ -100,6 +105,11 @@ private: void* Gen_MemoryRoutine9(bool store, int size); void* Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size); + void* Gen_MemoryRoutineSeq9(bool store, bool preinc); + void* Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM); + + void* Gen_ChangeCPSRRoutine(); + Gen::OpArg Comp_RegShiftImm(int op, int amount, Gen::OpArg rm, bool S, bool& carryUsed); Gen::OpArg Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed); @@ -122,11 +132,14 @@ private: void* MemoryFuncs9[3][2]; void* MemoryFuncs7[3][2][2]; + void* MemoryFuncsSeq9[2][2]; + void* MemoryFuncsSeq7[2][2][2]; + bool CPSRDirty = false; FetchedInstr CurInstr; - RegCache<Compiler, Gen::X64Reg> RegCache; + RegisterCache<Compiler, Gen::X64Reg> RegCache; bool Thumb; u32 Num; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 69746e2..20e1893 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -3,16 +3,6 @@ #include "../GPU.h" #include "../Wifi.h" -namespace NDS -{ -extern u8* SWRAM_ARM9; -extern u32 SWRAM_ARM9Mask; -extern u8* SWRAM_ARM7; -extern u32 SWRAM_ARM7Mask; -extern u8 ARM7WRAM[]; -extern u16 ARM7BIOSProt; -} - using namespace Gen; namespace ARMJIT @@ -41,6 +31,49 @@ int squeezePointer(T* ptr) store value - ABI_PARAM2 (a.k.a. RDX = RSCRATCH2 on Windows) code cycles - ABI_PARAM3 */ + +#define CALC_CYCLES_9(numC, numD, scratch) \ + LEA(32, scratch, MComplex(numD, numC, SCALE_1, -6)); \ + CMP(32, R(numC), R(numD)); \ + CMOVcc(32, numD, R(numC), CC_G); \ + CMP(32, R(numD), R(scratch)); \ + CMOVcc(32, scratch, R(numD), CC_G); \ + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); +#define CALC_CYCLES_7_DATA_MAIN_RAM(numC, numD, scratch) \ + if (codeMainRAM) \ + { \ + LEA(32, scratch, MRegSum(numD, numC)); \ + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ + } \ + else \ + { \ + if (!store) \ + ADD(32, R(numC), Imm8(1)); \ + LEA(32, scratch, MComplex(numD, numC, SCALE_1, -3)); \ + CMP(32, R(numD), R(numC)); \ + CMOVcc(32, numC, R(numD), CC_G); \ + CMP(32, R(numC), R(scratch)); \ + CMOVcc(32, scratch, R(numC), CC_G); \ + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ + } +#define CALC_CYCLES_7_DATA_NON_MAIN_RAM(numC, numD, scratch) \ + if (codeMainRAM) \ + { \ + if (!store) \ + ADD(32, R(numD), Imm8(1)); \ + LEA(32, scratch, MComplex(numD, numC, SCALE_1, -3)); \ + CMP(32, R(numD), R(numC)); \ + CMOVcc(32, numC, R(numD), CC_G); \ + CMP(32, R(numC), R(scratch)); \ + CMOVcc(32, scratch, R(numC), CC_G); \ + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ + } \ + else \ + { \ + LEA(32, scratch, MComplex(numD, numC, SCALE_1, store ? 0 : 1)); \ + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ + } + void* Compiler::Gen_MemoryRoutine9(bool store, int size) { u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 1 : 0)); @@ -56,15 +89,10 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) FixupBranch insideITCM = J_CC(CC_B); // cycle counting! - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(12)); - MOVZX(32, 8, RSCRATCH, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, MemTimings) + (size == 32 ? 2 : 0))); - LEA(32, ABI_PARAM4, MComplex(RSCRATCH, ABI_PARAM3, SCALE_1, -6)); - CMP(32, R(ABI_PARAM3), R(RSCRATCH)); - CMOVcc(32, RSCRATCH, R(ABI_PARAM3), CC_G); - CMP(32, R(ABI_PARAM4), R(RSCRATCH)); - CMOVcc(32, RSCRATCH, R(ABI_PARAM4), CC_G); - ADD(32, R(RCycles), R(RSCRATCH)); + MOV(32, R(ABI_PARAM4), R(ABI_PARAM1)); + SHR(32, R(ABI_PARAM4), Imm8(12)); + MOVZX(32, 8, ABI_PARAM4, MComplex(RCPU, ABI_PARAM4, SCALE_4, offsetof(ARMv5, MemTimings) + (size == 32 ? 2 : 1))); + CALC_CYCLES_9(ABI_PARAM3, ABI_PARAM4, RSCRATCH) if (store) { @@ -101,7 +129,7 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) } SetJumpTarget(insideDTCM); - ADD(32, R(RCycles), R(ABI_PARAM3)); + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM3)); AND(32, R(RSCRATCH), Imm32(0x3FFF & addressMask)); if (store) MOV(size, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)), R(ABI_PARAM2)); @@ -120,7 +148,7 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) RET(); SetJumpTarget(insideITCM); - ADD(32, R(RCycles), R(ABI_PARAM3)); + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM3)); MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); // free up ECX AND(32, R(ABI_PARAM3), Imm32(0x7FFF & addressMask)); if (store) @@ -158,28 +186,13 @@ void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size) MOV(32, R(RSCRATCH), R(ABI_PARAM1)); SHR(32, R(RSCRATCH), Imm8(15)); - MOVZX(32, 8, ABI_PARAM4, MDisp(RSCRATCH, (size == 32 ? 2 : 0) + squeezePointer(NDS::ARM7MemTimings))); + MOVZX(32, 8, ABI_PARAM4, MScaled(RSCRATCH, SCALE_4, (size == 32 ? 2 : 0) + squeezePointer(NDS::ARM7MemTimings))); MOV(32, R(RSCRATCH), R(ABI_PARAM1)); AND(32, R(RSCRATCH), Imm32(0xFF000000)); CMP(32, R(RSCRATCH), Imm32(0x02000000)); FixupBranch outsideMainRAM = J_CC(CC_NE); - if (codeMainRAM) - { - LEA(32, RSCRATCH, MRegSum(ABI_PARAM4, ABI_PARAM3)); - ADD(32, R(RCycles), R(RSCRATCH)); - } - else - { - if (!store) - ADD(32, R(ABI_PARAM3), Imm8(1)); - LEA(32, RSCRATCH, MComplex(ABI_PARAM4, ABI_PARAM3, SCALE_1, -3)); - CMP(32, R(ABI_PARAM4), R(ABI_PARAM3)); - CMOVcc(32, ABI_PARAM3, R(ABI_PARAM4), CC_G); - CMP(32, R(ABI_PARAM3), R(RSCRATCH)); - CMOVcc(32, RSCRATCH, R(ABI_PARAM3), CC_G); - ADD(32, R(RCycles), R(RSCRATCH)); - } + CALC_CYCLES_7_DATA_MAIN_RAM(ABI_PARAM3, ABI_PARAM4, RSCRATCH) MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); AND(32, R(ABI_PARAM3), Imm32((MAIN_RAM_SIZE - 1) & addressMask)); if (store) @@ -205,22 +218,7 @@ void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size) RET(); SetJumpTarget(outsideMainRAM); - if (codeMainRAM) - { - if (!store) - ADD(32, R(ABI_PARAM4), Imm8(1)); - LEA(32, RSCRATCH, MComplex(ABI_PARAM4, ABI_PARAM3, SCALE_1, -3)); - CMP(32, R(ABI_PARAM4), R(ABI_PARAM3)); - CMOVcc(32, ABI_PARAM3, R(ABI_PARAM4), CC_G); - CMP(32, R(ABI_PARAM3), R(RSCRATCH)); - CMOVcc(32, RSCRATCH, R(ABI_PARAM3), CC_G); - ADD(32, R(RCycles), R(RSCRATCH)); - } - else - { - LEA(32, RSCRATCH, MComplex(ABI_PARAM4, ABI_PARAM3, SCALE_1, store ? 0 : 1)); - ADD(32, R(RCycles), R(RSCRATCH)); - } + CALC_CYCLES_7_DATA_NON_MAIN_RAM(ABI_PARAM3, ABI_PARAM4, RSCRATCH) if (store) { if (size > 8) @@ -257,7 +255,189 @@ void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size) return res; } -void Compiler::Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size) +#define MEMORY_SEQ_WHILE_COND \ + if (!store) \ + MOV(32, currentElement, R(EAX));\ + if (!preinc) \ + ADD(32, R(ABI_PARAM1), Imm8(4)); \ + \ + SUB(32, R(ABI_PARAM3), Imm8(1)); \ + J_CC(CC_NZ, repeat); + +/* + ABI_PARAM1 address + ABI_PARAM2 address where registers are stored + ABI_PARAM3 how many values to read/write + ABI_PARAM4 code cycles + + Dolphin x64CodeEmitter is my favourite assembler + */ +void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) +{ + const u8* zero = GetCodePtr(); + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM4)); + RET(); + + void* res = (void*)GetWritableCodePtr(); + + TEST(32, R(ABI_PARAM3), R(ABI_PARAM3)); + J_CC(CC_Z, zero); + + PUSH(ABI_PARAM3); + PUSH(ABI_PARAM4); // we need you later + + const u8* repeat = GetCodePtr(); + + if (preinc) + ADD(32, R(ABI_PARAM1), Imm8(4)); + + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + SUB(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); + CMP(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMSize))); + FixupBranch insideDTCM = J_CC(CC_B); + + CMP(32, R(ABI_PARAM1), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); + FixupBranch insideITCM = J_CC(CC_B); + + OpArg currentElement = MComplex(ABI_PARAM2, ABI_PARAM3, SCALE_8, -8); // wasting stack space like a gangster + + ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); + AND(32, R(ABI_PARAM1), Imm8(~3)); + if (store) + { + MOV(32, R(ABI_PARAM2), currentElement); + CALL((void*)NDS::ARM9Write32); + } + else + CALL((void*)NDS::ARM9Read32); + ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); + + MEMORY_SEQ_WHILE_COND + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + SHR(32, R(RSCRATCH), Imm8(12)); + MOVZX(32, 8, ABI_PARAM2, MComplex(RCPU, RSCRATCH, SCALE_4, 2 + offsetof(ARMv5, MemTimings))); + MOVZX(32, 8, RSCRATCH, MComplex(RCPU, RSCRATCH, SCALE_4, 3 + offsetof(ARMv5, MemTimings))); + + FixupBranch finishIt1 = J(); + + SetJumpTarget(insideDTCM); + AND(32, R(RSCRATCH), Imm32(0x3FFF & ~3)); + if (store) + { + MOV(32, R(ABI_PARAM4), currentElement); + MOV(32, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)), R(ABI_PARAM4)); + } + else + MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM))); + + MEMORY_SEQ_WHILE_COND + MOV(32, R(RSCRATCH), Imm32(1)); // sequential access time + MOV(32, R(ABI_PARAM2), Imm32(1)); // non sequential + FixupBranch finishIt2 = J(); + + SetJumpTarget(insideITCM); + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + AND(32, R(RSCRATCH), Imm32(0x7FFF & ~3)); + if (store) + { + MOV(32, R(ABI_PARAM4), currentElement); + MOV(32, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM4)); + XOR(32, R(ABI_PARAM4), R(ABI_PARAM4)); + MOV(64, MScaled(RSCRATCH, SCALE_4, squeezePointer(cache.ARM9_ITCM)), R(ABI_PARAM4)); + MOV(64, MScaled(RSCRATCH, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), R(ABI_PARAM4)); + } + else + MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM))); + + MEMORY_SEQ_WHILE_COND + MOV(32, R(RSCRATCH), Imm32(1)); + MOV(32, R(ABI_PARAM2), Imm32(1)); + + SetJumpTarget(finishIt1); + SetJumpTarget(finishIt2); + + POP(ABI_PARAM4); + POP(ABI_PARAM3); + + CMP(32, R(ABI_PARAM3), Imm8(1)); + FixupBranch skipSequential = J_CC(CC_E); + SUB(32, R(ABI_PARAM3), Imm8(1)); + IMUL(32, R(ABI_PARAM3)); + ADD(32, R(ABI_PARAM2), R(RSCRATCH)); + SetJumpTarget(skipSequential); + + CALC_CYCLES_9(ABI_PARAM4, ABI_PARAM2, RSCRATCH) + RET(); + + return res; +} + +void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM) +{ + const u8* zero = GetCodePtr(); + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM4)); + RET(); + + void* res = (void*)GetWritableCodePtr(); + + TEST(32, R(ABI_PARAM3), R(ABI_PARAM3)); + J_CC(CC_Z, zero); + + PUSH(ABI_PARAM3); + PUSH(ABI_PARAM4); // we need you later + + const u8* repeat = GetCodePtr(); + + if (preinc) + ADD(32, R(ABI_PARAM1), Imm8(4)); + + OpArg currentElement = MComplex(ABI_PARAM2, ABI_PARAM3, SCALE_8, -8); + + ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); + AND(32, R(ABI_PARAM1), Imm8(~3)); + if (store) + { + MOV(32, R(ABI_PARAM2), currentElement); + CALL((void*)NDS::ARM7Write32); + } + else + CALL((void*)NDS::ARM7Read32); + ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); + + MEMORY_SEQ_WHILE_COND + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + SHR(32, R(RSCRATCH), Imm8(15)); + MOVZX(32, 8, ABI_PARAM2, MScaled(RSCRATCH, SCALE_4, 2 + squeezePointer(NDS::ARM7MemTimings))); + MOVZX(32, 8, RSCRATCH, MScaled(RSCRATCH, SCALE_4, 3 + squeezePointer(NDS::ARM7MemTimings))); + + POP(ABI_PARAM4); + POP(ABI_PARAM3); + + CMP(32, R(ABI_PARAM3), Imm8(1)); + FixupBranch skipSequential = J_CC(CC_E); + SUB(32, R(ABI_PARAM3), Imm8(1)); + IMUL(32, R(ABI_PARAM3)); + ADD(32, R(ABI_PARAM2), R(RSCRATCH)); + SetJumpTarget(skipSequential); + + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + AND(32, R(RSCRATCH), Imm32(0xFF000000)); + CMP(32, R(RSCRATCH), Imm32(0x02000000)); + FixupBranch outsideMainRAM = J_CC(CC_NE); + CALC_CYCLES_7_DATA_MAIN_RAM(ABI_PARAM4, ABI_PARAM2, RSCRATCH) + RET(); + + SetJumpTarget(outsideMainRAM); + CALC_CYCLES_7_DATA_NON_MAIN_RAM(ABI_PARAM4, ABI_PARAM2, RSCRATCH) + RET(); + + return res; +} + +#undef CALC_CYCLES_9 +#undef MEMORY_SEQ_WHILE_COND + +void Compiler::Comp_MemAccess(OpArg rd, bool signExtend, bool store, int size) { if (store) MOV(32, R(ABI_PARAM2), rd); @@ -278,6 +458,129 @@ void Compiler::Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int si } } +s32 Compiler::Comp_MemAccessBlock(OpArg rb, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode) +{ + int regsCount = regs.Count(); + + const u8 userModeOffsets[] = + { + offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), + offsetof(ARM, R[12]), offsetof(ARM, R[13]), offsetof(ARM, R[14]), 0, + + offsetof(ARM, R_FIQ[0]), offsetof(ARM, R_FIQ[1]), offsetof(ARM, R_FIQ[2]), offsetof(ARM, R_FIQ[3]), + offsetof(ARM, R_FIQ[4]), offsetof(ARM, R_FIQ[5]), offsetof(ARM, R_FIQ[6]), 0, + + offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), + offsetof(ARM, R[12]), offsetof(ARM, R_IRQ[13]), offsetof(ARM, R_IRQ[14]), 0, + + offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), + offsetof(ARM, R[12]), offsetof(ARM, R_SVC[13]), offsetof(ARM, R_SVC[14]), 0, + + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + + offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), + offsetof(ARM, R[12]), offsetof(ARM, R_ABT[13]), offsetof(ARM, R_ABT[14]), 0, + + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + + offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]), + offsetof(ARM, R[12]), offsetof(ARM, R_UND[13]), offsetof(ARM, R_UND[14]), 0, + }; + + if (decrement) + { + MOV_sum(32, ABI_PARAM1, rb, Imm32(-regsCount * 4)); + preinc = !preinc; + } + else + MOV(32, R(ABI_PARAM1), rb); + + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); + u32 cycles = Num + ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] + : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); + MOV(32, R(ABI_PARAM4), Imm32(cycles)); + if (!store) + { + SUB(32, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); + MOV(64, R(ABI_PARAM2), R(RSP)); + + CALL(Num == 0 + ? MemoryFuncsSeq9[0][preinc] + : MemoryFuncsSeq7[0][preinc][CodeRegion == 0x02]); + + for (int reg = 15; reg >= 0; reg--) + { + if (regs[reg]) + { + if (usermode && reg >= 8 && reg < 15) + { + MOV(32, R(RSCRATCH2), R(RCPSR)); + AND(32, R(RSCRATCH2), Imm8(0x1F)); + // (RSCRATCH2 - 0x11) * 8 + squeezePointer(userModeOffsets) + (reg - 8), algebra is great! + MOVZX(32, 8, RSCRATCH2, MScaled(RSCRATCH2, SCALE_8, squeezePointer(userModeOffsets) - 0x11 * 8 + (reg - 8))); + POP(RSCRATCH); + MOV(32, MRegSum(RCPU, RSCRATCH2), R(RSCRATCH)); + } + else if (RegCache.Mapping[reg] == INVALID_REG) + { + assert(reg != 15); + + POP(RSCRATCH); + SaveReg(reg, RSCRATCH); + } + else + { + if (reg != 15) + RegCache.DirtyRegs |= (1 << reg); + POP(MapReg(reg).GetSimpleReg()); + } + } + } + + if (regs[15]) + { + if (Num == 1) + OR(32, MapReg(15), Imm8(1)); + Comp_JumpTo(MapReg(15).GetSimpleReg(), usermode); + } + } + else + { + for (int reg : regs) + { + if (usermode && reg >= 8 && reg < 15) + { + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + // (RSCRATCH2 - 0x11) * 8 + squeezePointer(userModeOffsets) + (reg - 8), algebra is great! + MOVZX(32, 8, RSCRATCH, MScaled(RSCRATCH, SCALE_8, squeezePointer(userModeOffsets) - 0x11 * 8 + (reg - 8))); + MOV(32, R(RSCRATCH), MRegSum(RCPU, RSCRATCH)); + PUSH(RSCRATCH); + } + else if (RegCache.Mapping[reg] == INVALID_REG) + { + LoadReg(reg, RSCRATCH); + PUSH(RSCRATCH); + } + else + PUSH(MapReg(reg).GetSimpleReg()); + } + MOV(64, R(ABI_PARAM2), R(RSP)); + + CALL(Num == 0 + ? MemoryFuncsSeq9[1][preinc] + : MemoryFuncsSeq7[1][preinc][CodeRegion == 0x02]); + + ADD(32, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); + } + + return (regsCount * 4) * (decrement ? -1 : 1); +} + OpArg Compiler::A_Comp_GetMemWBOffset() { if (!(CurInstr.Instr & (1 << 25))) @@ -354,6 +657,25 @@ void Compiler::A_Comp_MemHalf() ? Imm32(CurInstr.Instr & 0xF | ((CurInstr.Instr >> 4) & 0xF0)) : MapReg(CurInstr.A_Reg(0)); + int op = (CurInstr.Instr >> 5) & 0x3; + bool load = CurInstr.Instr & (1 << 20); + + bool signExtend = false; + int size; + if (!load) + { + size = op == 1 ? 16 : 32; + load = op == 2; + } + else if (load) + { + size = op == 2 ? 8 : 16; + signExtend = op > 1; + } + + if (size == 32 && Num == 1) + return; // NOP + if (CurInstr.Instr & (1 << 24)) { if (CurInstr.Instr & (1 << 23)) @@ -370,19 +692,6 @@ void Compiler::A_Comp_MemHalf() else MOV(32, R(ABI_PARAM1), rn); - int op = (CurInstr.Instr >> 5) & 0x3; - bool load = CurInstr.Instr & (1 << 20); - - bool signExtend = false; - int size; - if (!load && op == 1) - size = 16; - else if (load) - { - size = op == 2 ? 8 : 16; - signExtend = op > 1; - } - if (!(CurInstr.Instr & (1 << 24))) { if (CurInstr.Instr & (1 << 23)) @@ -412,6 +721,24 @@ void Compiler::T_Comp_MemReg() Comp_MemAccess(rd, false, !load, byte ? 8 : 32); } +void Compiler::A_Comp_LDM_STM() +{ + BitSet16 regs(CurInstr.Instr & 0xFFFF); + + bool load = (CurInstr.Instr >> 20) & 1; + bool pre = (CurInstr.Instr >> 24) & 1; + bool add = (CurInstr.Instr >> 23) & 1; + bool writeback = (CurInstr.Instr >> 21) & 1; + bool usermode = (CurInstr.Instr >> 22) & 1; + + OpArg rn = MapReg(CurInstr.A_Reg(16)); + + s32 offset = Comp_MemAccessBlock(rn, regs, !load, pre, !add, false); + + if (writeback) + ADD(32, rn, offset >= INT8_MIN && offset < INT8_MAX ? Imm8(offset) : Imm32(offset)); +} + void Compiler::T_Comp_MemImm() { OpArg rd = MapReg(CurInstr.T_Reg(0)); @@ -456,4 +783,56 @@ void Compiler::T_Comp_MemImmHalf() Comp_MemAccess(rd, false, !load, 16); } +void Compiler::T_Comp_LoadPCRel() +{ + OpArg rd = MapReg(CurInstr.T_Reg(8)); + u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2); + + // hopefully this doesn't break + u32 val; CurCPU->DataRead32(addr, &val); + MOV(32, rd, Imm32(val)); +} + +void Compiler::T_Comp_MemSPRel() +{ + u32 offset = (CurInstr.Instr & 0xFF) * 4; + OpArg rd = MapReg(CurInstr.T_Reg(8)); + bool load = CurInstr.Instr & (1 << 11); + + LEA(32, ABI_PARAM1, MDisp(MapReg(13).GetSimpleReg(), offset)); + + Comp_MemAccess(rd, false, !load, 32); +} + +void Compiler::T_Comp_PUSH_POP() +{ + bool load = CurInstr.Instr & (1 << 11); + BitSet16 regs(CurInstr.Instr & 0xFF); + if (CurInstr.Instr & (1 << 8)) + { + if (load) + regs[15] = true; + else + regs[14] = true; + } + + OpArg sp = MapReg(13); + + s32 offset = Comp_MemAccessBlock(sp, regs, !load, !load, !load, false); + + ADD(32, sp, Imm8(offset)); // offset will be always be in range since PUSH accesses 9 regs max +} + +void Compiler::T_Comp_LDMIA_STMIA() +{ + BitSet16 regs(CurInstr.Instr & 0xFF); + OpArg rb = MapReg(CurInstr.T_Reg(8)); + bool load = CurInstr.Instr & (1 << 11); + + s32 offset = Comp_MemAccessBlock(rb, regs, !load, false, false, false); + + if (!load || !regs[CurInstr.T_Reg(8)]) + ADD(32, rb, Imm8(offset)); +} + }
\ No newline at end of file diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 32a9645..c519229 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -25,9 +25,7 @@ enum { A_Link = 1 << 10, - A_LDMSTM = 1 << 11, - - A_ARM9Only = 1 << 12, + A_UnkOnARM7 = 1 << 11, }; #define A_BIOP A_Read16 @@ -97,12 +95,12 @@ const u32 A_SMULWy = A_Write16 | A_Read0 | A_Read8 | ak(ak_SMULWy); const u32 A_SMLALxy = A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLALxy); const u32 A_SMULxy = A_Write16 | A_Read0 | A_Read8 | ak(ak_SMULxy); -const u32 A_CLZ = A_Write12 | A_Read0 | A_ARM9Only | ak(ak_CLZ); +const u32 A_CLZ = A_Write12 | A_Read0 | A_UnkOnARM7 | ak(ak_CLZ); -const u32 A_QADD = A_Write12 | A_Read0 | A_Read16 | A_ARM9Only | ak(ak_QADD); -const u32 A_QSUB = A_Write12 | A_Read0 | A_Read16 | A_ARM9Only | ak(ak_QSUB); -const u32 A_QDADD = A_Write12 | A_Read0 | A_Read16 | A_ARM9Only | ak(ak_QDADD); -const u32 A_QDSUB = A_Write12 | A_Read0 | A_Read16 | A_ARM9Only | ak(ak_QDSUB); +const u32 A_QADD = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QADD); +const u32 A_QSUB = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QSUB); +const u32 A_QDADD = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDADD); +const u32 A_QDSUB = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDSUB); #define A_LDR A_Write12 #define A_STR A_Read12 @@ -144,8 +142,8 @@ A_IMPLEMENT_HD_LDRSTR(LDRSH,LDR) const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | ak(ak_SWP); const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | ak(ak_SWPB); -const u32 A_LDM = A_Read16 | A_LDMSTM | ak(ak_LDM); -const u32 A_STM = A_Read16 | A_LDMSTM | ak(ak_STM); +const u32 A_LDM = A_Read16 | A_MemWriteback | ak(ak_LDM); +const u32 A_STM = A_Read16 | A_MemWriteback | ak(ak_STM); const u32 A_B = A_BranchAlways | ak(ak_B); const u32 A_BL = A_BranchAlways | A_Link | ak(ak_BL); @@ -154,11 +152,11 @@ const u32 A_BX = A_BranchAlways | A_Read0 | ak(ak_BX); const u32 A_BLX_REG = A_BranchAlways | A_Link | A_Read0 | ak(ak_BLX_REG); const u32 A_UNK = A_BranchAlways | A_Link | ak(ak_UNK); -const u32 A_MSR_IMM = A_ARM9Only | ak(ak_MSR_IMM); -const u32 A_MSR_REG = A_Read0 | A_ARM9Only | ak(ak_MSR_REG); -const u32 A_MRS = A_Write12 | A_ARM9Only | ak(ak_MRS); -const u32 A_MCR = A_Read12 | A_ARM9Only | ak(ak_MCR); -const u32 A_MRC = A_Write12 | A_ARM9Only | ak(ak_MRC); +const u32 A_MSR_IMM = A_UnkOnARM7 | ak(ak_MSR_IMM); +const u32 A_MSR_REG = A_Read0 | A_UnkOnARM7 | ak(ak_MSR_REG); +const u32 A_MRS = A_Write12 | A_UnkOnARM7 | ak(ak_MRS); +const u32 A_MCR = A_Read12 | A_UnkOnARM7 | ak(ak_MCR); +const u32 A_MRC = A_Write12 | A_UnkOnARM7 | ak(ak_MRC); const u32 A_SVC = A_BranchAlways | A_Link | ak(ak_SVC); // THUMB @@ -249,7 +247,7 @@ const u32 T_LDRH_IMM = T_Write0 | T_Read3 | tk(tk_LDRH_IMM); const u32 T_STR_SPREL = T_Read8 | T_ReadR13 | tk(tk_STR_SPREL); const u32 T_LDR_SPREL = T_Write8 | T_ReadR13 | tk(tk_LDR_SPREL); -const u32 T_PUSH = T_ReadR15 | T_ReadR13 | T_WriteR13 | tk(tk_PUSH); +const u32 T_PUSH = T_ReadR13 | T_WriteR13 | tk(tk_PUSH); const u32 T_POP = T_PopPC | T_ReadR13 | T_WriteR13 | tk(tk_POP); const u32 T_LDMIA = T_Read8 | T_Write8 | tk(tk_LDMIA); @@ -320,8 +318,10 @@ Info Decode(bool thumb, u32 num, u32 instr) if (num == 0 && (instr & 0xFE000000) == 0xFA000000) data = A_BLX_IMM; - if (data & A_ARM9Only && num != 0) - data |= A_BranchAlways | A_Link; + if (data & A_UnkOnARM7 && num != 0) + data = A_UNK; + + res.Kind = (data >> 13) & 0x1FF; if (data & A_Read0) res.SrcRegs |= 1 << (instr & 0xF); @@ -360,14 +360,8 @@ Info Decode(bool thumb, u32 num, u32 instr) res.SrcRegs |= 1 << 15; } - if (data & A_LDMSTM) - { - res.DstRegs |= instr & (!!(instr & (1 << 20)) << 15); - if (instr & (1 << 21)) - res.DstRegs |= 1 << ((instr >> 16) & 0xF); - } - - res.Kind = (data >> 13) & 0x1FF; + if (res.Kind == ak_LDM) + res.DstRegs |= instr & (1 << 15); // this is right return res; } |