From ebce9f035ff05b414f1bb895beabb62bc539ac76 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Tue, 25 Jun 2019 17:09:27 +0200 Subject: JIT: implemented most ALU instructions --- src/ARMJIT_x64/ARMJIT_ALU.cpp | 546 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 546 insertions(+) create mode 100644 src/ARMJIT_x64/ARMJIT_ALU.cpp (limited to 'src/ARMJIT_x64/ARMJIT_ALU.cpp') diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp new file mode 100644 index 0000000..d06c99c --- /dev/null +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -0,0 +1,546 @@ +#include "ARMJIT_Compiler.h" + +using namespace Gen; + +namespace ARMJIT +{ + +// uses RSCRATCH3 +void Compiler::Comp_ArithTriOp(void (Compiler::*op)(int, const OpArg&, const OpArg&), + OpArg rd, OpArg rn, OpArg op2, bool carryUsed, int opFlags) +{ + if (opFlags & opSyncCarry) + { + BT(32, R(RCPSR), Imm8(29)); + if (opFlags & opInvertCarry) + CMC(); + } + + if (rd == rn && !(opFlags & opInvertOp2)) + (this->*op)(32, rd, op2); + else if (opFlags & opSymmetric && op2 == R(RSCRATCH)) + { + if (opFlags & opInvertOp2) + NOT(32, op2); + (this->*op)(32, op2, rn); + MOV(32, rd, op2); + } + else + { + if (opFlags & opInvertOp2) + { + if (op2 != R(RSCRATCH)) + { + MOV(32, R(RSCRATCH), op2); + op2 = R(RSCRATCH); + } + NOT(32, op2); + } + MOV(32, R(RSCRATCH3), rn); + (this->*op)(32, R(RSCRATCH3), op2); + MOV(32, rd, R(RSCRATCH3)); + } + + if (opFlags & opSetsFlags) + Comp_RetriveFlags(opFlags & opInvertCarry, opFlags & opRetriveCV, carryUsed); +} + +void Compiler::Comp_ArithTriOpReverse(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), + Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags) +{ + if (opFlags & opSyncCarry) + { + BT(32, R(RCPSR), Imm8(29)); + if (opFlags & opInvertCarry) + CMC(); + } + + if (op2 != R(RSCRATCH)) + { + MOV(32, R(RSCRATCH), op2); + op2 = R(RSCRATCH); + } + (this->*op)(32, op2, rn); + MOV(32, rd, op2); + + if (opFlags & opSetsFlags) + Comp_RetriveFlags(opFlags & opInvertCarry, opFlags & opRetriveCV, carryUsed); +} + +void Compiler::Comp_CmpOp(int op, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed) +{ + switch (op) + { + case 0: // TST + if (rn.IsImm()) + { + MOV(32, R(RSCRATCH3), rn); + rn = R(RSCRATCH3); + } + TEST(32, rn, op2); + break; + case 1: // TEQ + MOV(32, R(RSCRATCH3), rn); + XOR(32, R(RSCRATCH3), op2); + break; + case 2: // CMP + if (rn.IsImm()) + { + MOV(32, R(RSCRATCH3), rn); + rn = R(RSCRATCH3); + } + CMP(32, rn, op2); + break; + case 3: // CMN + MOV(32, R(RSCRATCH3), rn); + ADD(32, R(RSCRATCH3), op2); + break; + } + + Comp_RetriveFlags(op == 2, op >= 2, carryUsed); +} + +// also calculates cycles +OpArg Compiler::A_Comp_GetALUOp2(bool S, bool& carryUsed) +{ + if (CurrentInstr.Instr & (1 << 25)) + { + Comp_AddCycles_C(); + carryUsed = false; + return Imm32(ROR(CurrentInstr.Instr & 0xFF, (CurrentInstr.Instr >> 7) & 0x1E)); + } + else + { + int op = (CurrentInstr.Instr >> 5) & 0x3; + if (CurrentInstr.Instr & (1 << 4)) + { + Comp_AddCycles_CI(1); + OpArg rm = MapReg(CurrentInstr.A_Reg(0)); + if (rm.IsImm() && CurrentInstr.A_Reg(0) == 15) + rm = Imm32(rm.Imm32() + 4); + return Comp_RegShiftReg(op, MapReg(CurrentInstr.A_Reg(8)), rm, S, carryUsed); + } + else + { + Comp_AddCycles_C(); + return Comp_RegShiftImm(op, (CurrentInstr.Instr >> 7) & 0x1F, + MapReg(CurrentInstr.A_Reg(0)), S, carryUsed); + } + } +} + +void Compiler::A_Comp_CmpOp() +{ + u32 op = (CurrentInstr.Instr >> 21) & 0xF; + + bool carryUsed; + OpArg rn = MapReg(CurrentInstr.A_Reg(16)); + OpArg op2 = 
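/* 0xF303 is the mask of the logical ops (AND, EOR, TST, TEQ, ORR, MOV, BIC, MVN), whose flag-setting forms take C from the shifter rather than from the ALU */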
A_Comp_GetALUOp2((1 << op) & 0xF303, carryUsed); + + Comp_CmpOp(op - 0x8, rn, op2, carryUsed); +} + +void Compiler::A_Comp_Arith() +{ + bool S = CurrentInstr.Instr & (1 << 20); + u32 op = (CurrentInstr.Instr >> 21) & 0xF; + + bool carryUsed; + OpArg rn = MapReg(CurrentInstr.A_Reg(16)); + OpArg rd = MapReg(CurrentInstr.A_Reg(12)); + OpArg op2 = A_Comp_GetALUOp2(S && (1 << op) & 0xF303, carryUsed); + + u32 sFlag = S ? opSetsFlags : 0; + switch (op) + { + case 0x0: // AND + Comp_ArithTriOp(AND, rd, rn, op2, carryUsed, opSymmetric|sFlag); + return; + case 0x1: // EOR + Comp_ArithTriOp(XOR, rd, rn, op2, carryUsed, opSymmetric|sFlag); + return; + case 0x2: // SUB + Comp_ArithTriOp(SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); + return; + case 0x3: // RSB + if (op2.IsZero()) + { + if (rd != rn) + MOV(32, rd, rn); + NEG(32, rd); + if (S) + Comp_RetriveFlags(true, true, false); + } + else + Comp_ArithTriOpReverse(SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); + return; + case 0x4: // ADD + Comp_ArithTriOp(ADD, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV); + return; + case 0x5: // ADC + Comp_ArithTriOp(ADC, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry); + return; + case 0x6: // SBC + Comp_ArithTriOp(SBB, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry|opInvertCarry); + return; + case 0x7: // RSC + Comp_ArithTriOpReverse(SBB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry|opSyncCarry); + return; + case 0xC: // ORR + Comp_ArithTriOp(OR, rd, rn, op2, carryUsed, opSymmetric|sFlag); + return; + case 0xE: // BIC + Comp_ArithTriOp(AND, rd, rn, op2, carryUsed, sFlag|opSymmetric|opInvertOp2); + return; + default: + assert("unimplemented"); + } +} + +void Compiler::A_Comp_MovOp() +{ + bool carryUsed; + bool S = CurrentInstr.Instr & (1 << 20); + OpArg op2 = A_Comp_GetALUOp2(S, carryUsed); + OpArg rd = MapReg(CurrentInstr.A_Reg(12)); + + if (rd != op2) + MOV(32, rd, op2); + + if (((CurrentInstr.Instr >> 21) & 0xF) == 0xF) + NOT(32, rd); + + if (S) + { + TEST(32, rd, rd); + Comp_RetriveFlags(false, false, carryUsed); + } +} + +void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) +{ + CPSRDirty = true; + + bool carryOnly = !retriveCV && carryUsed; + if (retriveCV) + { + SETcc(CC_O, R(RSCRATCH)); + SETcc(sign ? CC_NC : CC_C, R(RSCRATCH3)); + LEA(32, RSCRATCH2, MComplex(RSCRATCH, RSCRATCH3, SCALE_2, 0)); + } + + if (carryUsed == 983298) + printf("etwas ist faul im lande daenemark %x\n", CurrentInstr.Instr); + + SETcc(CC_S, R(RSCRATCH)); + SETcc(CC_Z, R(RSCRATCH3)); + LEA(32, RSCRATCH, MComplex(RSCRATCH3, RSCRATCH, SCALE_2, 0)); + int shiftAmount = 30; + if (retriveCV || carryUsed) + { + LEA(32, RSCRATCH, MComplex(RSCRATCH2, RSCRATCH, carryOnly ? SCALE_2 : SCALE_4, 0)); + shiftAmount = carryOnly ? 29 : 28; + } + SHL(32, R(RSCRATCH), Imm8(shiftAmount)); + + AND(32, R(RCPSR), Imm32(0x3FFFFFFF & ~(carryUsed << 29) & ~((retriveCV ? 
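/* with retriveCV, 3 << 28 masks out both C (bit 29) and V (bit 28) before the fresh flags are ORed in */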
3 : 0) << 28))); + OR(32, R(RCPSR), R(RSCRATCH)); +} + +// always uses RSCRATCH, RSCRATCH2 only if S == true +OpArg Compiler::Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed) +{ + carryUsed = S; + + if (S) + { + XOR(32, R(RSCRATCH2), R(RSCRATCH2)); + BT(32, R(RCPSR), Imm8(29)); + SETcc(CC_C, R(RSCRATCH2)); + } + + MOV(32, R(RSCRATCH), rm); + static_assert(RSCRATCH3 == ECX); + MOV(32, R(ECX), rs); + AND(32, R(ECX), Imm32(0xFF)); + + FixupBranch zero = J_CC(CC_Z); + if (op < 3) + { + void (Compiler::*shiftOp)(int, const OpArg&, const OpArg&) = NULL; + if (op == 0) + shiftOp = SHL; + else if (op == 1) + shiftOp = SHR; + else if (op == 2) + shiftOp = SAR; + + CMP(32, R(ECX), Imm8(32)); + FixupBranch lt32 = J_CC(CC_L); + FixupBranch done1; + if (op < 2) + { + FixupBranch eq32 = J_CC(CC_E); + XOR(32, R(RSCRATCH), R(RSCRATCH)); + if (S) + XOR(32, R(RSCRATCH2), R(RSCRATCH2)); + done1 = J(); + SetJumpTarget(eq32); + } + (this->*shiftOp)(32, R(RSCRATCH), Imm8(31)); + (this->*shiftOp)(32, R(RSCRATCH), Imm8(1)); + if (S) + SETcc(CC_C, R(RSCRATCH2)); + + FixupBranch done2 = J(); + + SetJumpTarget(lt32); + (this->*shiftOp)(32, R(RSCRATCH), R(ECX)); + if (S) + SETcc(CC_C, R(RSCRATCH2)); + + if (op < 2) + SetJumpTarget(done1); + SetJumpTarget(done2); + + } + else if (op == 3) + { + if (S) + BT(32, R(RSCRATCH), Imm8(31)); + ROR_(32, R(RSCRATCH), R(ECX)); + if (S) + SETcc(CC_C, R(RSCRATCH2)); + } + SetJumpTarget(zero); + + return R(RSCRATCH); +} + +// may uses RSCRATCH for op2 and RSCRATCH2 for the carryValue +OpArg Compiler::Comp_RegShiftImm(int op, int amount, OpArg rm, bool S, bool& carryUsed) +{ + carryUsed = true; + + switch (op) + { + case 0: // LSL + if (amount > 0) + { + MOV(32, R(RSCRATCH), rm); + SHL(32, R(RSCRATCH), Imm8(amount)); + if (S) + SETcc(CC_C, R(RSCRATCH2)); + + return R(RSCRATCH); + } + else + { + carryUsed = false; + return rm; + } + case 1: // LSR + if (amount > 0) + { + MOV(32, R(RSCRATCH), rm); + SHR(32, R(RSCRATCH), Imm8(amount)); + if (S) + SETcc(CC_C, R(RSCRATCH2)); + return R(RSCRATCH); + } + else + { + if (S) + { + MOV(32, R(RSCRATCH2), rm); + SHR(32, R(RSCRATCH2), Imm8(31)); + } + return Imm32(0); + } + case 2: // ASR + MOV(32, R(RSCRATCH), rm); + SAR(32, R(RSCRATCH), Imm8(amount ? amount : 31)); + if (S) + { + if (amount == 0) + BT(32, rm, Imm8(31)); + SETcc(CC_C, R(RSCRATCH2)); + } + return R(RSCRATCH); + case 3: // ROR + MOV(32, R(RSCRATCH), rm); + if (amount > 0) + ROR_(32, R(RSCRATCH), Imm8(amount)); + else + { + BT(32, R(RCPSR), Imm8(29)); + RCR(32, R(RSCRATCH), Imm8(1)); + } + if (S) + SETcc(CC_C, R(RSCRATCH2)); + return R(RSCRATCH); + } + + assert(false); +} + +void Compiler::T_Comp_ShiftImm() +{ + OpArg rd = MapReg(CurrentInstr.T_Reg(0)); + OpArg rs = MapReg(CurrentInstr.T_Reg(3)); + + int op = (CurrentInstr.Instr >> 11) & 0x3; + int amount = (CurrentInstr.Instr >> 6) & 0x1F; + + Comp_AddCycles_C(); + + bool carryUsed; + OpArg shifted = Comp_RegShiftImm(op, amount, rs, true, carryUsed); + + if (shifted != rd) + MOV(32, rd, shifted); + + TEST(32, rd, rd); + Comp_RetriveFlags(false, false, carryUsed); +} + +void Compiler::T_Comp_AddSub_() +{ + OpArg rd = MapReg(CurrentInstr.T_Reg(0)); + OpArg rs = MapReg(CurrentInstr.T_Reg(3)); + + int op = (CurrentInstr.Instr >> 9) & 0x3; + + OpArg rn = op >= 2 ? 
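/* Thumb ADD/SUB: ops 2-3 carry a 3-bit immediate in bits 8-6, ops 0-1 take a register */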
Imm32((CurrentInstr.Instr >> 6) & 0x7) : MapReg(CurrentInstr.T_Reg(6)); + + Comp_AddCycles_C(); + + if (op & 1) + Comp_ArithTriOp(SUB, rd, rs, rn, false, opSetsFlags|opInvertCarry|opRetriveCV); + else + Comp_ArithTriOp(ADD, rd, rs, rn, false, opSetsFlags|opSymmetric|opRetriveCV); +} + +void Compiler::T_Comp_ALU_Imm8() +{ + OpArg rd = MapReg(CurrentInstr.T_Reg(8)); + + u32 op = (CurrentInstr.Instr >> 11) & 0x3; + OpArg imm = Imm32(CurrentInstr.Instr & 0xFF); + + Comp_AddCycles_C(); + + switch (op) + { + case 0x0: + MOV(32, rd, imm); + TEST(32, rd, rd); + Comp_RetriveFlags(false, false, false); + return; + case 0x1: + Comp_CmpOp(2, rd, imm, false); + return; + case 0x2: + Comp_ArithTriOp(ADD, rd, rd, imm, false, opSetsFlags|opSymmetric|opRetriveCV); + return; + case 0x3: + Comp_ArithTriOp(SUB, rd, rd, imm, false, opSetsFlags|opInvertCarry|opRetriveCV); + return; + } +} + +void Compiler::T_Comp_ALU() +{ + OpArg rd = MapReg(CurrentInstr.T_Reg(0)); + OpArg rs = MapReg(CurrentInstr.T_Reg(3)); + + u32 op = (CurrentInstr.Instr >> 6) & 0xF; + + Comp_AddCycles_C(); + + switch (op) + { + case 0x0: // AND + Comp_ArithTriOp(AND, rd, rd, rs, false, opSetsFlags|opSymmetric); + return; + case 0x1: // EOR + Comp_ArithTriOp(XOR, rd, rd, rs, false, opSetsFlags|opSymmetric); + return; + case 0x2: + case 0x3: + case 0x4: + case 0x7: + { + int shiftOp = op == 7 ? 3 : op - 0x2; + bool carryUsed; + OpArg shifted = Comp_RegShiftReg(shiftOp, rs, rd, true, carryUsed); + TEST(32, shifted, shifted); + MOV(32, rd, shifted); + Comp_RetriveFlags(false, false, true); + } + return; + case 0x5: // ADC + Comp_ArithTriOp(ADC, rd, rd, rs, false, opSetsFlags|opSymmetric|opSyncCarry|opRetriveCV); + return; + case 0x6: // SBC + Comp_ArithTriOp(SBB, rd, rd, rs, false, opSetsFlags|opSyncCarry|opInvertCarry|opRetriveCV); + return; + case 0x8: // TST + Comp_CmpOp(0, rd, rs, false); + return; + case 0x9: // NEG + if (rd != rs) + MOV(32, rd, rs); + NEG(32, rd); + Comp_RetriveFlags(true, true, false); + return; + case 0xA: // CMP + Comp_CmpOp(2, rd, rs, false); + return; + case 0xB: // CMN + Comp_CmpOp(3, rd, rs, false); + return; + case 0xC: // ORR + Comp_ArithTriOp(OR, rd, rd, rs, false, opSetsFlags|opSymmetric); + return; + case 0xE: // BIC + Comp_ArithTriOp(AND, rd, rd, rs, false, opSetsFlags|opSymmetric|opInvertOp2); + return; + case 0xF: // MVN + if (rd != rs) + MOV(32, rd, rs); + NOT(32, rd); + Comp_RetriveFlags(false, false, false); + return; + default: + break; + } +} + +void Compiler::T_Comp_ALU_HiReg() +{ + OpArg rd = MapReg(((CurrentInstr.Instr & 0x7) | ((CurrentInstr.Instr >> 4) & 0x8))); + OpArg rs = MapReg((CurrentInstr.Instr >> 3) & 0xF); + + u32 op = (CurrentInstr.Instr >> 8) & 0x3; + + Comp_AddCycles_C(); + + switch (op) + { + case 0x0: // ADD + Comp_ArithTriOp(ADD, rd, rd, rs, false, opSymmetric|opRetriveCV); + return; + case 0x1: // CMP + Comp_CmpOp(2, rd, rs, false); + return; + case 0x2: // MOV + if (rd != rs) + MOV(32, rd, rs); + TEST(32, rd, rd); + Comp_RetriveFlags(false, false, false); + return; + } +} + +} \ No newline at end of file -- cgit v1.2.3 From ff901141e77ad6c8d2910d77bef2b7c5674fcc7f Mon Sep 17 00:00:00 2001 From: RSDuck Date: Tue, 25 Jun 2019 18:28:01 +0200 Subject: jit: correct cycle counting for thumb shift by reg --- src/ARMJIT_x64/ARMJIT_ALU.cpp | 7 +++++-- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 0 2 files changed, 5 insertions(+), 2 deletions(-) create mode 100644 src/ARMJIT_x64/ARMJIT_LoadStore.cpp (limited to 'src/ARMJIT_x64/ARMJIT_ALU.cpp') diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp 
b/src/ARMJIT_x64/ARMJIT_ALU.cpp index d06c99c..dc82af7 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -456,7 +456,10 @@ void Compiler::T_Comp_ALU() u32 op = (CurrentInstr.Instr >> 6) & 0xF; - Comp_AddCycles_C(); + if ((op >= 0x2 && op < 0x4) || op == 0x7) + Comp_AddCycles_CI(1); + else + Comp_AddCycles_C(); switch (op) { @@ -471,7 +474,7 @@ void Compiler::T_Comp_ALU() case 0x4: case 0x7: { - int shiftOp = op == 7 ? 3 : op - 0x2; + int shiftOp = op == 0x7 ? 3 : op - 0x2; bool carryUsed; OpArg shifted = Comp_RegShiftReg(shiftOp, rs, rd, true, carryUsed); TEST(32, shifted, shifted); diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp new file mode 100644 index 0000000..e69de29 -- cgit v1.2.3 From 5f932cdf48681414465512fb47d619ad73414137 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 30 Jun 2019 13:35:03 +0200 Subject: JIT: compilation of word load and store --- src/ARMJIT.cpp | 4 +- src/ARMJIT.h | 3 +- src/ARMJIT_RegCache.h | 2 +- src/ARMJIT_x64/ARMJIT_ALU.cpp | 4 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 111 +++++-- src/ARMJIT_x64/ARMJIT_Compiler.h | 19 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 600 ++++++++++++++++++++++++++++++++++++ src/ARM_InstrInfo.h | 8 +- src/CMakeLists.txt | 1 + src/dolphin/x64ABI.h | 3 +- 10 files changed, 712 insertions(+), 43 deletions(-) (limited to 'src/ARMJIT_x64/ARMJIT_ALU.cpp') diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 74e154b..4da781c 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -40,8 +40,7 @@ static ptrdiff_t JIT_MEM[2][32] = { /* 2X*/ DUP2(offsetof(BlockCache, MainRAM)), /* 3X*/ offsetof(BlockCache, SWRAM), offsetof(BlockCache, ARM7_WRAM), - /* 4X*/ -1, - offsetof(BlockCache, ARM7_WIRAM), + /* 4X*/ DUP2(-1), /* 5X*/ DUP2(-1), /* 6X*/ DUP2(offsetof(BlockCache, ARM7_WVRAM)), /* contrary to Gbatek, melonDS and itself, DeSmuME doesn't mirror the 64 MB region at 0x6800000 */ @@ -183,7 +182,6 @@ void ResetBlocks() memset(cache.ARM9_ITCM, 0, sizeof(cache.ARM9_ITCM)); memset(cache.ARM9_LCDC, 0, sizeof(cache.ARM9_LCDC)); memset(cache.ARM7_BIOS, 0, sizeof(cache.ARM7_BIOS)); - memset(cache.ARM7_WIRAM, 0, sizeof(cache.ARM7_WIRAM)); memset(cache.ARM7_WRAM, 0, sizeof(cache.ARM7_WRAM)); memset(cache.ARM7_WVRAM, 0, sizeof(cache.ARM7_WVRAM)); } diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 2ca29e8..45bb4ed 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -63,14 +63,13 @@ struct BlockCache { CompiledBlock* AddrMapping[2][0x4000] = {0}; - CompiledBlock MainRAM[16*1024*1024/2]; + CompiledBlock MainRAM[4*1024*1024/2]; CompiledBlock SWRAM[0x8000/2]; // Shared working RAM CompiledBlock ARM9_ITCM[0x8000/2]; CompiledBlock ARM9_LCDC[0xA4000/2]; CompiledBlock ARM9_BIOS[0x8000/2]; CompiledBlock ARM7_BIOS[0x4000/2]; CompiledBlock ARM7_WRAM[0x10000/2]; // dedicated ARM7 WRAM - CompiledBlock ARM7_WIRAM[0x10000/2]; // Wifi CompiledBlock ARM7_WVRAM[0x40000/2]; // VRAM allocated as Working RAM }; diff --git a/src/ARMJIT_RegCache.h b/src/ARMJIT_RegCache.h index e18d50f..ea9fb30 100644 --- a/src/ARMJIT_RegCache.h +++ b/src/ARMJIT_RegCache.h @@ -30,7 +30,7 @@ public: assert(Mapping[reg] != -1); if (DirtyRegs & (1 << reg)) - Compiler->UnloadReg(reg, Mapping[reg]); + Compiler->SaveReg(reg, Mapping[reg]); DirtyRegs &= ~(1 << reg); LoadedRegs &= ~(1 << reg); diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index dc82af7..6294e1d 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -255,8 +255,8 @@ OpArg Compiler::Comp_RegShiftReg(int op, Gen::OpArg rs, 
Gen::OpArg rm, bool S, b if (S) { XOR(32, R(RSCRATCH2), R(RSCRATCH2)); - BT(32, R(RCPSR), Imm8(29)); - SETcc(CC_C, R(RSCRATCH2)); + TEST(32, R(RCPSR), Imm32(1 << 29)); + SETcc(CC_NZ, R(RSCRATCH2)); } MOV(32, R(RSCRATCH), rm); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index f51d4d9..9096397 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -9,13 +9,43 @@ using namespace Gen; namespace ARMJIT { template <> -const X64Reg RegCache::NativeRegAllocOrder[] = {RBX, RSI, RDI, R12, R13}; +const X64Reg RegCache::NativeRegAllocOrder[] = +{ +#ifdef _WIN32 + RBX, RSI, RDI, R12, R13 +#else + RBX, R12, R13 +#endif +}; template <> -const int RegCache::NativeRegsAvailable = 5; +const int RegCache::NativeRegsAvailable = +#ifdef _WIN32 + 5 +#else + 3 +#endif +; Compiler::Compiler() { - AllocCodeSpace(1024 * 1024 * 4); + AllocCodeSpace(1024 * 1024 * 16); + + for (int i = 0; i < 15; i++) + { + ReadMemFuncs9[i] = Gen_MemoryRoutine9(false, 32, 0x1000000 * i); + WriteMemFuncs9[i] = Gen_MemoryRoutine9(true, 32, 0x1000000 * i); + for (int j = 0; j < 2; j++) + { + ReadMemFuncs7[j][i] = Gen_MemoryRoutine7(false, 32, j, 0x1000000 * i); + WriteMemFuncs7[j][i] = Gen_MemoryRoutine7(true, 32, j, 0x1000000 * i); + } + } + ReadMemFuncs9[15] = Gen_MemoryRoutine9(false, 32, 0xFF000000); + WriteMemFuncs9[15] = Gen_MemoryRoutine9(true, 32, 0xFF000000); + ReadMemFuncs7[15][0] = ReadMemFuncs7[15][1] = Gen_MemoryRoutine7(false, 32, false, 0xFF000000); + WriteMemFuncs7[15][0] = WriteMemFuncs7[15][1] = Gen_MemoryRoutine7(true, 32, false, 0xFF000000); + + ResetStart = GetWritableCodePtr(); } void Compiler::LoadCPSR() @@ -42,7 +72,7 @@ void Compiler::LoadReg(int reg, X64Reg nativeReg) MOV(32, R(nativeReg), Imm32(R15)); } -void Compiler::UnloadReg(int reg, X64Reg nativeReg) +void Compiler::SaveReg(int reg, X64Reg nativeReg) { MOV(32, MDisp(RCPU, offsetof(ARM, R[reg])), R(nativeReg)); } @@ -52,7 +82,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs if (IsAlmostFull()) { ResetBlocks(); - ResetCodePtr(); + SetCodePtr((u8*)ResetStart); } CompiledBlock res = (CompiledBlock)GetWritableCodePtr(); @@ -61,8 +91,9 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs Thumb = cpu->CPSR & 0x20; Num = cpu->Num; R15 = cpu->R[15]; + CodeRegion = cpu->CodeRegion; - ABI_PushRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED}, 8, 0); + ABI_PushRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS}, 8, 16); MOV(64, R(RCPU), ImmPtr(cpu)); XOR(32, R(RCycles), R(RCycles)); @@ -142,9 +173,9 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs else { // could have used a LUT, but then where would be the fun? - BT(32, R(RCPSR), Imm8(28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1)))); + TEST(32, R(RCPSR), Imm32(1 << (28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1))))); - skipExecute = J_CC(cond & 1 ? CC_C : CC_NC); + skipExecute = J_CC(cond & 1 ? 
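/* bit 0 of the ARM condition code inverts the test, so the skip branch fires on the opposite flag state */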
CC_NZ : CC_Z); } } @@ -187,7 +218,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs LEA(32, RAX, MDisp(RCycles, ConstantCycles)); - ABI_PopRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED}, 8, 0); + ABI_PopRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS}, 8, 16); RET(); return res; @@ -243,23 +274,38 @@ CompileFunc Compiler::GetCompFunc(int kind) A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, // CMN A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, + // Mul + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + // ARMv5 stuff + NULL, NULL, NULL, NULL, NULL, + // STR + A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, + // STRB + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + // LDR + A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, + // LDRB + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + // STRH + NULL, NULL, NULL, NULL, + // LDRD + NULL, NULL, NULL, NULL, + // STRD + NULL, NULL, NULL, NULL, + // LDRH + NULL, NULL, NULL, NULL, + // LDRSB NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + // LDRSH + NULL, NULL, NULL, NULL, + // swap + NULL, NULL, + // LDM/STM + NULL, NULL, + // Branch + NULL, NULL, NULL, NULL, NULL, + // system stuff + NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; const CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { @@ -278,10 +324,17 @@ CompileFunc Compiler::GetCompFunc(int kind) T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, // pc/sp relative NULL, NULL, NULL, - // mem... - NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, + // LDR pcrel + NULL, + // LDR/STR reg offset + T_Comp_MemReg, NULL, T_Comp_MemReg, NULL, + // LDR/STR sign extended, half + NULL, NULL, NULL, NULL, + // LDR/STR imm offset + T_Comp_MemImm, T_Comp_MemImm, NULL, NULL, + // LDR/STR half imm offset + NULL, NULL, + // branch, etc. 
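/* branch and misc entries stay NULL here and therefore fall back to the interpreter */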
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 9b454f4..7ab9b25 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -29,7 +29,7 @@ public: CompiledBlock CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount); void LoadReg(int reg, Gen::X64Reg nativeReg); - void UnloadReg(int reg, Gen::X64Reg nativeReg); + void SaveReg(int reg, Gen::X64Reg nativeReg); private: CompileFunc GetCompFunc(int kind); @@ -51,12 +51,17 @@ private: void A_Comp_MovOp(); void A_Comp_CmpOp(); + void A_Comp_MemWB(); + void T_Comp_ShiftImm(); void T_Comp_AddSub_(); void T_Comp_ALU_Imm8(); void T_Comp_ALU(); void T_Comp_ALU_HiReg(); + void T_Comp_MemReg(); + void T_Comp_MemImm(); + void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); void Comp_ArithTriOpReverse(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), @@ -65,10 +70,14 @@ private: void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed); + void* Gen_MemoryRoutine9(bool store, int size, u32 region); + void* Gen_MemoryRoutine7(bool store, int size, bool mainRAMCode, u32 region); + Gen::OpArg Comp_RegShiftImm(int op, int amount, Gen::OpArg rm, bool S, bool& carryUsed); Gen::OpArg Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed); Gen::OpArg A_Comp_GetALUOp2(bool S, bool& carryUsed); + Gen::OpArg A_Comp_GetMemWBOffset(); void LoadCPSR(); void SaveCPSR(); @@ -82,6 +91,8 @@ private: return Gen::R(RegCache.Mapping[reg]); } + void* ResetStart; + bool CPSRDirty = false; FetchedInstr CurrentInstr; @@ -91,10 +102,16 @@ private: bool Thumb; u32 Num; u32 R15; + u32 CodeRegion; u32 ConstantCycles; }; +extern void* ReadMemFuncs9[16]; +extern void* ReadMemFuncs7[2][16]; +extern void* WriteMemFuncs9[16]; +extern void* WriteMemFuncs7[2][16]; + } #endif \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index e69de29..d534269 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -0,0 +1,600 @@ +#include "ARMJIT_Compiler.h" + +#include "../GPU.h" +#include "../Wifi.h" + +namespace NDS +{ +#define MAIN_RAM_SIZE 0x400000 +extern u8* SWRAM_ARM9; +extern u32 SWRAM_ARM9Mask; +extern u8* SWRAM_ARM7; +extern u32 SWRAM_ARM7Mask; +extern u8 ARM7WRAM[]; +extern u16 ARM7BIOSProt; +} + +using namespace Gen; + +namespace ARMJIT +{ + +void* ReadMemFuncs9[16]; +void* ReadMemFuncs7[2][16]; +void* WriteMemFuncs9[16]; +void* WriteMemFuncs7[2][16]; + +template +int squeezePointer(T* ptr) +{ + int truncated = (int)((u64)ptr); + assert((T*)((u64)truncated) == ptr); + return truncated; +} + +u32 ReadVRAM9(u32 addr) +{ + switch (addr & 0x00E00000) + { + case 0x00000000: return GPU::ReadVRAM_ABG(addr); + case 0x00200000: return GPU::ReadVRAM_BBG(addr); + case 0x00400000: return GPU::ReadVRAM_AOBJ(addr); + case 0x00600000: return GPU::ReadVRAM_BOBJ(addr); + default: return GPU::ReadVRAM_LCDC(addr); + } +} + +void WriteVRAM9(u32 addr, u32 val) +{ + switch (addr & 0x00E00000) + { + case 0x00000000: GPU::WriteVRAM_ABG(addr, val); return; + case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; + case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; + case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; + default: GPU::WriteVRAM_LCDC(addr, val); return; + } +} + +/* + R11 - data 
to write (store only) + RSCRATCH2 - address + RSCRATCH3 - code cycles +*/ +void* Compiler::Gen_MemoryRoutine9(bool store, int size, u32 region) +{ + AlignCode4(); + void* res = (void*)GetWritableCodePtr(); + + if (!store) + { + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + AND(32, R(RSCRATCH), Imm8(0x3)); + SHL(32, R(RSCRATCH), Imm8(3)); + // enter the shadow realm! + MOV(32, MDisp(RSP, 8), R(RSCRATCH)); + } + + // cycle counting! + // this is AddCycles_CDI + MOV(32, R(R10), R(RSCRATCH2)); + SHR(32, R(R10), Imm8(12)); + MOVZX(32, 8, R10, MComplex(RCPU, R10, SCALE_1, offsetof(ARMv5, MemTimings) + 2)); + LEA(32, RSCRATCH, MComplex(RSCRATCH3, R10, SCALE_1, -6)); + CMP(32, R(R10), R(RSCRATCH3)); + CMOVcc(32, RSCRATCH3, R(R10), CC_G); + CMP(32, R(RSCRATCH), R(RSCRATCH3)); + CMOVcc(32, RSCRATCH3, R(RSCRATCH), CC_G); + ADD(32, R(RCycles), R(RSCRATCH3)); + + if (!store) + XOR(32, R(RSCRATCH), R(RSCRATCH)); + AND(32, R(RSCRATCH2), Imm32(~3)); + + { + MOV(32, R(RSCRATCH3), R(RSCRATCH2)); + SUB(32, R(RSCRATCH2), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); + CMP(32, R(RSCRATCH2), MDisp(RCPU, offsetof(ARMv5, DTCMSize))); + FixupBranch outsideDTCM = J_CC(CC_AE); + AND(32, R(RSCRATCH2), Imm32(0x3FFF)); + if (!store) + { + MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, DTCM))); + MOV(32, R(ECX), MDisp(RSP, 8)); + ROR_(32, R(RSCRATCH), R(ECX)); + } + else + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, DTCM)), R(R11)); + RET(); + SetJumpTarget(outsideDTCM); + MOV(32, R(RSCRATCH2), R(RSCRATCH3)); + } + + switch (region) + { + case 0x00000000: + case 0x01000000: + { + CMP(32, R(RSCRATCH2), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); + FixupBranch insideITCM = J_CC(CC_B); + RET(); + SetJumpTarget(insideITCM); + AND(32, R(RSCRATCH2), Imm32(0x7FFF)); + if (!store) + MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, ITCM))); + else + { + MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, ITCM)), R(R11)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM9_ITCM)), Imm32(0)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), Imm32(0)); + } + } + break; + case 0x02000000: + AND(32, R(RSCRATCH2), Imm32(MAIN_RAM_SIZE - 1)); + if (!store) + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM))); + else + { + MOV(32, MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM)), R(R11)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM)), Imm32(0)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM) + 8), Imm32(0)); + } + break; + case 0x03000000: + { + MOV(64, R(RSCRATCH3), M(&NDS::SWRAM_ARM9)); + TEST(64, R(RSCRATCH3), R(RSCRATCH3)); + FixupBranch notMapped = J_CC(CC_Z); + AND(32, R(RSCRATCH2), M(&NDS::SWRAM_ARM9Mask)); + if (!store) + MOV(32, R(RSCRATCH), MRegSum(RSCRATCH2, RSCRATCH3)); + else + { + MOV(32, MRegSum(RSCRATCH2, RSCRATCH3), R(R11)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM)), Imm32(0)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM) + 8), Imm32(0)); + } + SetJumpTarget(notMapped); + } + break; + case 0x04000000: + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + if (!store) + { + ABI_PushRegistersAndAdjustStack({}, 8, 0); + ABI_CallFunction(NDS::ARM9IORead32); + ABI_PopRegistersAndAdjustStack({}, 8, 0); + } + else + { + MOV(32, R(ABI_PARAM2), R(R11)); + JMP((u8*)NDS::ARM9IOWrite32, true); + } + break; + case 0x05000000: + { + MOV(32, R(RSCRATCH), Imm32(1<<1)); + MOV(32, R(RSCRATCH3), Imm32(1<<9)); + TEST(32, R(RSCRATCH2), Imm32(0x400)); + 
CMOVcc(32, RSCRATCH, R(RSCRATCH3), CC_NZ); + TEST(16, R(RSCRATCH), M(&NDS::PowerControl9)); + FixupBranch available = J_CC(CC_NZ); + RET(); + SetJumpTarget(available); + AND(32, R(RSCRATCH2), Imm32(0x7FF)); + if (!store) + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(GPU::Palette))); + else + MOV(32, MDisp(RSCRATCH2, squeezePointer(GPU::Palette)), R(R11)); + } + break; + case 0x06000000: + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + if (!store) + { + ABI_PushRegistersAndAdjustStack({}, 8); + ABI_CallFunction(ReadVRAM9); + ABI_PopRegistersAndAdjustStack({}, 8); + } + else + { + MOV(32, R(ABI_PARAM2), R(R11)); + JMP((u8*)WriteVRAM9, true); + } + break; + case 0x07000000: + { + MOV(32, R(RSCRATCH), Imm32(1<<1)); + MOV(32, R(RSCRATCH3), Imm32(1<<9)); + TEST(32, R(RSCRATCH2), Imm32(0x400)); + CMOVcc(32, RSCRATCH, R(RSCRATCH3), CC_NZ); + TEST(16, R(RSCRATCH), M(&NDS::PowerControl9)); + FixupBranch available = J_CC(CC_NZ); + RET(); + SetJumpTarget(available); + AND(32, R(RSCRATCH2), Imm32(0x7FF)); + if (!store) + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(GPU::OAM))); + else + MOV(32, MDisp(RSCRATCH2, squeezePointer(GPU::OAM)), R(R11)); + } + break; + case 0x08000000: + case 0x09000000: + case 0x0A000000: + if (!store) + MOV(32, R(RSCRATCH), Imm32(0xFFFFFFFF)); + break; + case 0xFF000000: + if (!store) + { + AND(32, R(RSCRATCH2), Imm32(0xFFF)); + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::ARM9BIOS))); + } + break; + default: + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + if (!store) + { + ABI_PushRegistersAndAdjustStack({}, 8, 0); + ABI_CallFunction(NDS::ARM9Read32); + ABI_PopRegistersAndAdjustStack({}, 8, 0); + } + else + { + MOV(32, R(ABI_PARAM2), R(R11)); + JMP((u8*)NDS::ARM9Write32, true); + } + break; + } + + if (!store) + { + MOV(32, R(ECX), MDisp(RSP, 8)); + ROR_(32, R(RSCRATCH), R(ECX)); + } + + RET(); + + return res; +} + +void* Compiler::Gen_MemoryRoutine7(bool store, int size, bool mainRAMCode, u32 region) +{ + AlignCode4(); + void* res = GetWritableCodePtr(); + + if (!store) + { + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + AND(32, R(RSCRATCH), Imm8(0x3)); + SHL(32, R(RSCRATCH), Imm8(3)); + // enter the shadow realm! + MOV(32, MDisp(RSP, 8), R(RSCRATCH)); + } + + // AddCycles_CDI + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + SHR(32, R(RSCRATCH), Imm8(15)); + MOVZX(32, 8, RSCRATCH, MDisp(RSCRATCH, squeezePointer(NDS::ARM7MemTimings + 2))); + if ((region == 0x02000000 && mainRAMCode) || (region != 0x02000000 && !mainRAMCode)) + { + if (!store && region != 0x02000000) + LEA(32, RSCRATCH3, MComplex(RSCRATCH, RSCRATCH3, SCALE_1, 1)); + ADD(32, R(RCycles), R(RSCRATCH3)); + } + else + { + if (!store) + ADD(32, R(region == 0x02000000 ? 
RSCRATCH2 : RSCRATCH), Imm8(1)); + LEA(32, R10, MComplex(RSCRATCH, RSCRATCH3, SCALE_1, -3)); + CMP(32, R(RSCRATCH3), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(RSCRATCH3), CC_G); + CMP(32, R(R10), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(R10), CC_G); + ADD(32, R(RCycles), R(RSCRATCH)); + } + + if (!store) + XOR(32, R(RSCRATCH), R(RSCRATCH)); + AND(32, R(RSCRATCH2), Imm32(~3)); + + switch (region) + { + case 0x00000000: + if (!store) { + CMP(32, R(RSCRATCH2), Imm32(0x4000)); + FixupBranch outsideBIOS1 = J_CC(CC_AE); + + MOV(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARM, R[15]))); + CMP(32, R(RSCRATCH), Imm32(0x4000)); + FixupBranch outsideBIOS2 = J_CC(CC_AE); + MOV(32, R(RSCRATCH3), M(&NDS::ARM7BIOSProt)); + CMP(32, R(RSCRATCH2), R(RSCRATCH3)); + FixupBranch notDenied1 = J_CC(CC_AE); + CMP(32, R(RSCRATCH), R(RSCRATCH3)); + FixupBranch notDenied2 = J_CC(CC_B); + SetJumpTarget(outsideBIOS2); + MOV(32, R(RSCRATCH), Imm32(0xFFFFFFFF)); + RET(); + + SetJumpTarget(notDenied1); + SetJumpTarget(notDenied2); + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::ARM7BIOS))); + MOV(32, R(ECX), MDisp(RSP, 8)); + ROR_(32, R(RSCRATCH), R(ECX)); + RET(); + + SetJumpTarget(outsideBIOS1); + } + break; + case 0x02000000: + AND(32, R(RSCRATCH2), Imm32(MAIN_RAM_SIZE - 1)); + if (!store) + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM))); + else + { + MOV(32, MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM)), R(R11)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM)), Imm32(0)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM) + 8), Imm32(0)); + } + break; + case 0x03000000: + { + TEST(32, R(RSCRATCH2), Imm32(0x800000)); + FixupBranch region = J_CC(CC_NZ); + MOV(64, R(RSCRATCH), M(&NDS::SWRAM_ARM7)); + TEST(64, R(RSCRATCH), R(RSCRATCH)); + FixupBranch notMapped = J_CC(CC_Z); + AND(32, R(RSCRATCH2), M(&NDS::SWRAM_ARM7Mask)); + if (!store) + { + MOV(32, R(RSCRATCH), MRegSum(RSCRATCH, RSCRATCH2)); + MOV(32, R(ECX), MDisp(RSP, 8)); + ROR_(32, R(RSCRATCH), R(ECX)); + } + else + { + MOV(32, MRegSum(RSCRATCH, RSCRATCH2), R(R11)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM)), Imm32(0)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM) + 8), Imm32(0)); + } + RET(); + SetJumpTarget(region); + SetJumpTarget(notMapped); + AND(32, R(RSCRATCH2), Imm32(0xFFFF)); + if (!store) + MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::ARM7WRAM))); + else + { + MOV(32, MDisp(RSCRATCH2, squeezePointer(NDS::ARM7WRAM)), R(R11)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM7_WRAM)), Imm32(0)); + MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM7_WRAM) + 8), Imm32(0)); + } + } + break; + case 0x04000000: + { + TEST(32, R(RSCRATCH2), Imm32(0x800000)); + FixupBranch region = J_CC(CC_NZ); + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + if (!store) + { + ABI_PushRegistersAndAdjustStack({}, 8); + ABI_CallFunction(NDS::ARM7IORead32); + ABI_PopRegistersAndAdjustStack({}, 8); + + MOV(32, R(ECX), MDisp(RSP, 8)); + ROR_(32, R(RSCRATCH), R(ECX)); + RET(); + } + else + { + MOV(32, R(ABI_PARAM2), R(R11)); + JMP((u8*)NDS::ARM7IOWrite32, true); + } + SetJumpTarget(region); + + if (!store) + { + ABI_PushRegistersAndAdjustStack({RSCRATCH2}, 8); + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + ABI_CallFunction(Wifi::Read); + ABI_PopRegistersAndAdjustStack({RSCRATCH2}, 8); + + ADD(32, R(RSCRATCH2), Imm8(2)); + ABI_PushRegistersAndAdjustStack({EAX}, 8); + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + ABI_CallFunction(Wifi::Read); + MOV(32, R(RSCRATCH2), 
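/* EAX holds the second halfword from Wifi::Read; it is shifted up and ORed with the first below */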
R(EAX)); + SHL(32, R(RSCRATCH2), Imm8(16)); + ABI_PopRegistersAndAdjustStack({EAX}, 8); + OR(32, R(EAX), R(RSCRATCH2)); + } + else + { + ABI_PushRegistersAndAdjustStack({RSCRATCH2, R11}, 8); + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + MOVZX(32, 16, ABI_PARAM2, R(R11)); + ABI_CallFunction(Wifi::Write); + ABI_PopRegistersAndAdjustStack({RSCRATCH2, R11}, 8); + SHR(32, R(R11), Imm8(16)); + ADD(32, R(RSCRATCH2), Imm8(2)); + ABI_PushRegistersAndAdjustStack({RSCRATCH2, R11}, 8); + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + MOVZX(32, 16, ABI_PARAM2, R(R11)); + ABI_CallFunction(Wifi::Write); + ABI_PopRegistersAndAdjustStack({RSCRATCH2, R11}, 8); + } + } + break; + case 0x06000000: + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + if (!store) + { + ABI_PushRegistersAndAdjustStack({}, 8); + ABI_CallFunction(GPU::ReadVRAM_ARM7); + ABI_PopRegistersAndAdjustStack({}, 8); + } + else + { + AND(32, R(ABI_PARAM1), Imm32(0x40000 - 1)); + MOV(64, MScaled(ABI_PARAM1, SCALE_4, squeezePointer(cache.ARM7_WVRAM)), Imm32(0)); + MOV(64, MScaled(ABI_PARAM1, SCALE_4, squeezePointer(cache.ARM7_WVRAM) + 8), Imm32(0)); + MOV(32, R(ABI_PARAM2), R(R11)); + JMP((u8*)GPU::WriteVRAM_ARM7, true); + } + break; + case 0x08000000: + case 0x09000000: + case 0x0A000000: + if (!store) + MOV(32, R(RSCRATCH), Imm32(0xFFFFFFFF)); + break; + /*default: + ABI_PushRegistersAndAdjustStack({}, 8, 0); + MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + ABI_CallFunction(NDS::ARM7Read32); + ABI_PopRegistersAndAdjustStack({}, 8, 0); + break;*/ + } + + if (!store) + { + MOV(32, R(ECX), MDisp(RSP, 8)); + ROR_(32, R(RSCRATCH), R(ECX)); + } + + RET(); + + return res; +} + +OpArg Compiler::A_Comp_GetMemWBOffset() +{ + if (!(CurrentInstr.Instr & (1 << 25))) + return Imm32(CurrentInstr.Instr & 0xFFF); + else + { + int op = (CurrentInstr.Instr >> 5) & 0x3; + int amount = (CurrentInstr.Instr >> 7) & 0x1F; + OpArg rm = MapReg(CurrentInstr.A_Reg(0)); + bool carryUsed; + return Comp_RegShiftImm(op, amount, rm, false, carryUsed); + } +} + +void Compiler::A_Comp_MemWB() +{ + OpArg rn = MapReg(CurrentInstr.A_Reg(16)); + OpArg rd = MapReg(CurrentInstr.A_Reg(12)); + bool load = CurrentInstr.Instr & (1 << 20); + + MOV(32, R(RSCRATCH2), rn); + if (CurrentInstr.Instr & (1 << 24)) + { + OpArg offset = A_Comp_GetMemWBOffset(); + if (CurrentInstr.Instr & (1 << 23)) + ADD(32, R(RSCRATCH2), offset); + else + SUB(32, R(RSCRATCH2), offset); + + if (CurrentInstr.Instr & (1 << 21)) + MOV(32, rn, R(RSCRATCH2)); + } + + u32 cycles = Num ? NDS::ARM7MemTimings[CurrentInstr.CodeCycles][2] : CurrentInstr.CodeCycles; + MOV(32, R(RSCRATCH3), Imm32(cycles)); + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + SHR(32, R(RSCRATCH), Imm8(24)); + AND(32, R(RSCRATCH), Imm8(0xF)); + void** funcArray; + if (load) + funcArray = Num ? ReadMemFuncs7[CodeRegion == 0x02] : ReadMemFuncs9; + else + { + funcArray = Num ? 
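/* the ARM7 memory routines come in two variants, indexed by whether code executes from main RAM */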
WriteMemFuncs7[CodeRegion == 0x02] : WriteMemFuncs9; + MOV(32, R(R11), rd); + } + CALLptr(MScaled(RSCRATCH, SCALE_8, squeezePointer(funcArray))); + + if (load) + MOV(32, R(RSCRATCH2), R(RSCRATCH)); + + if (!(CurrentInstr.Instr & (1 << 24))) + { + OpArg offset = A_Comp_GetMemWBOffset(); + + if (CurrentInstr.Instr & (1 << 23)) + ADD(32, rn, offset); + else + SUB(32, rn, offset); + } + + if (load) + MOV(32, rd, R(RSCRATCH2)); +} + +void Compiler::T_Comp_MemReg() +{ + OpArg rd = MapReg(CurrentInstr.T_Reg(0)); + OpArg rb = MapReg(CurrentInstr.T_Reg(3)); + OpArg ro = MapReg(CurrentInstr.T_Reg(6)); + + int op = (CurrentInstr.Instr >> 10) & 0x3; + bool load = op & 0x2; + + MOV(32, R(RSCRATCH2), rb); + ADD(32, R(RSCRATCH2), ro); + + u32 cycles = Num ? NDS::ARM7MemTimings[CurrentInstr.CodeCycles][0] : (R15 & 0x2 ? 0 : CurrentInstr.CodeCycles); + MOV(32, R(RSCRATCH3), Imm32(cycles)); + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + SHR(32, R(RSCRATCH), Imm8(24)); + AND(32, R(RSCRATCH), Imm8(0xF)); + void** funcArray; + if (load) + funcArray = Num ? ReadMemFuncs7[CodeRegion == 0x02] : ReadMemFuncs9; + else + { + funcArray = Num ? WriteMemFuncs7[CodeRegion == 0x02] : WriteMemFuncs9; + MOV(32, R(R11), rd); + } + CALLptr(MScaled(RSCRATCH, SCALE_8, squeezePointer(funcArray))); + + if (load) + MOV(32, rd, R(RSCRATCH)); +} + +void Compiler::T_Comp_MemImm() +{ + // TODO: aufräumen!!! + OpArg rd = MapReg(CurrentInstr.T_Reg(0)); + OpArg rb = MapReg(CurrentInstr.T_Reg(3)); + + int op = (CurrentInstr.Instr >> 11) & 0x3; + u32 offset = ((CurrentInstr.Instr >> 6) & 0x1F) * 4; + bool load = op & 0x1; + + LEA(32, RSCRATCH2, MDisp(rb.GetSimpleReg(), offset)); + u32 cycles = Num ? NDS::ARM7MemTimings[CurrentInstr.CodeCycles][0] : (R15 & 0x2 ? 0 : CurrentInstr.CodeCycles); + MOV(32, R(RSCRATCH3), Imm32(cycles)); + MOV(32, R(RSCRATCH), R(RSCRATCH2)); + SHR(32, R(RSCRATCH), Imm8(24)); + AND(32, R(RSCRATCH), Imm8(0xF)); + void** funcArray; + if (load) + funcArray = Num ? ReadMemFuncs7[CodeRegion == 0x02] : ReadMemFuncs9; + else + { + funcArray = Num ? WriteMemFuncs7[CodeRegion == 0x02] : WriteMemFuncs9; + MOV(32, R(R11), rd); + } + CALLptr(MScaled(RSCRATCH, SCALE_8, squeezePointer(funcArray))); + + if (load) + MOV(32, rd, R(RSCRATCH)); +} + +} \ No newline at end of file diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index e717664..dcd938b 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -83,10 +83,10 @@ enum ak_ALU(BIC), ak_ALU(MVN), - ak_ALU(TST), - ak_ALU(TEQ), - ak_ALU(CMP), - ak_ALU(CMN), + ak_Test(TST), + ak_Test(TEQ), + ak_Test(CMP), + ak_Test(CMN), ak_MUL, ak_MLA, diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d88638a..662ed5c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -34,6 +34,7 @@ add_library(core STATIC ARMJIT.cpp ARMJIT_x64/ARMJIT_Compiler.cpp ARMJIT_x64/ARMJIT_ALU.cpp + ARMJIT_x64/ARMJIT_LoadStore.cpp dolphin/CommonFuncs.cpp dolphin/x64ABI.cpp diff --git a/src/dolphin/x64ABI.h b/src/dolphin/x64ABI.h index 997782e..94336d0 100644 --- a/src/dolphin/x64ABI.h +++ b/src/dolphin/x64ABI.h @@ -37,7 +37,8 @@ // xmm0-xmm15 use the upper 16 bits in the functions that push/pop registers. 
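/* on Win64 only XMM0-XMM5 are volatile, hence just those six are added to the caller-saved set below */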
#define ABI_ALL_CALLER_SAVED \ - (BitSet32{RAX, RCX, RDX, R8, R9, R10, R11}) + (BitSet32{RAX, RCX, RDX, R8, R9, R10, R11, XMM0 + 16, XMM1 + 16, XMM2 + 16, XMM3 + 16, \ + XMM4 + 16, XMM5 + 16}) #else // 64-bit Unix / OS X #define ABI_PARAM1 RDI -- cgit v1.2.3 From 2c44bf927c230efbbd1b27920de062ddcc631fcf Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 6 Jul 2019 01:48:42 +0200 Subject: JIT: most mem instructions working + branching --- src/ARM.cpp | 10 +- src/ARMJIT.cpp | 7 +- src/ARMJIT.h | 2 +- src/ARMJIT_RegCache.h | 2 +- src/ARMJIT_x64/ARMJIT_ALU.cpp | 322 ++++++++------- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 145 ++++--- src/ARMJIT_x64/ARMJIT_Compiler.h | 42 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 805 +++++++++++++++--------------------- src/ARM_InstrInfo.cpp | 2 +- src/NDS.cpp | 2 + 10 files changed, 653 insertions(+), 686 deletions(-) (limited to 'src/ARMJIT_x64/ARMJIT_ALU.cpp') diff --git a/src/ARM.cpp b/src/ARM.cpp index 420257a..f7ca26d 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -522,8 +522,9 @@ void ARMv5::Execute() ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(0, R[15] - ((CPSR&0x20)?2:4)); if (block == NULL) - block = ARMJIT::CompileBlock(this); - Cycles += block(); + ARMJIT::CompileBlock(this); + else + Cycles += block(); // TODO optimize this shit!!! if (Halted) @@ -607,8 +608,9 @@ void ARMv4::Execute() ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock(1, R[15] - ((CPSR&0x20)?2:4)); if (block == NULL) - block = ARMJIT::CompileBlock(this); - Cycles += block(); + ARMJIT::CompileBlock(this); + else + Cycles += block(); // TODO optimize this shit!!! if (Halted) diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 4da781c..6afa967 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -121,12 +121,13 @@ void DeInit() delete compiler; } -CompiledBlock CompileBlock(ARM* cpu) +void CompileBlock(ARM* cpu) { bool thumb = cpu->CPSR & 0x20; FetchedInstr instrs[12]; int i = 0; + u32 r15Initial = cpu->R[15]; u32 r15 = cpu->R[15]; u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; //printf("block %x %d\n", r15, thumb); @@ -169,9 +170,7 @@ CompiledBlock CompileBlock(ARM* cpu) CompiledBlock block = compiler->CompileBlock(cpu, instrs, i); - InsertBlock(cpu->Num, cpu->R[15] - (thumb ? 2 : 4), block); - - return block; + InsertBlock(cpu->Num, r15Initial - (thumb ? 
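/* key the block by its first instruction's address, computed the same way LookUpBlock computes it */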
2 : 4), block); } void ResetBlocks() diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 45bb4ed..71188f9 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -109,7 +109,7 @@ inline void InsertBlock(u32 num, u32 addr, CompiledBlock func) void Init(); void DeInit(); -CompiledBlock CompileBlock(ARM* cpu); +void CompileBlock(ARM* cpu); void ResetBlocks(); diff --git a/src/ARMJIT_RegCache.h b/src/ARMJIT_RegCache.h index ea9fb30..556d27b 100644 --- a/src/ARMJIT_RegCache.h +++ b/src/ARMJIT_RegCache.h @@ -114,7 +114,7 @@ public: for (int reg : needToBeLoaded) LoadRegister(reg); } - DirtyRegs |= Instr.Info.DstRegs; + DirtyRegs |= Instr.Info.DstRegs & ~(1 << 15); } static const Reg NativeRegAllocOrder[]; diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index 6294e1d..c22751e 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -71,30 +71,30 @@ void Compiler::Comp_CmpOp(int op, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed) { switch (op) { - case 0: // TST - if (rn.IsImm()) - { - MOV(32, R(RSCRATCH3), rn); - rn = R(RSCRATCH3); - } - TEST(32, rn, op2); - break; - case 1: // TEQ + case 0: // TST + if (rn.IsImm()) + { MOV(32, R(RSCRATCH3), rn); - XOR(32, R(RSCRATCH3), op2); - break; - case 2: // CMP - if (rn.IsImm()) - { - MOV(32, R(RSCRATCH3), rn); - rn = R(RSCRATCH3); - } - CMP(32, rn, op2); - break; - case 3: // CMN + rn = R(RSCRATCH3); + } + TEST(32, rn, op2); + break; + case 1: // TEQ + MOV(32, R(RSCRATCH3), rn); + XOR(32, R(RSCRATCH3), op2); + break; + case 2: // CMP + if (rn.IsImm()) + { MOV(32, R(RSCRATCH3), rn); - ADD(32, R(RSCRATCH3), op2); - break; + rn = R(RSCRATCH3); + } + CMP(32, rn, op2); + break; + case 3: // CMN + MOV(32, R(RSCRATCH3), rn); + ADD(32, R(RSCRATCH3), op2); + break; } Comp_RetriveFlags(op == 2, op >= 2, carryUsed); @@ -103,38 +103,38 @@ void Compiler::Comp_CmpOp(int op, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed) // also calculates cycles OpArg Compiler::A_Comp_GetALUOp2(bool S, bool& carryUsed) { - if (CurrentInstr.Instr & (1 << 25)) + if (CurInstr.Instr & (1 << 25)) { Comp_AddCycles_C(); carryUsed = false; - return Imm32(ROR(CurrentInstr.Instr & 0xFF, (CurrentInstr.Instr >> 7) & 0x1E)); + return Imm32(ROR(CurInstr.Instr & 0xFF, (CurInstr.Instr >> 7) & 0x1E)); } else { - int op = (CurrentInstr.Instr >> 5) & 0x3; - if (CurrentInstr.Instr & (1 << 4)) + int op = (CurInstr.Instr >> 5) & 0x3; + if (CurInstr.Instr & (1 << 4)) { Comp_AddCycles_CI(1); - OpArg rm = MapReg(CurrentInstr.A_Reg(0)); - if (rm.IsImm() && CurrentInstr.A_Reg(0) == 15) + OpArg rm = MapReg(CurInstr.A_Reg(0)); + if (rm.IsImm() && CurInstr.A_Reg(0) == 15) rm = Imm32(rm.Imm32() + 4); - return Comp_RegShiftReg(op, MapReg(CurrentInstr.A_Reg(8)), rm, S, carryUsed); + return Comp_RegShiftReg(op, MapReg(CurInstr.A_Reg(8)), rm, S, carryUsed); } else { Comp_AddCycles_C(); - return Comp_RegShiftImm(op, (CurrentInstr.Instr >> 7) & 0x1F, - MapReg(CurrentInstr.A_Reg(0)), S, carryUsed); + return Comp_RegShiftImm(op, (CurInstr.Instr >> 7) & 0x1F, + MapReg(CurInstr.A_Reg(0)), S, carryUsed); } } } void Compiler::A_Comp_CmpOp() { - u32 op = (CurrentInstr.Instr >> 21) & 0xF; + u32 op = (CurInstr.Instr >> 21) & 0xF; bool carryUsed; - OpArg rn = MapReg(CurrentInstr.A_Reg(16)); + OpArg rn = MapReg(CurInstr.A_Reg(16)); OpArg op2 = A_Comp_GetALUOp2((1 << op) & 0xF303, carryUsed); Comp_CmpOp(op - 0x8, rn, op2, carryUsed); @@ -142,12 +142,12 @@ void Compiler::A_Comp_CmpOp() void Compiler::A_Comp_Arith() { - bool S = CurrentInstr.Instr & (1 << 20); - u32 op = (CurrentInstr.Instr >> 21) 
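/* bits 24-21 of the instruction word hold the ALU opcode */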
& 0xF; + bool S = CurInstr.Instr & (1 << 20); + u32 op = (CurInstr.Instr >> 21) & 0xF; bool carryUsed; - OpArg rn = MapReg(CurrentInstr.A_Reg(16)); - OpArg rd = MapReg(CurrentInstr.A_Reg(12)); + OpArg rn = MapReg(CurInstr.A_Reg(16)); + OpArg rd = MapReg(CurInstr.A_Reg(12)); OpArg op2 = A_Comp_GetALUOp2(S && (1 << op) & 0xF303, carryUsed); u32 sFlag = S ? opSetsFlags : 0; @@ -155,13 +155,13 @@ void Compiler::A_Comp_Arith() { case 0x0: // AND Comp_ArithTriOp(AND, rd, rn, op2, carryUsed, opSymmetric|sFlag); - return; + break; case 0x1: // EOR Comp_ArithTriOp(XOR, rd, rn, op2, carryUsed, opSymmetric|sFlag); - return; + break; case 0x2: // SUB Comp_ArithTriOp(SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); - return; + break; case 0x3: // RSB if (op2.IsZero()) { @@ -173,41 +173,44 @@ void Compiler::A_Comp_Arith() } else Comp_ArithTriOpReverse(SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); - return; + break; case 0x4: // ADD Comp_ArithTriOp(ADD, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV); - return; + break; case 0x5: // ADC Comp_ArithTriOp(ADC, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry); - return; + break; case 0x6: // SBC Comp_ArithTriOp(SBB, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry|opInvertCarry); - return; + break; case 0x7: // RSC Comp_ArithTriOpReverse(SBB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry|opSyncCarry); - return; + break; case 0xC: // ORR Comp_ArithTriOp(OR, rd, rn, op2, carryUsed, opSymmetric|sFlag); - return; + break; case 0xE: // BIC Comp_ArithTriOp(AND, rd, rn, op2, carryUsed, sFlag|opSymmetric|opInvertOp2); - return; + break; default: assert("unimplemented"); } + + if (CurInstr.A_Reg(12) == 15) + Comp_JumpTo(rd.GetSimpleReg(), S); } void Compiler::A_Comp_MovOp() { bool carryUsed; - bool S = CurrentInstr.Instr & (1 << 20); + bool S = CurInstr.Instr & (1 << 20); OpArg op2 = A_Comp_GetALUOp2(S, carryUsed); - OpArg rd = MapReg(CurrentInstr.A_Reg(12)); + OpArg rd = MapReg(CurInstr.A_Reg(12)); if (rd != op2) MOV(32, rd, op2); - if (((CurrentInstr.Instr >> 21) & 0xF) == 0xF) + if (((CurInstr.Instr >> 21) & 0xF) == 0xF) NOT(32, rd); if (S) @@ -215,6 +218,9 @@ void Compiler::A_Comp_MovOp() TEST(32, rd, rd); Comp_RetriveFlags(false, false, carryUsed); } + + if (CurInstr.A_Reg(12) == 15) + Comp_JumpTo(rd.GetSimpleReg(), S); } void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) @@ -230,7 +236,7 @@ void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) } if (carryUsed == 983298) - printf("etwas ist faul im lande daenemark %x\n", CurrentInstr.Instr); + printf("etwas ist faul im lande daenemark %x\n", CurInstr.Instr); SETcc(CC_S, R(RSCRATCH)); SETcc(CC_Z, R(RSCRATCH3)); @@ -324,61 +330,61 @@ OpArg Compiler::Comp_RegShiftImm(int op, int amount, OpArg rm, bool S, bool& car switch (op) { - case 0: // LSL - if (amount > 0) - { - MOV(32, R(RSCRATCH), rm); - SHL(32, R(RSCRATCH), Imm8(amount)); - if (S) - SETcc(CC_C, R(RSCRATCH2)); - - return R(RSCRATCH); - } - else - { - carryUsed = false; - return rm; - } - case 1: // LSR - if (amount > 0) - { - MOV(32, R(RSCRATCH), rm); - SHR(32, R(RSCRATCH), Imm8(amount)); - if (S) - SETcc(CC_C, R(RSCRATCH2)); - return R(RSCRATCH); - } - else - { - if (S) - { - MOV(32, R(RSCRATCH2), rm); - SHR(32, R(RSCRATCH2), Imm8(31)); - } - return Imm32(0); - } - case 2: // ASR + case 0: // LSL + if (amount > 0) + { MOV(32, R(RSCRATCH), rm); - SAR(32, R(RSCRATCH), Imm8(amount ? 
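/* ASR #0 encodes ASR #32; SAR by 31 already yields the full sign fill, and the carry is fixed up below via BT */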
amount : 31)); + SHL(32, R(RSCRATCH), Imm8(amount)); if (S) - { - if (amount == 0) - BT(32, rm, Imm8(31)); SETcc(CC_C, R(RSCRATCH2)); - } + return R(RSCRATCH); - case 3: // ROR + } + else + { + carryUsed = false; + return rm; + } + case 1: // LSR + if (amount > 0) + { MOV(32, R(RSCRATCH), rm); - if (amount > 0) - ROR_(32, R(RSCRATCH), Imm8(amount)); - else - { - BT(32, R(RCPSR), Imm8(29)); - RCR(32, R(RSCRATCH), Imm8(1)); - } + SHR(32, R(RSCRATCH), Imm8(amount)); if (S) SETcc(CC_C, R(RSCRATCH2)); return R(RSCRATCH); + } + else + { + if (S) + { + MOV(32, R(RSCRATCH2), rm); + SHR(32, R(RSCRATCH2), Imm8(31)); + } + return Imm32(0); + } + case 2: // ASR + MOV(32, R(RSCRATCH), rm); + SAR(32, R(RSCRATCH), Imm8(amount ? amount : 31)); + if (S) + { + if (amount == 0) + BT(32, rm, Imm8(31)); + SETcc(CC_C, R(RSCRATCH2)); + } + return R(RSCRATCH); + case 3: // ROR + MOV(32, R(RSCRATCH), rm); + if (amount > 0) + ROR_(32, R(RSCRATCH), Imm8(amount)); + else + { + BT(32, R(RCPSR), Imm8(29)); + RCR(32, R(RSCRATCH), Imm8(1)); + } + if (S) + SETcc(CC_C, R(RSCRATCH2)); + return R(RSCRATCH); } assert(false); @@ -386,11 +392,11 @@ OpArg Compiler::Comp_RegShiftImm(int op, int amount, OpArg rm, bool S, bool& car void Compiler::T_Comp_ShiftImm() { - OpArg rd = MapReg(CurrentInstr.T_Reg(0)); - OpArg rs = MapReg(CurrentInstr.T_Reg(3)); + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rs = MapReg(CurInstr.T_Reg(3)); - int op = (CurrentInstr.Instr >> 11) & 0x3; - int amount = (CurrentInstr.Instr >> 6) & 0x1F; + int op = (CurInstr.Instr >> 11) & 0x3; + int amount = (CurInstr.Instr >> 6) & 0x1F; Comp_AddCycles_C(); @@ -406,12 +412,12 @@ void Compiler::T_Comp_ShiftImm() void Compiler::T_Comp_AddSub_() { - OpArg rd = MapReg(CurrentInstr.T_Reg(0)); - OpArg rs = MapReg(CurrentInstr.T_Reg(3)); + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rs = MapReg(CurInstr.T_Reg(3)); - int op = (CurrentInstr.Instr >> 9) & 0x3; + int op = (CurInstr.Instr >> 9) & 0x3; - OpArg rn = op >= 2 ? Imm32((CurrentInstr.Instr >> 6) & 0x7) : MapReg(CurrentInstr.T_Reg(6)); + OpArg rn = op >= 2 ? 
Imm32((CurInstr.Instr >> 6) & 0x7) : MapReg(CurInstr.T_Reg(6)); Comp_AddCycles_C(); @@ -423,38 +429,38 @@ void Compiler::T_Comp_AddSub_() void Compiler::T_Comp_ALU_Imm8() { - OpArg rd = MapReg(CurrentInstr.T_Reg(8)); + OpArg rd = MapReg(CurInstr.T_Reg(8)); - u32 op = (CurrentInstr.Instr >> 11) & 0x3; - OpArg imm = Imm32(CurrentInstr.Instr & 0xFF); + u32 op = (CurInstr.Instr >> 11) & 0x3; + OpArg imm = Imm32(CurInstr.Instr & 0xFF); Comp_AddCycles_C(); switch (op) { - case 0x0: - MOV(32, rd, imm); - TEST(32, rd, rd); - Comp_RetriveFlags(false, false, false); - return; - case 0x1: - Comp_CmpOp(2, rd, imm, false); - return; - case 0x2: - Comp_ArithTriOp(ADD, rd, rd, imm, false, opSetsFlags|opSymmetric|opRetriveCV); - return; - case 0x3: - Comp_ArithTriOp(SUB, rd, rd, imm, false, opSetsFlags|opInvertCarry|opRetriveCV); - return; + case 0x0: + MOV(32, rd, imm); + TEST(32, rd, rd); + Comp_RetriveFlags(false, false, false); + return; + case 0x1: + Comp_CmpOp(2, rd, imm, false); + return; + case 0x2: + Comp_ArithTriOp(ADD, rd, rd, imm, false, opSetsFlags|opSymmetric|opRetriveCV); + return; + case 0x3: + Comp_ArithTriOp(SUB, rd, rd, imm, false, opSetsFlags|opInvertCarry|opRetriveCV); + return; } } void Compiler::T_Comp_ALU() { - OpArg rd = MapReg(CurrentInstr.T_Reg(0)); - OpArg rs = MapReg(CurrentInstr.T_Reg(3)); + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rs = MapReg(CurInstr.T_Reg(3)); - u32 op = (CurrentInstr.Instr >> 6) & 0xF; + u32 op = (CurInstr.Instr >> 6) & 0xF; if ((op >= 0x2 && op < 0x4) || op == 0x7) Comp_AddCycles_CI(1); @@ -522,28 +528,62 @@ void Compiler::T_Comp_ALU() void Compiler::T_Comp_ALU_HiReg() { - OpArg rd = MapReg(((CurrentInstr.Instr & 0x7) | ((CurrentInstr.Instr >> 4) & 0x8))); - OpArg rs = MapReg((CurrentInstr.Instr >> 3) & 0xF); + u32 rd = ((CurInstr.Instr & 0x7) | ((CurInstr.Instr >> 4) & 0x8)); + OpArg rdMapped = MapReg(rd); + OpArg rs = MapReg((CurInstr.Instr >> 3) & 0xF); - u32 op = (CurrentInstr.Instr >> 8) & 0x3; + u32 op = (CurInstr.Instr >> 8) & 0x3; Comp_AddCycles_C(); switch (op) { - case 0x0: // ADD - Comp_ArithTriOp(ADD, rd, rd, rs, false, opSymmetric|opRetriveCV); - return; - case 0x1: // CMP - Comp_CmpOp(2, rd, rs, false); - return; - case 0x2: // MOV - if (rd != rs) - MOV(32, rd, rs); - TEST(32, rd, rd); - Comp_RetriveFlags(false, false, false); - return; + case 0x0: // ADD + Comp_ArithTriOp(ADD, rdMapped, rdMapped, rs, false, opSymmetric|opRetriveCV); + break; + case 0x1: // CMP + Comp_CmpOp(2, rdMapped, rs, false); + return; // this is on purpose + case 0x2: // MOV + if (rdMapped != rs) + MOV(32, rdMapped, rs); + TEST(32, rdMapped, rdMapped); + Comp_RetriveFlags(false, false, false); + break; + } + + if (rd == 15) + { + OR(32, rdMapped, Imm8(1)); + Comp_JumpTo(rdMapped.GetSimpleReg()); } } +void Compiler::T_Comp_AddSP() +{ + Comp_AddCycles_C(); + + OpArg sp = MapReg(13); + OpArg offset = Imm32((CurInstr.Instr & 0x7F) << 2); + if (CurInstr.Instr & (1 << 7)) + SUB(32, sp, offset); + else + ADD(32, sp, offset); +} + +void Compiler::T_Comp_RelAddr() +{ + Comp_AddCycles_C(); + + OpArg rd = MapReg(CurInstr.T_Reg(8)); + u32 offset = (CurInstr.Instr & 0xFF) << 2; + if (CurInstr.Instr & (1 << 11)) + { + OpArg sp = MapReg(13); + LEA(32, rd.GetSimpleReg(), MDisp(sp.GetSimpleReg(), offset)); + } + else + MOV(32, rd, Imm32((R15 & ~2) + offset)); +} + } \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 9096397..b7358a2 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ 
b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -9,7 +9,7 @@ using namespace Gen; namespace ARMJIT { template <> -const X64Reg RegCache::NativeRegAllocOrder[] = +const X64Reg RegCache::NativeRegAllocOrder[] = { #ifdef _WIN32 RBX, RSI, RDI, R12, R13 @@ -18,7 +18,7 @@ const X64Reg RegCache::NativeRegAllocOrder[] = #endif }; template <> -const int RegCache::NativeRegsAvailable = +const int RegCache::NativeRegsAvailable = #ifdef _WIN32 5 #else @@ -30,24 +30,33 @@ Compiler::Compiler() { AllocCodeSpace(1024 * 1024 * 16); - for (int i = 0; i < 15; i++) + for (int i = 0; i < 3; i++) { - ReadMemFuncs9[i] = Gen_MemoryRoutine9(false, 32, 0x1000000 * i); - WriteMemFuncs9[i] = Gen_MemoryRoutine9(true, 32, 0x1000000 * i); for (int j = 0; j < 2; j++) { - ReadMemFuncs7[j][i] = Gen_MemoryRoutine7(false, 32, j, 0x1000000 * i); - WriteMemFuncs7[j][i] = Gen_MemoryRoutine7(true, 32, j, 0x1000000 * i); + MemoryFuncs9[i][j] = Gen_MemoryRoutine9(j, 8 << i); + MemoryFuncs7[i][j][0] = Gen_MemoryRoutine7(j, false, 8 << i); + MemoryFuncs7[i][j][1] = Gen_MemoryRoutine7(j, true, 8 << i); } } - ReadMemFuncs9[15] = Gen_MemoryRoutine9(false, 32, 0xFF000000); - WriteMemFuncs9[15] = Gen_MemoryRoutine9(true, 32, 0xFF000000); - ReadMemFuncs7[15][0] = ReadMemFuncs7[15][1] = Gen_MemoryRoutine7(false, 32, false, 0xFF000000); - WriteMemFuncs7[15][0] = WriteMemFuncs7[15][1] = Gen_MemoryRoutine7(true, 32, false, 0xFF000000); ResetStart = GetWritableCodePtr(); } +DataRegion Compiler::ClassifyAddress(u32 addr) +{ + if (Num == 0 && addr >= ((ARMv5*)CurCPU)->DTCMBase && addr < ((ARMv5*)CurCPU)->DTCMBase + ((ARMv5*)CurCPU)->DTCMSize) + return dataRegionDTCM; + switch (addr & 0xFF000000) + { + case 0x02000000: return dataRegionMainRAM; + case 0x03000000: return Num == 1 && (addr & 0xF00000) == 0x800000 ? dataRegionWRAM7 : dataRegionSWRAM; + case 0x04000000: return dataRegionIO; + case 0x06000000: return dataRegionVRAM; + } + return dataRegionGeneric; +} + void Compiler::LoadCPSR() { assert(!CPSRDirty); @@ -92,6 +101,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs Num = cpu->Num; R15 = cpu->R[15]; CodeRegion = cpu->CodeRegion; + CurCPU = cpu; ABI_PushRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS}, 8, 16); @@ -106,27 +116,32 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs for (int i = 0; i < instrsCount; i++) { R15 += Thumb ?
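// The (fixed) DTCM test in ClassifyAddress, like the SUB/CMP/J_CC(CC_B)
// sequence in Gen_MemoryRoutine9 further down, folds the two-sided range
// check into a single unsigned compare. A minimal sketch, with base/size
// standing in for the ARMv5 DTCMBase/DTCMSize fields:

#include <cstdint>

static inline bool InDTCM(uint32_t addr, uint32_t base, uint32_t size)
{
    return addr - base < size; // wrap-around form of base <= addr < base + size
}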
2 : 4; - CurrentInstr = instrs[i]; - - CompileFunc comp = GetCompFunc(CurrentInstr.Info.Kind); + CurInstr = instrs[i]; - if (CurrentInstr.Info.Branches()) - comp = NULL; + CompileFunc comp = GetCompFunc(CurInstr.Info.Kind); if (comp == NULL || i == instrsCount - 1) { MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(R15)); - MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurrentInstr.CodeCycles)); - MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurrentInstr.Instr)); + MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurInstr.CodeCycles)); + MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurInstr.Instr)); if (i == instrsCount - 1) { - MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[0])), Imm32(CurrentInstr.NextInstr[0])); - MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[1])), Imm32(CurrentInstr.NextInstr[1])); + MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[0])), Imm32(CurInstr.NextInstr[0])); + MOV(32, MDisp(RCPU, offsetof(ARM, NextInstr[1])), Imm32(CurInstr.NextInstr[1])); } - SaveCPSR(); + if (comp == NULL || CurInstr.Info.Branches()) + SaveCPSR(); } + // run interpreter + cpu->CodeCycles = CurInstr.CodeCycles; + cpu->R[15] = R15; + cpu->CurInstr = CurInstr.Instr; + cpu->NextInstr[0] = CurInstr.NextInstr[0]; + cpu->NextInstr[1] = CurInstr.NextInstr[1]; + if (comp != NULL) RegCache.Prepare(i); else @@ -134,26 +149,33 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs if (Thumb) { + u32 icode = (CurInstr.Instr >> 6) & 0x3FF; if (comp == NULL) { MOV(64, R(ABI_PARAM1), R(RCPU)); - u32 icode = (CurrentInstr.Instr >> 6) & 0x3FF; ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]); } else (this->*comp)(); + + ARMInterpreter::THUMBInstrTable[icode](cpu); } else { - u32 cond = CurrentInstr.Cond(); - if (CurrentInstr.Info.Kind == ARMInstrInfo::ak_BLX_IMM) + u32 cond = CurInstr.Cond(); + if (CurInstr.Info.Kind == ARMInstrInfo::ak_BLX_IMM) { MOV(64, R(ABI_PARAM1), R(RCPU)); ABI_CallFunction(ARMInterpreter::A_BLX_IMM); + + ARMInterpreter::A_BLX_IMM(cpu); } else if (cond == 0xF) + { Comp_AddCycles_C(); + cpu->AddCycles_C(); + } else { FixupBranch skipExecute; @@ -180,18 +202,18 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs } + u32 icode = ((CurInstr.Instr >> 4) & 0xF) | ((CurInstr.Instr >> 16) & 0xFF0); if (comp == NULL) { MOV(64, R(ABI_PARAM1), R(RCPU)); - u32 icode = ((CurrentInstr.Instr >> 4) & 0xF) | ((CurrentInstr.Instr >> 16) & 0xFF0); ABI_CallFunction(ARMInterpreter::ARMInstrTable[icode]); } else (this->*comp)(); FixupBranch skipFailed; - if (CurrentInstr.Cond() < 0xE) + if (CurInstr.Cond() < 0xE) { skipFailed = J(); SetJumpTarget(skipExecute); @@ -200,13 +222,17 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs SetJumpTarget(skipFailed); } + + if (cpu->CheckCondition(cond)) + ARMInterpreter::ARMInstrTable[icode](cpu); + else + cpu->AddCycles_C(); } } /* we don't need to collect the interpreted cycles, - since all functions only add to it, the dispatcher - takes care of it. + since cpu->Cycles is taken into account by the dispatcher. 
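// How the interpreter table indices above are derived, as standalone helpers
// (a sketch; the bit layout is taken straight from the icode lines in this hunk):

#include <cstdint>

static inline uint32_t ThumbICode(uint32_t instr)
{
    return (instr >> 6) & 0x3FF; // the top 10 bits select the handler
}

static inline uint32_t ARMICode(uint32_t instr)
{
    // bits 27-20 and 7-4 together identify an ARM instruction
    return ((instr >> 4) & 0xF) | ((instr >> 16) & 0xFF0);
}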
*/ if (comp == NULL && i != instrsCount - 1) @@ -277,29 +303,29 @@ CompileFunc Compiler::GetCompFunc(int kind) // Mul NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, // ARMv5 stuff - NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, // STR A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, // STRB - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, // LDR A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, // LDRB - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, // STRH - NULL, NULL, NULL, NULL, + A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, // LDRD NULL, NULL, NULL, NULL, // STRD NULL, NULL, NULL, NULL, // LDRH - NULL, NULL, NULL, NULL, + A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, // LDRSB - NULL, NULL, NULL, NULL, + A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, // LDRSH - NULL, NULL, NULL, NULL, + A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, // swap - NULL, NULL, + NULL, NULL, // LDM/STM NULL, NULL, // Branch @@ -314,26 +340,26 @@ CompileFunc Compiler::GetCompFunc(int kind) // Three operand ADD/SUB T_Comp_AddSub_, T_Comp_AddSub_, T_Comp_AddSub_, T_Comp_AddSub_, // 8 bit imm - T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, + T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, // general ALU - T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, - T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, + T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, + T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, NULL, T_Comp_ALU, T_Comp_ALU, // hi reg T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, // pc/sp relative - NULL, NULL, NULL, + T_Comp_RelAddr, T_Comp_RelAddr, T_Comp_AddSP, // LDR pcrel - NULL, + NULL, // LDR/STR reg offset - T_Comp_MemReg, NULL, T_Comp_MemReg, NULL, - // LDR/STR sign extended, half - NULL, NULL, NULL, NULL, + T_Comp_MemReg, T_Comp_MemReg, T_Comp_MemReg, T_Comp_MemReg, + // LDR/STR sign extended, half + T_Comp_MemRegHalf, T_Comp_MemRegHalf, T_Comp_MemRegHalf, T_Comp_MemRegHalf, // LDR/STR imm offset - T_Comp_MemImm, T_Comp_MemImm, NULL, NULL, + T_Comp_MemImm, T_Comp_MemImm, T_Comp_MemImm, T_Comp_MemImm, // LDR/STR half imm offset - NULL, NULL, + T_Comp_MemImmHalf, T_Comp_MemImmHalf, // branch, etc. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -346,10 +372,10 @@ CompileFunc Compiler::GetCompFunc(int kind) void Compiler::Comp_AddCycles_C() { s32 cycles = Num ? - NDS::ARM7MemTimings[CurrentInstr.CodeCycles][Thumb ? 1 : 3] - : ((R15 & 0x2) ? 0 : CurrentInstr.CodeCycles); + NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 1 : 3] + : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles); - if (CurrentInstr.Cond() < 0xE) + if (CurInstr.Cond() < 0xE) ADD(32, R(RCycles), Imm8(cycles)); else ConstantCycles += cycles; @@ -358,13 +384,26 @@ void Compiler::Comp_AddCycles_C() void Compiler::Comp_AddCycles_CI(u32 i) { s32 cycles = (Num ? - NDS::ARM7MemTimings[CurrentInstr.CodeCycles][Thumb ? 
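// The [Thumb ? 1 : 3] subscript here and the [Thumb ? 0 : 2] one in
// Comp_AddCycles_CI suggest the following row layout for NDS::ARM7MemTimings;
// note this is inferred from how the indices pair up, not stated in the patch:

// hypothetical names for the four columns of one ARM7MemTimings row
enum ARM7TimingColumn
{
    timing16BitNonSeq = 0, // Thumb-width access, nonsequential
    timing16BitSeq    = 1, // Thumb-width access, sequential
    timing32BitNonSeq = 2, // ARM-width access, nonsequential
    timing32BitSeq    = 3, // ARM-width access, sequential
};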
0 : 2] - : ((R15 & 0x2) ? 0 : CurrentInstr.CodeCycles)) + i; - - if (CurrentInstr.Cond() < 0xE) + NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] + : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles)) + i; + + if (CurInstr.Cond() < 0xE) ADD(32, R(RCycles), Imm8(cycles)); else ConstantCycles += cycles; } +void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) +{ + SaveCPSR(); + + MOV(64, R(ABI_PARAM1), R(RCPU)); + MOV(32, R(ABI_PARAM2), R(addr)); + MOV(32, R(ABI_PARAM3), Imm32(restoreCPSR)); + if (Num == 0) + CALL((void*)&ARMv5::JumpTo); + else + CALL((void*)&ARMv4::JumpTo); +} + } \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 7ab9b25..9395a29 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -6,6 +6,8 @@ #include "../ARMJIT.h" #include "../ARMJIT_RegCache.h" +#include + namespace ARMJIT { @@ -21,6 +23,19 @@ class Compiler; typedef void (Compiler::*CompileFunc)(); +enum DataRegion +{ + dataRegionGeneric, // hey, that's me! + dataRegionMainRAM, + dataRegionSWRAM, + dataRegionVRAM, + dataRegionIO, + dataRegionExclusive, + dataRegionsCount, + dataRegionDTCM = dataRegionExclusive, + dataRegionWRAM7 = dataRegionExclusive, +}; + class Compiler : public Gen::X64CodeBlock { public: @@ -34,6 +49,8 @@ public: private: CompileFunc GetCompFunc(int kind); + void Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR = false); + void Comp_AddCycles_C(); void Comp_AddCycles_CI(u32 i); @@ -47,11 +64,14 @@ private: opInvertOp2 = 1 << 5, }; + DataRegion ClassifyAddress(u32 addr); + void A_Comp_Arith(); void A_Comp_MovOp(); void A_Comp_CmpOp(); void A_Comp_MemWB(); + void A_Comp_MemHalf(); void T_Comp_ShiftImm(); void T_Comp_AddSub_(); @@ -59,8 +79,15 @@ private: void T_Comp_ALU(); void T_Comp_ALU_HiReg(); + void T_Comp_RelAddr(); + void T_Comp_AddSP(); + void T_Comp_MemReg(); void T_Comp_MemImm(); + void T_Comp_MemRegHalf(); + void T_Comp_MemImmHalf(); + + void Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size); void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); @@ -70,8 +97,8 @@ private: void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed); - void* Gen_MemoryRoutine9(bool store, int size, u32 region); - void* Gen_MemoryRoutine7(bool store, int size, bool mainRAMCode, u32 region); + void* Gen_MemoryRoutine9(bool store, int size); + void* Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size); Gen::OpArg Comp_RegShiftImm(int op, int amount, Gen::OpArg rm, bool S, bool& carryUsed); Gen::OpArg Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed); @@ -92,10 +119,12 @@ private: } void* ResetStart; + void* MemoryFuncs9[3][2]; + void* MemoryFuncs7[3][2][2]; bool CPSRDirty = false; - FetchedInstr CurrentInstr; + FetchedInstr CurInstr; RegCache RegCache; @@ -105,12 +134,9 @@ private: u32 CodeRegion; u32 ConstantCycles; -}; -extern void* ReadMemFuncs9[16]; -extern void* ReadMemFuncs7[2][16]; -extern void* WriteMemFuncs9[16]; -extern void* WriteMemFuncs7[2][16]; + ARM* CurCPU; +}; } diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index d534269..69746e2 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -5,7 +5,6 @@ namespace NDS { -#define MAIN_RAM_SIZE 0x400000 extern u8* SWRAM_ARM9; extern u32 SWRAM_ARM9Mask; extern u8* SWRAM_ARM7; @@ -19,11 +18,6 @@ using 
namespace Gen; namespace ARMJIT { -void* ReadMemFuncs9[16]; -void* ReadMemFuncs7[2][16]; -void* WriteMemFuncs9[16]; -void* WriteMemFuncs7[2][16]; - template <typename T> int squeezePointer(T* ptr) { @@ -32,569 +26,434 @@ int squeezePointer(T* ptr) return truncated; } -u32 ReadVRAM9(u32 addr) -{ - switch (addr & 0x00E00000) - { - case 0x00000000: return GPU::ReadVRAM_ABG(addr); - case 0x00200000: return GPU::ReadVRAM_BBG(addr); - case 0x00400000: return GPU::ReadVRAM_AOBJ(addr); - case 0x00600000: return GPU::ReadVRAM_BOBJ(addr); - default: return GPU::ReadVRAM_LCDC(addr); - } -} +/* + According to DeSmuME and my own research, approx. 99% (seriously, that's an empirical number) + of all memory load and store instructions always access addresses in the same region as + during their first execution. -void WriteVRAM9(u32 addr, u32 val) -{ - switch (addr & 0x00E00000) - { - case 0x00000000: GPU::WriteVRAM_ABG(addr, val); return; - case 0x00200000: GPU::WriteVRAM_BBG(addr, val); return; - case 0x00400000: GPU::WriteVRAM_AOBJ(addr, val); return; - case 0x00600000: GPU::WriteVRAM_BOBJ(addr, val); return; - default: GPU::WriteVRAM_LCDC(addr, val); return; - } -} + I tried multiple optimisations, which would benefit from this behaviour + (having fast paths for the first region, …), though none of them yielded a measurable + improvement. +*/ /* - R11 - data to write (store only) - RSCRATCH2 - address - RSCRATCH3 - code cycles + address - ABI_PARAM1 (a.k.a. ECX = RSCRATCH3 on Windows) + store value - ABI_PARAM2 (a.k.a. RDX = RSCRATCH2 on Windows) + code cycles - ABI_PARAM3 */ -void* Compiler::Gen_MemoryRoutine9(bool store, int size, u32 region) +void* Compiler::Gen_MemoryRoutine9(bool store, int size) { + u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 1 : 0)); AlignCode4(); - void* res = (void*)GetWritableCodePtr(); + void* res = GetWritableCodePtr(); - if (!store) - { - MOV(32, R(RSCRATCH), R(RSCRATCH2)); - AND(32, R(RSCRATCH), Imm8(0x3)); - SHL(32, R(RSCRATCH), Imm8(3)); - // enter the shadow realm! - MOV(32, MDisp(RSP, 8), R(RSCRATCH)); - } + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + SUB(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); + CMP(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMSize))); + FixupBranch insideDTCM = J_CC(CC_B); - // cycle counting! - // this is AddCycles_CDI - MOV(32, R(R10), R(RSCRATCH2)); - SHR(32, R(R10), Imm8(12)); - MOVZX(32, 8, R10, MComplex(RCPU, R10, SCALE_1, offsetof(ARMv5, MemTimings) + 2)); - LEA(32, RSCRATCH, MComplex(RSCRATCH3, R10, SCALE_1, -6)); - CMP(32, R(R10), R(RSCRATCH3)); - CMOVcc(32, RSCRATCH3, R(R10), CC_G); - CMP(32, R(RSCRATCH), R(RSCRATCH3)); - CMOVcc(32, RSCRATCH3, R(RSCRATCH), CC_G); - ADD(32, R(RCycles), R(RSCRATCH3)); - - if (!store) - XOR(32, R(RSCRATCH), R(RSCRATCH)); - AND(32, R(RSCRATCH2), Imm32(~3)); + CMP(32, R(ABI_PARAM1), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); + FixupBranch insideITCM = J_CC(CC_B); + // cycle counting! + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + SHR(32, R(RSCRATCH), Imm8(12)); + MOVZX(32, 8, RSCRATCH, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, MemTimings) + (size == 32 ?
2 : 0))); + LEA(32, ABI_PARAM4, MComplex(RSCRATCH, ABI_PARAM3, SCALE_1, -6)); + CMP(32, R(ABI_PARAM3), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(ABI_PARAM3), CC_G); + CMP(32, R(ABI_PARAM4), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(ABI_PARAM4), CC_G); + ADD(32, R(RCycles), R(RSCRATCH)); + + if (store) { - MOV(32, R(RSCRATCH3), R(RSCRATCH2)); - SUB(32, R(RSCRATCH2), MDisp(RCPU, offsetof(ARMv5, DTCMBase))); - CMP(32, R(RSCRATCH2), MDisp(RCPU, offsetof(ARMv5, DTCMSize))); - FixupBranch outsideDTCM = J_CC(CC_AE); - AND(32, R(RSCRATCH2), Imm32(0x3FFF)); - if (!store) + if (size > 8) + AND(32, R(ABI_PARAM1), Imm32(addressMask)); + switch (size) { - MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, DTCM))); - MOV(32, R(ECX), MDisp(RSP, 8)); - ROR_(32, R(RSCRATCH), R(ECX)); + case 32: JMP((u8*)NDS::ARM9Write32, true); break; + case 16: JMP((u8*)NDS::ARM9Write16, true); break; + case 8: JMP((u8*)NDS::ARM9Write8, true); break; } - else - MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, DTCM)), R(R11)); - RET(); - SetJumpTarget(outsideDTCM); - MOV(32, R(RSCRATCH2), R(RSCRATCH3)); } - - switch (region) + else { - case 0x00000000: - case 0x01000000: - { - CMP(32, R(RSCRATCH2), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); - FixupBranch insideITCM = J_CC(CC_B); - RET(); - SetJumpTarget(insideITCM); - AND(32, R(RSCRATCH2), Imm32(0x7FFF)); - if (!store) - MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, ITCM))); - else - { - MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_1, offsetof(ARMv5, ITCM)), R(R11)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM9_ITCM)), Imm32(0)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), Imm32(0)); - } - } - break; - case 0x02000000: - AND(32, R(RSCRATCH2), Imm32(MAIN_RAM_SIZE - 1)); - if (!store) - MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM))); - else - { - MOV(32, MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM)), R(R11)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM)), Imm32(0)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM) + 8), Imm32(0)); - } - break; - case 0x03000000: - { - MOV(64, R(RSCRATCH3), M(&NDS::SWRAM_ARM9)); - TEST(64, R(RSCRATCH3), R(RSCRATCH3)); - FixupBranch notMapped = J_CC(CC_Z); - AND(32, R(RSCRATCH2), M(&NDS::SWRAM_ARM9Mask)); - if (!store) - MOV(32, R(RSCRATCH), MRegSum(RSCRATCH2, RSCRATCH3)); - else - { - MOV(32, MRegSum(RSCRATCH2, RSCRATCH3), R(R11)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM)), Imm32(0)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM) + 8), Imm32(0)); - } - SetJumpTarget(notMapped); - } - break; - case 0x04000000: - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); - if (!store) - { - ABI_PushRegistersAndAdjustStack({}, 8, 0); - ABI_CallFunction(NDS::ARM9IORead32); - ABI_PopRegistersAndAdjustStack({}, 8, 0); - } - else - { - MOV(32, R(ABI_PARAM2), R(R11)); - JMP((u8*)NDS::ARM9IOWrite32, true); - } - break; - case 0x05000000: - { - MOV(32, R(RSCRATCH), Imm32(1<<1)); - MOV(32, R(RSCRATCH3), Imm32(1<<9)); - TEST(32, R(RSCRATCH2), Imm32(0x400)); - CMOVcc(32, RSCRATCH, R(RSCRATCH3), CC_NZ); - TEST(16, R(RSCRATCH), M(&NDS::PowerControl9)); - FixupBranch available = J_CC(CC_NZ); - RET(); - SetJumpTarget(available); - AND(32, R(RSCRATCH2), Imm32(0x7FF)); - if (!store) - MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(GPU::Palette))); - else - MOV(32, MDisp(RSCRATCH2, squeezePointer(GPU::Palette)), R(R11)); - } - break; - case 0x06000000: - MOV(32, 
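// What the LEA/CMP/CMOV chain above computes, spelled out in plain C++. This
// assumes the interpreter's ARM9 load timing (ARMv5::AddCycles_CDI) uses the
// same overlap formula; treat it as a model, not a second source of truth:

#include <algorithm>

// codeCycles arrives in ABI_PARAM3, dataCycles is the MemTimings byte in RSCRATCH
static inline int MergedLoadCycles9(int codeCycles, int dataCycles)
{
    int overlapped = codeCycles + dataCycles - 6; // the LEA with displacement -6
    return std::max(overlapped, std::max(codeCycles, dataCycles));
}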
R(ABI_PARAM1), R(RSCRATCH2)); - if (!store) - { - ABI_PushRegistersAndAdjustStack({}, 8); - ABI_CallFunction(ReadVRAM9); - ABI_PopRegistersAndAdjustStack({}, 8); - } - else - { - MOV(32, R(ABI_PARAM2), R(R11)); - JMP((u8*)WriteVRAM9, true); - } - break; - case 0x07000000: + if (size == 32) { - MOV(32, R(RSCRATCH), Imm32(1<<1)); - MOV(32, R(RSCRATCH3), Imm32(1<<9)); - TEST(32, R(RSCRATCH2), Imm32(0x400)); - CMOVcc(32, RSCRATCH, R(RSCRATCH3), CC_NZ); - TEST(16, R(RSCRATCH), M(&NDS::PowerControl9)); - FixupBranch available = J_CC(CC_NZ); + ABI_PushRegistersAndAdjustStack({ABI_PARAM1}, 8); + AND(32, R(ABI_PARAM1), Imm32(addressMask)); + // everything's already in the appropriate register + ABI_CallFunction(NDS::ARM9Read32); + ABI_PopRegistersAndAdjustStack({ECX}, 8); + AND(32, R(ECX), Imm8(3)); + SHL(32, R(ECX), Imm8(3)); + ROR_(32, R(RSCRATCH), R(ECX)); RET(); - SetJumpTarget(available); - AND(32, R(RSCRATCH2), Imm32(0x7FF)); - if (!store) - MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(GPU::OAM))); - else - MOV(32, MDisp(RSCRATCH2, squeezePointer(GPU::OAM)), R(R11)); } - break; - case 0x08000000: - case 0x09000000: - case 0x0A000000: - if (!store) - MOV(32, R(RSCRATCH), Imm32(0xFFFFFFFF)); - break; - case 0xFF000000: - if (!store) - { - AND(32, R(RSCRATCH2), Imm32(0xFFF)); - MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::ARM9BIOS))); - } - break; - default: - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); - if (!store) + else if (size == 16) { - ABI_PushRegistersAndAdjustStack({}, 8, 0); - ABI_CallFunction(NDS::ARM9Read32); - ABI_PopRegistersAndAdjustStack({}, 8, 0); + AND(32, R(ABI_PARAM1), Imm32(addressMask)); + JMP((u8*)NDS::ARM9Read16, true); } else + JMP((u8*)NDS::ARM9Read8, true); + } + + SetJumpTarget(insideDTCM); + ADD(32, R(RCycles), R(ABI_PARAM3)); + AND(32, R(RSCRATCH), Imm32(0x3FFF & addressMask)); + if (store) + MOV(size, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)), R(ABI_PARAM2)); + else + { + MOVZX(32, size, RSCRATCH, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM))); + if (size == 32) { - MOV(32, R(ABI_PARAM2), R(R11)); - JMP((u8*)NDS::ARM9Write32, true); + if (ABI_PARAM1 != ECX) + MOV(32, R(ECX), R(ABI_PARAM1)); + AND(32, R(ECX), Imm8(3)); + SHL(32, R(ECX), Imm8(3)); + ROR_(32, R(RSCRATCH), R(ECX)); } - break; } + RET(); - if (!store) + SetJumpTarget(insideITCM); + ADD(32, R(RCycles), R(ABI_PARAM3)); + MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); // free up ECX + AND(32, R(ABI_PARAM3), Imm32(0x7FFF & addressMask)); + if (store) { - MOV(32, R(ECX), MDisp(RSP, 8)); - ROR_(32, R(RSCRATCH), R(ECX)); + MOV(size, MComplex(RCPU, ABI_PARAM3, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM2)); + XOR(32, R(RSCRATCH), R(RSCRATCH)); + MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.ARM9_ITCM)), R(RSCRATCH)); + if (size == 32) + MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), R(RSCRATCH)); + } + else + { + MOVZX(32, size, RSCRATCH, MComplex(RCPU, ABI_PARAM3, SCALE_1, offsetof(ARMv5, ITCM))); + if (size == 32) + { + if (ABI_PARAM1 != ECX) + MOV(32, R(ECX), R(ABI_PARAM1)); + AND(32, R(ECX), Imm8(3)); + SHL(32, R(ECX), Imm8(3)); + ROR_(32, R(RSCRATCH), R(ECX)); + } } - RET(); + static_assert(RSCRATCH == EAX); + return res; } -void* Compiler::Gen_MemoryRoutine7(bool store, int size, bool mainRAMCode, u32 region) +void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size) { + u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 
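// The AND/SHL/ROR_ triple that keeps reappearing in the load paths implements
// the ARM9's rotated unaligned word loads. The same operation as a plain C++
// sketch:

#include <cstdint>

// val is the word fetched from the aligned address (addr & ~3)
static inline uint32_t RotateLoad32(uint32_t val, uint32_t addr)
{
    unsigned rot = (addr & 3) * 8; // misalignment in bits - the SHL by 3
    return rot ? (val >> rot) | (val << (32 - rot)) : val;
}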
1 : 0)); AlignCode4(); void* res = GetWritableCodePtr(); - if (!store) - { - MOV(32, R(RSCRATCH), R(RSCRATCH2)); - AND(32, R(RSCRATCH), Imm8(0x3)); - SHL(32, R(RSCRATCH), Imm8(3)); - // enter the shadow realm! - MOV(32, MDisp(RSP, 8), R(RSCRATCH)); - } - - // AddCycles_CDI - MOV(32, R(RSCRATCH), R(RSCRATCH2)); + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); SHR(32, R(RSCRATCH), Imm8(15)); - MOVZX(32, 8, RSCRATCH, MDisp(RSCRATCH, squeezePointer(NDS::ARM7MemTimings + 2))); - if ((region == 0x02000000 && mainRAMCode) || (region != 0x02000000 && !mainRAMCode)) + MOVZX(32, 8, ABI_PARAM4, MDisp(RSCRATCH, (size == 32 ? 2 : 0) + squeezePointer(NDS::ARM7MemTimings))); + + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + AND(32, R(RSCRATCH), Imm32(0xFF000000)); + CMP(32, R(RSCRATCH), Imm32(0x02000000)); + FixupBranch outsideMainRAM = J_CC(CC_NE); + if (codeMainRAM) { - if (!store && region != 0x02000000) - LEA(32, RSCRATCH3, MComplex(RSCRATCH, RSCRATCH3, SCALE_1, 1)); - ADD(32, R(RCycles), R(RSCRATCH3)); + LEA(32, RSCRATCH, MRegSum(ABI_PARAM4, ABI_PARAM3)); + ADD(32, R(RCycles), R(RSCRATCH)); } else { if (!store) - ADD(32, R(region == 0x02000000 ? RSCRATCH2 : RSCRATCH), Imm8(1)); - LEA(32, R10, MComplex(RSCRATCH, RSCRATCH3, SCALE_1, -3)); - CMP(32, R(RSCRATCH3), R(RSCRATCH)); - CMOVcc(32, RSCRATCH, R(RSCRATCH3), CC_G); - CMP(32, R(R10), R(RSCRATCH)); - CMOVcc(32, RSCRATCH, R(R10), CC_G); + ADD(32, R(ABI_PARAM3), Imm8(1)); + LEA(32, RSCRATCH, MComplex(ABI_PARAM4, ABI_PARAM3, SCALE_1, -3)); + CMP(32, R(ABI_PARAM4), R(ABI_PARAM3)); + CMOVcc(32, ABI_PARAM3, R(ABI_PARAM4), CC_G); + CMP(32, R(ABI_PARAM3), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(ABI_PARAM3), CC_G); ADD(32, R(RCycles), R(RSCRATCH)); } - - if (!store) + MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); + AND(32, R(ABI_PARAM3), Imm32((MAIN_RAM_SIZE - 1) & addressMask)); + if (store) + { + MOV(size, MDisp(ABI_PARAM3, squeezePointer(NDS::MainRAM)), R(ABI_PARAM2)); XOR(32, R(RSCRATCH), R(RSCRATCH)); - AND(32, R(RSCRATCH2), Imm32(~3)); + MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.MainRAM)), R(RSCRATCH)); + if (size == 32) + MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.MainRAM) + 8), R(RSCRATCH)); + } + else + { + MOVZX(32, size, RSCRATCH, MDisp(ABI_PARAM3, squeezePointer(NDS::MainRAM))); + if (size == 32) + { + if (ABI_PARAM1 != ECX) + MOV(32, R(ECX), R(ABI_PARAM1)); + AND(32, R(ECX), Imm8(3)); + SHL(32, R(ECX), Imm8(3)); + ROR_(32, R(RSCRATCH), R(ECX)); + } + } + RET(); - switch (region) + SetJumpTarget(outsideMainRAM); + if (codeMainRAM) + { + if (!store) + ADD(32, R(ABI_PARAM4), Imm8(1)); + LEA(32, RSCRATCH, MComplex(ABI_PARAM4, ABI_PARAM3, SCALE_1, -3)); + CMP(32, R(ABI_PARAM4), R(ABI_PARAM3)); + CMOVcc(32, ABI_PARAM3, R(ABI_PARAM4), CC_G); + CMP(32, R(ABI_PARAM3), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(ABI_PARAM3), CC_G); + ADD(32, R(RCycles), R(RSCRATCH)); + } + else + { + LEA(32, RSCRATCH, MComplex(ABI_PARAM4, ABI_PARAM3, SCALE_1, store ? 
0 : 1)); + ADD(32, R(RCycles), R(RSCRATCH)); + } + if (store) + { + if (size > 8) + AND(32, R(ABI_PARAM1), Imm32(addressMask)); + switch (size) + { + case 32: JMP((u8*)NDS::ARM7Write32, true); break; + case 16: JMP((u8*)NDS::ARM7Write16, true); break; + case 8: JMP((u8*)NDS::ARM7Write8, true); break; + } + } + else { - case 0x00000000: - if (!store) { - CMP(32, R(RSCRATCH2), Imm32(0x4000)); - FixupBranch outsideBIOS1 = J_CC(CC_AE); - - MOV(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARM, R[15]))); - CMP(32, R(RSCRATCH), Imm32(0x4000)); - FixupBranch outsideBIOS2 = J_CC(CC_AE); - MOV(32, R(RSCRATCH3), M(&NDS::ARM7BIOSProt)); - CMP(32, R(RSCRATCH2), R(RSCRATCH3)); - FixupBranch notDenied1 = J_CC(CC_AE); - CMP(32, R(RSCRATCH), R(RSCRATCH3)); - FixupBranch notDenied2 = J_CC(CC_B); - SetJumpTarget(outsideBIOS2); - MOV(32, R(RSCRATCH), Imm32(0xFFFFFFFF)); - RET(); - - SetJumpTarget(notDenied1); - SetJumpTarget(notDenied2); - MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::ARM7BIOS))); - MOV(32, R(ECX), MDisp(RSP, 8)); - ROR_(32, R(RSCRATCH), R(ECX)); - RET(); - - SetJumpTarget(outsideBIOS1); - } - break; - case 0x02000000: - AND(32, R(RSCRATCH2), Imm32(MAIN_RAM_SIZE - 1)); - if (!store) - MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM))); - else - { - MOV(32, MDisp(RSCRATCH2, squeezePointer(NDS::MainRAM)), R(R11)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM)), Imm32(0)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.MainRAM) + 8), Imm32(0)); - } - break; - case 0x03000000: - { - TEST(32, R(RSCRATCH2), Imm32(0x800000)); - FixupBranch region = J_CC(CC_NZ); - MOV(64, R(RSCRATCH), M(&NDS::SWRAM_ARM7)); - TEST(64, R(RSCRATCH), R(RSCRATCH)); - FixupBranch notMapped = J_CC(CC_Z); - AND(32, R(RSCRATCH2), M(&NDS::SWRAM_ARM7Mask)); - if (!store) - { - MOV(32, R(RSCRATCH), MRegSum(RSCRATCH, RSCRATCH2)); - MOV(32, R(ECX), MDisp(RSP, 8)); - ROR_(32, R(RSCRATCH), R(ECX)); - } - else - { - MOV(32, MRegSum(RSCRATCH, RSCRATCH2), R(R11)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM)), Imm32(0)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.SWRAM) + 8), Imm32(0)); - } - RET(); - SetJumpTarget(region); - SetJumpTarget(notMapped); - AND(32, R(RSCRATCH2), Imm32(0xFFFF)); - if (!store) - MOV(32, R(RSCRATCH), MDisp(RSCRATCH2, squeezePointer(NDS::ARM7WRAM))); - else - { - MOV(32, MDisp(RSCRATCH2, squeezePointer(NDS::ARM7WRAM)), R(R11)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM7_WRAM)), Imm32(0)); - MOV(64, MScaled(RSCRATCH2, SCALE_4, squeezePointer(cache.ARM7_WRAM) + 8), Imm32(0)); - } - } - break; - case 0x04000000: - { - TEST(32, R(RSCRATCH2), Imm32(0x800000)); - FixupBranch region = J_CC(CC_NZ); - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); - if (!store) - { - ABI_PushRegistersAndAdjustStack({}, 8); - ABI_CallFunction(NDS::ARM7IORead32); - ABI_PopRegistersAndAdjustStack({}, 8); - - MOV(32, R(ECX), MDisp(RSP, 8)); - ROR_(32, R(RSCRATCH), R(ECX)); - RET(); - } - else - { - MOV(32, R(ABI_PARAM2), R(R11)); - JMP((u8*)NDS::ARM7IOWrite32, true); - } - SetJumpTarget(region); - - if (!store) - { - ABI_PushRegistersAndAdjustStack({RSCRATCH2}, 8); - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); - ABI_CallFunction(Wifi::Read); - ABI_PopRegistersAndAdjustStack({RSCRATCH2}, 8); - - ADD(32, R(RSCRATCH2), Imm8(2)); - ABI_PushRegistersAndAdjustStack({EAX}, 8); - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); - ABI_CallFunction(Wifi::Read); - MOV(32, R(RSCRATCH2), R(EAX)); - SHL(32, R(RSCRATCH2), Imm8(16)); - 
ABI_PopRegistersAndAdjustStack({EAX}, 8); - OR(32, R(EAX), R(RSCRATCH2)); - } - else - { - ABI_PushRegistersAndAdjustStack({RSCRATCH2, R11}, 8); - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); - MOVZX(32, 16, ABI_PARAM2, R(R11)); - ABI_CallFunction(Wifi::Write); - ABI_PopRegistersAndAdjustStack({RSCRATCH2, R11}, 8); - SHR(32, R(R11), Imm8(16)); - ADD(32, R(RSCRATCH2), Imm8(2)); - ABI_PushRegistersAndAdjustStack({RSCRATCH2, R11}, 8); - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); - MOVZX(32, 16, ABI_PARAM2, R(R11)); - ABI_CallFunction(Wifi::Write); - ABI_PopRegistersAndAdjustStack({RSCRATCH2, R11}, 8); - } - } - break; - case 0x06000000: - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); - if (!store) - { - ABI_PushRegistersAndAdjustStack({}, 8); - ABI_CallFunction(GPU::ReadVRAM_ARM7); - ABI_PopRegistersAndAdjustStack({}, 8); - } - else - { - AND(32, R(ABI_PARAM1), Imm32(0x40000 - 1)); - MOV(64, MScaled(ABI_PARAM1, SCALE_4, squeezePointer(cache.ARM7_WVRAM)), Imm32(0)); - MOV(64, MScaled(ABI_PARAM1, SCALE_4, squeezePointer(cache.ARM7_WVRAM) + 8), Imm32(0)); - MOV(32, R(ABI_PARAM2), R(R11)); - JMP((u8*)GPU::WriteVRAM_ARM7, true); - } - break; - case 0x08000000: - case 0x09000000: - case 0x0A000000: - if (!store) - MOV(32, R(RSCRATCH), Imm32(0xFFFFFFFF)); - break; - /*default: - ABI_PushRegistersAndAdjustStack({}, 8, 0); - MOV(32, R(ABI_PARAM1), R(RSCRATCH2)); + if (size == 32) + { + ABI_PushRegistersAndAdjustStack({ABI_PARAM1}, 8); + AND(32, R(ABI_PARAM1), Imm32(addressMask)); ABI_CallFunction(NDS::ARM7Read32); - ABI_PopRegistersAndAdjustStack({}, 8, 0); - break;*/ + ABI_PopRegistersAndAdjustStack({ECX}, 8); + AND(32, R(ECX), Imm8(3)); + SHL(32, R(ECX), Imm8(3)); + ROR_(32, R(RSCRATCH), R(ECX)); + RET(); + } + else if (size == 16) + { + AND(32, R(ABI_PARAM1), Imm32(addressMask)); + JMP((u8*)NDS::ARM7Read16, true); + } + else + JMP((u8*)NDS::ARM7Read8, true); } + return res; +} + +void Compiler::Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size) +{ + if (store) + MOV(32, R(ABI_PARAM2), rd); + u32 cycles = Num + ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] + : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); + MOV(32, R(ABI_PARAM3), Imm32(cycles)); + CALL(Num == 0 + ? MemoryFuncs9[size >> 4][store] + : MemoryFuncs7[size >> 4][store][CodeRegion == 0x02]); + if (!store) { - MOV(32, R(ECX), MDisp(RSP, 8)); - ROR_(32, R(RSCRATCH), R(ECX)); + if (signExtend) + MOVSX(32, size, rd.GetSimpleReg(), R(RSCRATCH)); + else + MOVZX(32, size, rd.GetSimpleReg(), R(RSCRATCH)); } - - RET(); - - return res; } OpArg Compiler::A_Comp_GetMemWBOffset() { - if (!(CurrentInstr.Instr & (1 << 25))) - return Imm32(CurrentInstr.Instr & 0xFFF); + if (!(CurInstr.Instr & (1 << 25))) + { + u32 imm = CurInstr.Instr & 0xFFF; + return Imm32(imm); + } else { - int op = (CurrentInstr.Instr >> 5) & 0x3; - int amount = (CurrentInstr.Instr >> 7) & 0x1F; - OpArg rm = MapReg(CurrentInstr.A_Reg(0)); + int op = (CurInstr.Instr >> 5) & 0x3; + int amount = (CurInstr.Instr >> 7) & 0x1F; + OpArg rm = MapReg(CurInstr.A_Reg(0)); bool carryUsed; + return Comp_RegShiftImm(op, amount, rm, false, carryUsed); } } void Compiler::A_Comp_MemWB() -{ - OpArg rn = MapReg(CurrentInstr.A_Reg(16)); - OpArg rd = MapReg(CurrentInstr.A_Reg(12)); - bool load = CurrentInstr.Instr & (1 << 20); +{ + OpArg rn = MapReg(CurInstr.A_Reg(16)); + OpArg rd = MapReg(CurInstr.A_Reg(12)); + bool load = CurInstr.Instr & (1 << 20); + bool byte = CurInstr.Instr & (1 << 22); + int size = byte ? 
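// The size >> 4 subscript in Comp_MemAccess maps the three access widths onto
// consecutive array rows (the second index is 0 for loads, 1 for stores); a
// compile-time check makes the trick explicit:

static_assert((8 >> 4) == 0 && (16 >> 4) == 1 && (32 >> 4) == 2,
              "MemoryFuncs9/MemoryFuncs7 rows are indexed by access width");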
8 : 32; - MOV(32, R(RSCRATCH2), rn); - if (CurrentInstr.Instr & (1 << 24)) + if (CurInstr.Instr & (1 << 24)) { OpArg offset = A_Comp_GetMemWBOffset(); - if (CurrentInstr.Instr & (1 << 23)) - ADD(32, R(RSCRATCH2), offset); + if (CurInstr.Instr & (1 << 23)) + MOV_sum(32, ABI_PARAM1, rn, offset); else - SUB(32, R(RSCRATCH2), offset); + { + MOV(32, R(ABI_PARAM1), rn); + SUB(32, R(ABI_PARAM1), offset); + } - if (CurrentInstr.Instr & (1 << 21)) - MOV(32, rn, R(RSCRATCH2)); + if (CurInstr.Instr & (1 << 21)) + MOV(32, rn, R(ABI_PARAM1)); } - - u32 cycles = Num ? NDS::ARM7MemTimings[CurrentInstr.CodeCycles][2] : CurrentInstr.CodeCycles; - MOV(32, R(RSCRATCH3), Imm32(cycles)); - MOV(32, R(RSCRATCH), R(RSCRATCH2)); - SHR(32, R(RSCRATCH), Imm8(24)); - AND(32, R(RSCRATCH), Imm8(0xF)); - void** funcArray; - if (load) - funcArray = Num ? ReadMemFuncs7[CodeRegion == 0x02] : ReadMemFuncs9; else + MOV(32, R(ABI_PARAM1), rn); + + if (!(CurInstr.Instr & (1 << 24))) + { + OpArg offset = A_Comp_GetMemWBOffset(); + + if (CurInstr.Instr & (1 << 23)) + ADD(32, rn, offset); + else + SUB(32, rn, offset); + } + + Comp_MemAccess(rd, false, !load, byte ? 8 : 32); + if (load && CurInstr.A_Reg(12) == 15) + { + if (byte) + printf("!!! LDRB PC %08X\n", R15); + else + { + if (Num == 1) + AND(32, rd, Imm8(0xFE)); // immediate is sign extended + Comp_JumpTo(rd.GetSimpleReg()); + } + } +} + +void Compiler::A_Comp_MemHalf() +{ + OpArg rn = MapReg(CurInstr.A_Reg(16)); + OpArg rd = MapReg(CurInstr.A_Reg(12)); + + OpArg offset = CurInstr.Instr & (1 << 22) + ? Imm32((CurInstr.Instr & 0xF) | ((CurInstr.Instr >> 4) & 0xF0)) + : MapReg(CurInstr.A_Reg(0)); + + if (CurInstr.Instr & (1 << 24)) { - funcArray = Num ? WriteMemFuncs7[CodeRegion == 0x02] : WriteMemFuncs9; - MOV(32, R(R11), rd); + if (CurInstr.Instr & (1 << 23)) + MOV_sum(32, ABI_PARAM1, rn, offset); + else + { + MOV(32, R(ABI_PARAM1), rn); + SUB(32, R(ABI_PARAM1), offset); + } + + if (CurInstr.Instr & (1 << 21)) + MOV(32, rn, R(ABI_PARAM1)); } - CALLptr(MScaled(RSCRATCH, SCALE_8, squeezePointer(funcArray))); + else + MOV(32, R(ABI_PARAM1), rn); - if (load) - MOV(32, R(RSCRATCH2), R(RSCRATCH)); + int op = (CurInstr.Instr >> 5) & 0x3; + bool load = CurInstr.Instr & (1 << 20); - if (!(CurrentInstr.Instr & (1 << 24))) + bool signExtend = false; + int size; + if (!load && op == 1) + size = 16; + else if (load) { - OpArg offset = A_Comp_GetMemWBOffset(); + size = op == 2 ? 8 : 16; + signExtend = op > 1; + } - if (CurrentInstr.Instr & (1 << 23)) + if (!(CurInstr.Instr & (1 << 24))) + { + if (CurInstr.Instr & (1 << 23)) ADD(32, rn, offset); else SUB(32, rn, offset); } - if (load) - MOV(32, rd, R(RSCRATCH2)); + Comp_MemAccess(rd, signExtend, !load, size); + + if (load && CurInstr.A_Reg(12) == 15) + printf("!!! MemHalf op PC %08X\n", R15); } void Compiler::T_Comp_MemReg() { - OpArg rd = MapReg(CurrentInstr.T_Reg(0)); - OpArg rb = MapReg(CurrentInstr.T_Reg(3)); - OpArg ro = MapReg(CurrentInstr.T_Reg(6)); + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rb = MapReg(CurInstr.T_Reg(3)); + OpArg ro = MapReg(CurInstr.T_Reg(6)); - int op = (CurrentInstr.Instr >> 10) & 0x3; + int op = (CurInstr.Instr >> 10) & 0x3; bool load = op & 0x2; - - MOV(32, R(RSCRATCH2), rb); - ADD(32, R(RSCRATCH2), ro); - - u32 cycles = Num ? NDS::ARM7MemTimings[CurrentInstr.CodeCycles][0] : (R15 & 0x2 ?
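// The Imm32 expression in A_Comp_MemHalf above reassembles the split 8-bit
// offset of the ARM halfword/signed transfers, whose nibbles sit in bits 0-3
// and 8-11 of the instruction word. As a standalone helper (sketch):

#include <cstdint>

static inline uint32_t HalfwordTransferImm(uint32_t instr)
{
    return (instr & 0xF) | ((instr >> 4) & 0xF0);
}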
0 : CurrentInstr.CodeCycles); - MOV(32, R(RSCRATCH3), Imm32(cycles)); - MOV(32, R(RSCRATCH), R(RSCRATCH2)); - SHR(32, R(RSCRATCH), Imm8(24)); - AND(32, R(RSCRATCH), Imm8(0xF)); - void** funcArray; - if (load) - funcArray = Num ? ReadMemFuncs7[CodeRegion == 0x02] : ReadMemFuncs9; - else - { - funcArray = Num ? WriteMemFuncs7[CodeRegion == 0x02] : WriteMemFuncs9; - MOV(32, R(R11), rd); - } - CALLptr(MScaled(RSCRATCH, SCALE_8, squeezePointer(funcArray))); + bool byte = op & 0x1; + + MOV_sum(32, ABI_PARAM1, rb, ro); - if (load) - MOV(32, rd, R(RSCRATCH)); + Comp_MemAccess(rd, false, !load, byte ? 8 : 32); } void Compiler::T_Comp_MemImm() { - // TODO: aufräumen!!! - OpArg rd = MapReg(CurrentInstr.T_Reg(0)); - OpArg rb = MapReg(CurrentInstr.T_Reg(3)); - - int op = (CurrentInstr.Instr >> 11) & 0x3; - u32 offset = ((CurrentInstr.Instr >> 6) & 0x1F) * 4; + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rb = MapReg(CurInstr.T_Reg(3)); + + int op = (CurInstr.Instr >> 11) & 0x3; bool load = op & 0x1; + bool byte = op & 0x2; + u32 offset = ((CurInstr.Instr >> 6) & 0x1F) * (byte ? 1 : 4); - LEA(32, RSCRATCH2, MDisp(rb.GetSimpleReg(), offset)); - u32 cycles = Num ? NDS::ARM7MemTimings[CurrentInstr.CodeCycles][0] : (R15 & 0x2 ? 0 : CurrentInstr.CodeCycles); - MOV(32, R(RSCRATCH3), Imm32(cycles)); - MOV(32, R(RSCRATCH), R(RSCRATCH2)); - SHR(32, R(RSCRATCH), Imm8(24)); - AND(32, R(RSCRATCH), Imm8(0xF)); - void** funcArray; - if (load) - funcArray = Num ? ReadMemFuncs7[CodeRegion == 0x02] : ReadMemFuncs9; - else - { - funcArray = Num ? WriteMemFuncs7[CodeRegion == 0x02] : WriteMemFuncs9; - MOV(32, R(R11), rd); - } - CALLptr(MScaled(RSCRATCH, SCALE_8, squeezePointer(funcArray))); + LEA(32, ABI_PARAM1, MDisp(rb.GetSimpleReg(), offset)); + + Comp_MemAccess(rd, false, !load, byte ? 8 : 32); +} + +void Compiler::T_Comp_MemRegHalf() +{ + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rb = MapReg(CurInstr.T_Reg(3)); + OpArg ro = MapReg(CurInstr.T_Reg(6)); + + int op = (CurInstr.Instr >> 10) & 0x3; + bool load = op != 0; + int size = op != 1 ? 16 : 8; + bool signExtend = op & 1; + + MOV_sum(32, ABI_PARAM1, rb, ro); + + Comp_MemAccess(rd, signExtend, !load, size); +} + +void Compiler::T_Comp_MemImmHalf() +{ + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rb = MapReg(CurInstr.T_Reg(3)); + + u32 offset = (CurInstr.Instr >> 5) & 0x3E; + bool load = CurInstr.Instr & (1 << 11); + + LEA(32, ABI_PARAM1, MDisp(rb.GetSimpleReg(), offset)); - if (load) - MOV(32, rd, R(RSCRATCH)); + Comp_MemAccess(rd, false, !load, 16); } } \ No newline at end of file diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 41c46e1..32a9645 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -317,7 +317,7 @@ Info Decode(bool thumb, u32 num, u32 instr) else { u32 data = ARMInstrTable[((instr >> 4) & 0xF) | ((instr >> 16) & 0xFF0)]; - if ((instr & 0xFE000000) == 0xFA000000) + if (num == 0 && (instr & 0xFE000000) == 0xFA000000) data = A_BLX_IMM; if (data & A_ARM9Only && num != 0) diff --git a/src/NDS.cpp b/src/NDS.cpp index b8fd8cb..baa5e0d 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -524,6 +524,8 @@ void Reset() KeyCnt = 0; RCnt = 0; + ARMJIT::ResetBlocks(); + NDSCart::Reset(); GBACart::Reset(); GPU::Reset(); -- cgit v1.2.3 From 2efab201e936ab0f60baf1de8e957080141d2d93 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 12 Jul 2019 03:43:45 +0200 Subject: jit: LDM/STM finally(!) 
working + MUL, MLA and CLZ --- src/ARM.cpp | 7 +++ src/ARMJIT_x64/ARMJIT_ALU.cpp | 74 +++++++++++++++++++++++ src/ARMJIT_x64/ARMJIT_Branch.cpp | 7 +-- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 108 +++++++++++++++++++++++++++++++-- src/ARMJIT_x64/ARMJIT_Compiler.h | 14 ++++- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 116 +++++++++++++++++++++++++----------- 6 files changed, 279 insertions(+), 47 deletions(-) (limited to 'src/ARMJIT_x64/ARMJIT_ALU.cpp') diff --git a/src/ARM.cpp b/src/ARM.cpp index aca876d..a77fbc4 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -80,8 +80,15 @@ ARMv4::ARMv4() : ARM(1) // } +namespace ARMJIT {extern int instructionPopularityARM[ARMInstrInfo::ak_Count];} + void ARM::Reset() { + FILE* blabla = fopen("fhhg", "w"); + for (int i = 0; i < ARMInstrInfo::ak_Count; i++) + fprintf(blabla, "%d -> %dx\n", i, ARMJIT::instructionPopularityARM[i]); + fclose(blabla); + Cycles = 0; Halted = 0; diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index c22751e..cbe67fd 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -223,6 +223,73 @@ void Compiler::A_Comp_MovOp() Comp_JumpTo(rd.GetSimpleReg(), S); } +void Compiler::A_Comp_CLZ() +{ + OpArg rd = MapReg(CurInstr.A_Reg(12)); + OpArg rm = MapReg(CurInstr.A_Reg(0)); + + MOV(32, R(RSCRATCH), Imm32(32)); + TEST(32, rm, rm); + FixupBranch skipZero = J_CC(CC_Z); + BSR(32, RSCRATCH, rm); + XOR(32, R(RSCRATCH), Imm8(0x1F)); // 31 - RSCRATCH + SetJumpTarget(skipZero); + MOV(32, rd, R(RSCRATCH)); +} + +void Compiler::Comp_MulOp(bool S, bool add, Gen::OpArg rd, Gen::OpArg rm, Gen::OpArg rs, Gen::OpArg rn) +{ + if (Num == 0) + Comp_AddCycles_CI(S ? 3 : 1); + else + { + XOR(32, R(RSCRATCH), R(RSCRATCH)); + MOV(32, R(RSCRATCH3), rs); + TEST(32, R(RSCRATCH3), R(RSCRATCH3)); + FixupBranch zeroBSR = J_CC(CC_Z); + BSR(32, RSCRATCH2, R(RSCRATCH3)); + NOT(32, R(RSCRATCH3)); + BSR(32, RSCRATCH, R(RSCRATCH3)); + CMP(32, R(RSCRATCH2), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(RSCRATCH2), CC_L); + SHR(32, R(RSCRATCH), Imm8(3)); + SetJumpTarget(zeroBSR); // fortunately that's even right + Comp_AddCycles_CI(RSCRATCH, add ? 
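// The BSR pair in Comp_MulOp above estimates how early the ARM7's multiplier
// can terminate: the cost grows with the number of significant bytes of rs,
// where all-zero and all-one top bytes are free. A plain C++ model of the
// emitted sequence, under that assumption (__builtin_clz is the GCC/Clang
// stand-in for BSR):

#include <algorithm>
#include <cstdint>

static int MulEarlyOutCycles(uint32_t rs)
{
    if (rs == 0) return 0; // the TEST/J_CC path: BSR is skipped, the result stays 0
    int topSet   = 31 - __builtin_clz(rs);      // BSR(rs)
    int topClear = 31 - __builtin_clz(~rs | 1); // BSR(~rs); the |1 sidesteps clz(0)
    return std::min(topSet, topClear) >> 3;     // 0..3 extra internal cycles
}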
2 : 1); + } + + static_assert(EAX == RSCRATCH); + MOV(32, R(RSCRATCH), rm); + if (add) + { + IMUL(32, RSCRATCH, rs); + LEA(32, rd.GetSimpleReg(), MRegSum(RSCRATCH, rn.GetSimpleReg())); + TEST(32, rd, rd); + } + else + { + IMUL(32, RSCRATCH, rs); + MOV(32, rd, R(RSCRATCH)); + TEST(32, R(RSCRATCH), R(RSCRATCH)); + } + + if (S) + Comp_RetriveFlags(false, false, false); +} + +void Compiler::A_Comp_MUL_MLA() +{ + bool S = CurInstr.Instr & (1 << 20); + bool add = CurInstr.Instr & (1 << 21); + OpArg rd = MapReg(CurInstr.A_Reg(16)); + OpArg rm = MapReg(CurInstr.A_Reg(0)); + OpArg rs = MapReg(CurInstr.A_Reg(8)); + OpArg rn; + if (add) + rn = MapReg(CurInstr.A_Reg(12)); + + Comp_MulOp(S, add, rd, rm, rs, rn); +} + void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) { CPSRDirty = true; @@ -455,6 +522,13 @@ void Compiler::T_Comp_ALU_Imm8() } } +void Compiler::T_Comp_MUL() +{ + OpArg rd = MapReg(CurInstr.T_Reg(0)); + OpArg rs = MapReg(CurInstr.T_Reg(3)); + Comp_MulOp(true, false, rd, rd, rs, Imm8(-1)); +} + void Compiler::T_Comp_ALU() { OpArg rd = MapReg(CurInstr.T_Reg(0)); diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index fb2acba..bd01ffb 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -126,17 +126,14 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) { - BitSet16 hiRegsLoaded(RegCache.DirtyRegs & 0xFFFF0000); + BitSet16 hiRegsLoaded(RegCache.DirtyRegs & 0xFF00); bool previouslyDirty = CPSRDirty; SaveCPSR(); if (restoreCPSR) { if (Thumb || CurInstr.Cond() >= 0xE) - { - for (int reg : hiRegsLoaded) - RegCache.UnloadRegister(reg); - } + RegCache.Flush(); else { // the ugly way... diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 6799a90..8a895d1 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -26,10 +26,14 @@ const int RegisterCache::NativeRegsAvailable = #endif ; +int instructionPopularityARM[ARMInstrInfo::ak_Count]; + Compiler::Compiler() { AllocCodeSpace(1024 * 1024 * 16); + memset(instructionPopularityARM, 0, sizeof(instructionPopularityARM)); + for (int i = 0; i < 3; i++) { for (int j = 0; j < 2; j++) @@ -47,7 +51,88 @@ Compiler::Compiler() MemoryFuncsSeq7[i][j][1] = Gen_MemoryRoutineSeq7(i, j, true); } - ResetStart = GetWritableCodePtr(); + { + // RSCRATCH mode + // ABI_PARAM2 reg number + // ABI_PARAM3 value in current mode + // ret - ABI_PARAM3 + ReadBanked = (void*)GetWritableCodePtr(); + CMP(32, R(RSCRATCH), Imm8(0x11)); + FixupBranch fiq = J_CC(CC_E); + SUB(32, R(ABI_PARAM2), Imm8(13 - 8)); + FixupBranch notEverything = J_CC(CC_L); + CMP(32, R(RSCRATCH), Imm8(0x12)); + FixupBranch irq = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x13)); + FixupBranch svc = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x17)); + FixupBranch abt = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x1B)); + FixupBranch und = J_CC(CC_E); + SetJumpTarget(notEverything); + RET(); + + SetJumpTarget(fiq); + MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_FIQ))); + RET(); + SetJumpTarget(irq); + MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_IRQ))); + RET(); + SetJumpTarget(svc); + MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_SVC))); + RET(); + SetJumpTarget(abt); + MOV(32, R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_ABT))); + RET(); + SetJumpTarget(und); + MOV(32, 
R(ABI_PARAM3), MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_UND))); + RET(); + } + { + // RSCRATCH mode + // ABI_PARAM2 reg n + // ABI_PARAM3 value + // carry flag set if the register isn't banked + WriteBanked = (void*)GetWritableCodePtr(); + CMP(32, R(RSCRATCH), Imm8(0x11)); + FixupBranch fiq = J_CC(CC_E); + SUB(32, R(ABI_PARAM2), Imm8(13 - 8)); + FixupBranch notEverything = J_CC(CC_L); + CMP(32, R(RSCRATCH), Imm8(0x12)); + FixupBranch irq = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x13)); + FixupBranch svc = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x17)); + FixupBranch abt = J_CC(CC_E); + CMP(32, R(RSCRATCH), Imm8(0x1B)); + FixupBranch und = J_CC(CC_E); + SetJumpTarget(notEverything); + STC(); + RET(); + + SetJumpTarget(fiq); + MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_FIQ)), R(ABI_PARAM3)); + CLC(); + RET(); + SetJumpTarget(irq); + MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_IRQ)), R(ABI_PARAM3)); + CLC(); + RET(); + SetJumpTarget(svc); + MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_SVC)), R(ABI_PARAM3)); + CLC(); + RET(); + SetJumpTarget(abt); + MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_ABT)), R(ABI_PARAM3)); + CLC(); + RET(); + SetJumpTarget(und); + MOV(32, MComplex(RCPU, ABI_PARAM2, SCALE_4, offsetof(ARM, R_UND)), R(ABI_PARAM3)); + CLC(); + RET(); + } + + ResetStart = (void*)GetWritableCodePtr(); } void Compiler::LoadCPSR() @@ -136,6 +221,9 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs CurInstr = instrs[i]; CompileFunc comp = GetCompFunc(CurInstr.Info.Kind); + + if (!Thumb) + instructionPopularityARM[CurInstr.Info.Kind] += comp == NULL; if (comp == NULL || i == instrsCount - 1) { @@ -287,9 +375,9 @@ CompileFunc Compiler::GetCompFunc(int kind) // CMN A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, // Mul - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + A_Comp_MUL_MLA, A_Comp_MUL_MLA, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, // ARMv5 stuff - NULL, NULL, NULL, NULL, NULL, + A_Comp_CLZ, NULL, NULL, NULL, NULL, // STR A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, //NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -315,7 +403,7 @@ CompileFunc Compiler::GetCompFunc(int kind) // swap NULL, NULL, // LDM/STM - NULL, NULL, + A_Comp_LDM_STM, A_Comp_LDM_STM, // Branch A_Comp_BranchImm, A_Comp_BranchImm, A_Comp_BranchImm, A_Comp_BranchXchangeReg, A_Comp_BranchXchangeReg, // system stuff @@ -333,7 +421,7 @@ CompileFunc Compiler::GetCompFunc(int kind) T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, - T_Comp_ALU, NULL, T_Comp_ALU, T_Comp_ALU, + T_Comp_ALU, T_Comp_MUL, T_Comp_ALU, T_Comp_ALU, // hi reg T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, // pc/sp relative @@ -387,4 +475,14 @@ void Compiler::Comp_AddCycles_CI(u32 i) ConstantCycles += cycles; } +void Compiler::Comp_AddCycles_CI(Gen::X64Reg i, int add) +{ + s32 cycles = Num ? + NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] + : ((R15 & 0x2) ? 
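// ReadBanked/WriteBanked above resolve a register against the current mode's
// bank: FIQ banks r8-r14, the other exception modes bank only r13/r14, and
// user/system use the flat register file. The same lookup as a plain C++
// sketch (BankedRegs is a hypothetical stand-in for the ARM class's
// R_FIQ/R_IRQ/... arrays):

#include <cstdint>

struct BankedRegs
{
    uint32_t fiq[7];                         // r8_fiq .. r14_fiq
    uint32_t irq[2], svc[2], abt[2], und[2]; // r13/r14 of each mode
};

// regno counts from r8, matching the "reg - 8" the JIT passes in ABI_PARAM2;
// nullptr corresponds to the carry-flag "not banked" exit of WriteBanked.
static uint32_t* BankedSlot(BankedRegs& b, uint32_t mode, int regno)
{
    if (mode == 0x11) return &b.fiq[regno]; // FIQ banks everything
    if (regno < 5) return nullptr;          // r8-r12: FIQ only (the CC_L exit)
    int i = regno - 5;                      // 0 = r13, 1 = r14
    switch (mode)
    {
    case 0x12: return &b.irq[i];
    case 0x13: return &b.svc[i];
    case 0x17: return &b.abt[i];
    case 0x1B: return &b.und[i];
    default:   return nullptr;
    }
}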
0 : CurInstr.CodeCycles); + + LEA(32, RSCRATCH, MDisp(i, add + cycles)); + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(RSCRATCH)); +} + } \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 45b488a..89dfe28 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -40,6 +40,7 @@ private: void Comp_AddCycles_C(bool forceNonConstant = false); void Comp_AddCycles_CI(u32 i); + void Comp_AddCycles_CI(Gen::X64Reg i, int add); enum { @@ -55,6 +56,10 @@ private: void A_Comp_MovOp(); void A_Comp_CmpOp(); + void A_Comp_MUL_MLA(); + + void A_Comp_CLZ(); + void A_Comp_MemWB(); void A_Comp_MemHalf(); void A_Comp_LDM_STM(); @@ -62,11 +67,13 @@ private: void A_Comp_BranchImm(); void A_Comp_BranchXchangeReg(); + void T_Comp_ShiftImm(); void T_Comp_AddSub_(); void T_Comp_ALU_Imm8(); void T_Comp_ALU(); void T_Comp_ALU_HiReg(); + void T_Comp_MUL(); void T_Comp_RelAddr(); void T_Comp_AddSP(); @@ -88,7 +95,7 @@ private: void T_Comp_BL_Merged(FetchedInstr prefix); void Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size); - s32 Comp_MemAccessBlock(Gen::OpArg rb, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); + s32 Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&), Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); @@ -96,6 +103,8 @@ private: Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); void Comp_CmpOp(int op, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed); + void Comp_MulOp(bool S, bool add, Gen::OpArg rd, Gen::OpArg rm, Gen::OpArg rs, Gen::OpArg rn); + void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed); void* Gen_MemoryRoutine9(bool store, int size); @@ -133,6 +142,9 @@ private: void* MemoryFuncsSeq9[2][2]; void* MemoryFuncsSeq7[2][2][2]; + void* ReadBanked; + void* WriteBanked; + bool CPSRDirty = false; FetchedInstr CurInstr; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 69b324c..8fbcafd 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -1,7 +1,5 @@ #include "ARMJIT_Compiler.h" -#include "../GPU.h" -#include "../Wifi.h" using namespace Gen; @@ -362,7 +360,7 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) CMP(32, R(ABI_PARAM3), Imm8(1)); FixupBranch skipSequential = J_CC(CC_E); SUB(32, R(ABI_PARAM3), Imm8(1)); - IMUL(32, R(ABI_PARAM3)); + IMUL(32, RSCRATCH, R(ABI_PARAM3)); ADD(32, R(ABI_PARAM2), R(RSCRATCH)); SetJumpTarget(skipSequential); @@ -413,10 +411,11 @@ void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM) POP(ABI_PARAM4); POP(ABI_PARAM3); + // TODO: optimise this CMP(32, R(ABI_PARAM3), Imm8(1)); FixupBranch skipSequential = J_CC(CC_E); SUB(32, R(ABI_PARAM3), Imm8(1)); - IMUL(32, R(ABI_PARAM3)); + IMUL(32, RSCRATCH, R(ABI_PARAM3)); ADD(32, R(ABI_PARAM2), R(RSCRATCH)); SetJumpTarget(skipSequential); @@ -458,25 +457,35 @@ void Compiler::Comp_MemAccess(OpArg rd, bool signExtend, bool store, int size) } } -s32 Compiler::Comp_MemAccessBlock(OpArg rb, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode) +void printStuff2(u32 a, u32 b) { + printf("b %x %x\n", a, b); +} + +s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode) +{ + FILE* f; + const u8* start = 
GetCodePtr(); + int regsCount = regs.Count(); if (decrement) { - MOV_sum(32, ABI_PARAM1, rb, Imm32(-regsCount * 4)); + MOV_sum(32, ABI_PARAM1, MapReg(rn), Imm32(-regsCount * 4)); preinc ^= true; } else - MOV(32, R(ABI_PARAM1), rb); + MOV(32, R(ABI_PARAM1), MapReg(rn)); + + s32 offset = (regsCount * 4) * (decrement ? -1 : 1); - MOV(32, R(ABI_PARAM3), Imm32(regsCount)); - u32 cycles = Num + u32 cycles = Num ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); MOV(32, R(ABI_PARAM4), Imm32(cycles)); if (!store) { + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); SUB(32, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); MOV(64, R(ABI_PARAM2), R(RSP)); @@ -484,20 +493,29 @@ s32 Compiler::Comp_MemAccessBlock(OpArg rb, BitSet16 regs, bool store, bool prei ? MemoryFuncsSeq9[0][preinc] : MemoryFuncsSeq7[0][preinc][CodeRegion == 0x02]); + bool firstUserMode = true; for (int reg = 15; reg >= 0; reg--) { if (regs[reg]) { - /*if (usermode && reg >= 8 && reg < 15) + if (usermode && reg >= 8 && reg < 15) { - MOV(32, R(RSCRATCH2), R(RCPSR)); - AND(32, R(RSCRATCH2), Imm8(0x1F)); - // (RSCRATCH2 - 0x11) * 8 + squeezePointer(userModeOffsets) + (reg - 8), algebra is great! - MOVZX(32, 8, RSCRATCH2, MScaled(RSCRATCH2, SCALE_8, squeezePointer(userModeOffsets) - 0x10 * 8 + (reg - 8))); - POP(RSCRATCH); - MOV(32, MRegSum(RCPU, RSCRATCH2), R(RSCRATCH)); + if (firstUserMode) + { + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + firstUserMode = false; + } + MOV(32, R(ABI_PARAM2), Imm32(reg - 8)); + POP(ABI_PARAM3); + CALL(WriteBanked); + FixupBranch sucessfulWritten = J_CC(CC_NC); + if (RegCache.Mapping[reg] != INVALID_REG && RegCache.DirtyRegs & (1 << reg)) + MOV(32, R(RegCache.Mapping[reg]), R(ABI_PARAM3)); + SaveReg(reg, ABI_PARAM3); + SetJumpTarget(sucessfulWritten); } - else */if (RegCache.Mapping[reg] == INVALID_REG) + else if (RegCache.Mapping[reg] == INVALID_REG) { assert(reg != 15); @@ -516,32 +534,48 @@ s32 Compiler::Comp_MemAccessBlock(OpArg rb, BitSet16 regs, bool store, bool prei if (regs[15]) { if (Num == 1) - OR(32, MapReg(15), Imm8(1)); + { + if (Thumb) + OR(32, MapReg(15), Imm8(1)); + else + AND(32, MapReg(15), Imm8(0xFE)); + } Comp_JumpTo(MapReg(15).GetSimpleReg(), usermode); } } else { + bool firstUserMode = true; for (int reg : regs) { - /*if (usermode && reg >= 8 && reg < 15) + if (usermode && reg >= 8 && reg < 15) { - MOV(32, R(RSCRATCH), R(RCPSR)); - AND(32, R(RSCRATCH), Imm8(0x1F)); - // (RSCRATCH2 - 0x11) * 8 + squeezePointer(userModeOffsets) + (reg - 8), algebra is great! - MOVZX(32, 8, RSCRATCH, MScaled(RSCRATCH, SCALE_8, squeezePointer(userModeOffsets) - 0x10 * 8 + (reg - 8))); - MOV(32, R(RSCRATCH), MRegSum(RCPU, RSCRATCH)); - PUSH(RSCRATCH); + if (firstUserMode) + { + MOV(32, R(RSCRATCH), R(RCPSR)); + AND(32, R(RSCRATCH), Imm8(0x1F)); + firstUserMode = false; + } + if (RegCache.Mapping[reg] == INVALID_REG) + LoadReg(reg, ABI_PARAM3); + else + MOV(32, R(ABI_PARAM3), R(RegCache.Mapping[reg])); + MOV(32, R(ABI_PARAM2), Imm32(reg - 8)); + CALL(ReadBanked); + PUSH(ABI_PARAM3); } - else */if (RegCache.Mapping[reg] == INVALID_REG) + else if (RegCache.Mapping[reg] == INVALID_REG) { LoadReg(reg, RSCRATCH); PUSH(RSCRATCH); } else + { PUSH(MapReg(reg).GetSimpleReg()); + } } MOV(64, R(ABI_PARAM2), R(RSP)); + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); CALL(Num == 0 ? 
MemoryFuncsSeq9[1][preinc] @@ -550,7 +584,14 @@ s32 Compiler::Comp_MemAccessBlock(OpArg rb, BitSet16 regs, bool store, bool prei ADD(32, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); } - return (regsCount * 4) * (decrement ? -1 : 1); + if (usermode && !store) + { + f= fopen("ldm", "a"); + fwrite(start, GetCodePtr() - start, 1, f); + fclose(f); + } + + return offset; } OpArg Compiler::A_Comp_GetMemWBOffset() @@ -697,16 +738,20 @@ void Compiler::A_Comp_LDM_STM() { BitSet16 regs(CurInstr.Instr & 0xFFFF); - bool load = (CurInstr.Instr >> 20) & 1; - bool pre = (CurInstr.Instr >> 24) & 1; - bool add = (CurInstr.Instr >> 23) & 1; - bool writeback = (CurInstr.Instr >> 21) & 1; - bool usermode = (CurInstr.Instr >> 22) & 1; + bool load = CurInstr.Instr & (1 << 20); + bool pre = CurInstr.Instr & (1 << 24); + bool add = CurInstr.Instr & (1 << 23); + bool writeback = CurInstr.Instr & (1 << 21); + bool usermode = CurInstr.Instr & (1 << 22); OpArg rn = MapReg(CurInstr.A_Reg(16)); - s32 offset = Comp_MemAccessBlock(rn, regs, !load, pre, !add, false); + s32 offset = Comp_MemAccessBlock(CurInstr.A_Reg(16), regs, !load, pre, !add, usermode); + if (load && writeback && regs[CurInstr.A_Reg(16)]) + writeback = Num == 0 + ? (!(regs & ~BitSet16(1 << CurInstr.A_Reg(16)))) || (regs & ~BitSet16((2 << CurInstr.A_Reg(16)) - 1)) + : false; if (writeback) ADD(32, rn, offset >= INT8_MIN && offset < INT8_MAX ? Imm8(offset) : Imm32(offset)); } @@ -789,8 +834,7 @@ void Compiler::T_Comp_PUSH_POP() } OpArg sp = MapReg(13); - - s32 offset = Comp_MemAccessBlock(sp, regs, !load, !load, !load, false); + s32 offset = Comp_MemAccessBlock(13, regs, !load, !load, !load, false); ADD(32, sp, Imm8(offset)); // offset will be always be in range since PUSH accesses 9 regs max } @@ -801,7 +845,7 @@ void Compiler::T_Comp_LDMIA_STMIA() OpArg rb = MapReg(CurInstr.T_Reg(8)); bool load = CurInstr.Instr & (1 << 11); - s32 offset = Comp_MemAccessBlock(rb, regs, !load, false, false, false); + s32 offset = Comp_MemAccessBlock(CurInstr.T_Reg(8), regs, !load, false, false, false); if (!load || !regs[CurInstr.T_Reg(8)]) ADD(32, rb, Imm8(offset)); -- cgit v1.2.3 From 9b3c14b58abd987d9eb992b04f1f10ee8a6c91f7 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 12 Jul 2019 16:42:42 +0200 Subject: jit: SMULL and SMLAL --- src/ARMJIT_x64/ARMJIT_ALU.cpp | 56 ++++++++++++++++++++++++++++++++++++-- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 2 +- src/ARMJIT_x64/ARMJIT_Compiler.h | 1 + 3 files changed, 55 insertions(+), 4 deletions(-) (limited to 'src/ARMJIT_x64/ARMJIT_ALU.cpp') diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index cbe67fd..4afafed 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -290,6 +290,59 @@ void Compiler::A_Comp_MUL_MLA() Comp_MulOp(S, add, rd, rm, rs, rn); } +void Compiler::A_Comp_SMULL_SMLAL() +{ + bool S = CurInstr.Instr & (1 << 20); + bool add = CurInstr.Instr & (1 << 21); + OpArg rd = MapReg(CurInstr.A_Reg(16)); + OpArg rm = MapReg(CurInstr.A_Reg(0)); + OpArg rs = MapReg(CurInstr.A_Reg(8)); + OpArg rn = MapReg(CurInstr.A_Reg(12)); + + if (Num == 0) + Comp_AddCycles_CI(S ? 
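// The writeback expression in A_Comp_LDM_STM above encodes the ARM9 behaviour
// for an LDM whose base register is also in the list: writeback still happens
// if the base is the only listed register, or if it is not the last one
// loaded (the ARM7 simply skips writeback in that case). The same predicate
// in plain C++:

#include <cstdint>

static bool LDMBaseWritebackARM9(uint16_t regs, int rn)
{
    bool baseIsOnlyReg = (regs & ~(1u << rn)) == 0;
    bool baseNotLast   = (regs & ~((2u << rn) - 1)) != 0; // a higher reg follows rn
    return baseIsOnlyReg || baseNotLast;
}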
3 : 1); + else + { + XOR(32, R(RSCRATCH), R(RSCRATCH)); + MOV(32, R(RSCRATCH3), rs); + TEST(32, R(RSCRATCH3), R(RSCRATCH3)); + FixupBranch zeroBSR = J_CC(CC_Z); + BSR(32, RSCRATCH2, R(RSCRATCH3)); + NOT(32, R(RSCRATCH3)); + BSR(32, RSCRATCH, R(RSCRATCH3)); + CMP(32, R(RSCRATCH2), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(RSCRATCH2), CC_L); + SHR(32, R(RSCRATCH), Imm8(3)); + SetJumpTarget(zeroBSR); // fortunately that's even right + Comp_AddCycles_CI(RSCRATCH, 2); + } + + MOVSX(64, 32, RSCRATCH2, rm); + MOVSX(64, 32, RSCRATCH3, rs); + if (add) + { + MOV(32, R(RSCRATCH), rd); + SHL(64, R(RSCRATCH), Imm8(32)); + OR(64, R(RSCRATCH), rn); + + IMUL(64, RSCRATCH2, R(RSCRATCH3)); + ADD(64, R(RSCRATCH2), R(RSCRATCH)); + } + else + { + IMUL(64, RSCRATCH2, R(RSCRATCH3)); + if (S) + TEST(64, R(RSCRATCH2), R(RSCRATCH2)); + } + + if (S) + Comp_RetriveFlags(false, false, false); + + MOV(32, rn, R(RSCRATCH2)); + SHR(64, R(RSCRATCH2), Imm8(32)); + MOV(32, rd, R(RSCRATCH2)); +} + void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) { CPSRDirty = true; @@ -302,9 +355,6 @@ void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) LEA(32, RSCRATCH2, MComplex(RSCRATCH, RSCRATCH3, SCALE_2, 0)); } - if (carryUsed == 983298) - printf("etwas ist faul im lande daenemark %x\n", CurInstr.Instr); - SETcc(CC_S, R(RSCRATCH)); SETcc(CC_Z, R(RSCRATCH3)); LEA(32, RSCRATCH, MComplex(RSCRATCH3, RSCRATCH, SCALE_2, 0)); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index 8a895d1..b6dd529 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -375,7 +375,7 @@ CompileFunc Compiler::GetCompFunc(int kind) // CMN A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, // Mul - A_Comp_MUL_MLA, A_Comp_MUL_MLA, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + A_Comp_MUL_MLA, A_Comp_MUL_MLA, NULL, NULL, NULL, A_Comp_SMULL_SMLAL, NULL, NULL, NULL, NULL, NULL, // ARMv5 stuff A_Comp_CLZ, NULL, NULL, NULL, NULL, // STR diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 89dfe28..f9bc227 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -57,6 +57,7 @@ private: void A_Comp_CmpOp(); void A_Comp_MUL_MLA(); + void A_Comp_SMULL_SMLAL(); void A_Comp_CLZ(); -- cgit v1.2.3 From dcf6e1cad2b38dc4fe0dcbdb789f92e01f802a4a Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 14 Jul 2019 02:37:32 +0200 Subject: jit: fix linux --- src/ARMJIT_x64/ARMJIT_ALU.cpp | 48 +++--- src/ARMJIT_x64/ARMJIT_Branch.cpp | 2 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 288 +++++++++++++++++++----------------- src/ARMJIT_x64/ARMJIT_Compiler.h | 8 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 15 +- src/dolphin/Log.h | 13 +- src/dolphin/MemoryUtil.cpp | 13 +- 7 files changed, 193 insertions(+), 194 deletions(-) (limited to 'src/ARMJIT_x64/ARMJIT_ALU.cpp') diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index 4afafed..013f54c 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -154,13 +154,13 @@ void Compiler::A_Comp_Arith() switch (op) { case 0x0: // AND - Comp_ArithTriOp(AND, rd, rn, op2, carryUsed, opSymmetric|sFlag); + Comp_ArithTriOp(&Compiler::AND, rd, rn, op2, carryUsed, opSymmetric|sFlag); break; case 0x1: // EOR - Comp_ArithTriOp(XOR, rd, rn, op2, carryUsed, opSymmetric|sFlag); + Comp_ArithTriOp(&Compiler::XOR, rd, rn, op2, carryUsed, opSymmetric|sFlag); break; case 
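// Annotation on the BSR sequence in A_Comp_SMULL_SMLAL above: it estimates
// the ARM7's early-out multiplier timing without branches. min(bsr(rs),
// bsr(~rs)) is the index of the highest bit that differs from the sign bit,
// i.e. 31 minus the number of leading sign bits, and >>3 turns that into the
// count of significant multiplier bytes (0-3 extra internal cycles). A rough
// C equivalent, assuming GCC/Clang builtins and rs neither 0 nor ~0 (the
// zeroBSR path covers rs == 0, where the initial XOR already left 0 behind):
//
//     int topSet   = 31 - __builtin_clz(rs);    // BSR rs
//     int topClear = 31 - __builtin_clz(~rs);   // BSR ~rs
//     u32 cycles   = std::min(topSet, topClear) >> 3;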
0x2: // SUB - Comp_ArithTriOp(SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); + Comp_ArithTriOp(&Compiler::SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); break; case 0x3: // RSB if (op2.IsZero()) @@ -172,25 +172,25 @@ void Compiler::A_Comp_Arith() Comp_RetriveFlags(true, true, false); } else - Comp_ArithTriOpReverse(SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); + Comp_ArithTriOpReverse(&Compiler::SUB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry); break; case 0x4: // ADD - Comp_ArithTriOp(ADD, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV); + Comp_ArithTriOp(&Compiler::ADD, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV); break; case 0x5: // ADC - Comp_ArithTriOp(ADC, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry); + Comp_ArithTriOp(&Compiler::ADC, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry); break; case 0x6: // SBC - Comp_ArithTriOp(SBB, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry|opInvertCarry); + Comp_ArithTriOp(&Compiler::SBB, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry|opInvertCarry); break; case 0x7: // RSC - Comp_ArithTriOpReverse(SBB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry|opSyncCarry); + Comp_ArithTriOpReverse(&Compiler::SBB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry|opSyncCarry); break; case 0xC: // ORR - Comp_ArithTriOp(OR, rd, rn, op2, carryUsed, opSymmetric|sFlag); + Comp_ArithTriOp(&Compiler::OR, rd, rn, op2, carryUsed, opSymmetric|sFlag); break; case 0xE: // BIC - Comp_ArithTriOp(AND, rd, rn, op2, carryUsed, sFlag|opSymmetric|opInvertOp2); + Comp_ArithTriOp(&Compiler::AND, rd, rn, op2, carryUsed, sFlag|opSymmetric|opInvertOp2); break; default: assert("unimplemented"); @@ -392,11 +392,11 @@ OpArg Compiler::Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, b { void (Compiler::*shiftOp)(int, const OpArg&, const OpArg&) = NULL; if (op == 0) - shiftOp = SHL; + shiftOp = &Compiler::SHL; else if (op == 1) - shiftOp = SHR; + shiftOp = &Compiler::SHR; else if (op == 2) - shiftOp = SAR; + shiftOp = &Compiler::SAR; CMP(32, R(ECX), Imm8(32)); FixupBranch lt32 = J_CC(CC_L); @@ -539,9 +539,9 @@ void Compiler::T_Comp_AddSub_() Comp_AddCycles_C(); if (op & 1) - Comp_ArithTriOp(SUB, rd, rs, rn, false, opSetsFlags|opInvertCarry|opRetriveCV); + Comp_ArithTriOp(&Compiler::SUB, rd, rs, rn, false, opSetsFlags|opInvertCarry|opRetriveCV); else - Comp_ArithTriOp(ADD, rd, rs, rn, false, opSetsFlags|opSymmetric|opRetriveCV); + Comp_ArithTriOp(&Compiler::ADD, rd, rs, rn, false, opSetsFlags|opSymmetric|opRetriveCV); } void Compiler::T_Comp_ALU_Imm8() @@ -564,10 +564,10 @@ void Compiler::T_Comp_ALU_Imm8() Comp_CmpOp(2, rd, imm, false); return; case 0x2: - Comp_ArithTriOp(ADD, rd, rd, imm, false, opSetsFlags|opSymmetric|opRetriveCV); + Comp_ArithTriOp(&Compiler::ADD, rd, rd, imm, false, opSetsFlags|opSymmetric|opRetriveCV); return; case 0x3: - Comp_ArithTriOp(SUB, rd, rd, imm, false, opSetsFlags|opInvertCarry|opRetriveCV); + Comp_ArithTriOp(&Compiler::SUB, rd, rd, imm, false, opSetsFlags|opInvertCarry|opRetriveCV); return; } } @@ -594,10 +594,10 @@ void Compiler::T_Comp_ALU() switch (op) { case 0x0: // AND - Comp_ArithTriOp(AND, rd, rd, rs, false, opSetsFlags|opSymmetric); + Comp_ArithTriOp(&Compiler::AND, rd, rd, rs, false, opSetsFlags|opSymmetric); return; case 0x1: // EOR - Comp_ArithTriOp(XOR, rd, rd, rs, false, opSetsFlags|opSymmetric); + Comp_ArithTriOp(&Compiler::XOR, rd, rd, rs, false, 
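// The pattern this commit converts to, in isolation: standard C++ requires
// the explicit &Class::member syntax when forming a pointer to a member
// function. GCC/Clang enforce that, while MSVC accepted the bare name, which
// is why the build only broke on Linux. Minimal sketch, with the signature
// as declared for Comp_ArithTriOp:
//
//     void (Compiler::*op)(int, const Gen::OpArg&, const Gen::OpArg&)
//         = &Compiler::ADD;
//     (this->*op)(32, rd, op2);    // invoke through the member pointer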
opSetsFlags|opSymmetric); return; case 0x2: case 0x3: @@ -613,10 +613,10 @@ void Compiler::T_Comp_ALU() } return; case 0x5: // ADC - Comp_ArithTriOp(ADC, rd, rd, rs, false, opSetsFlags|opSymmetric|opSyncCarry|opRetriveCV); + Comp_ArithTriOp(&Compiler::ADC, rd, rd, rs, false, opSetsFlags|opSymmetric|opSyncCarry|opRetriveCV); return; case 0x6: // SBC - Comp_ArithTriOp(SBB, rd, rd, rs, false, opSetsFlags|opSyncCarry|opInvertCarry|opRetriveCV); + Comp_ArithTriOp(&Compiler::SBB, rd, rd, rs, false, opSetsFlags|opSyncCarry|opInvertCarry|opRetriveCV); return; case 0x8: // TST Comp_CmpOp(0, rd, rs, false); @@ -634,10 +634,10 @@ void Compiler::T_Comp_ALU() Comp_CmpOp(3, rd, rs, false); return; case 0xC: // ORR - Comp_ArithTriOp(OR, rd, rd, rs, false, opSetsFlags|opSymmetric); + Comp_ArithTriOp(&Compiler::OR, rd, rd, rs, false, opSetsFlags|opSymmetric); return; case 0xE: // BIC - Comp_ArithTriOp(AND, rd, rd, rs, false, opSetsFlags|opSymmetric|opInvertOp2); + Comp_ArithTriOp(&Compiler::AND, rd, rd, rs, false, opSetsFlags|opSymmetric|opInvertOp2); return; case 0xF: // MVN if (rd != rs) @@ -663,7 +663,7 @@ void Compiler::T_Comp_ALU_HiReg() switch (op) { case 0x0: // ADD - Comp_ArithTriOp(ADD, rdMapped, rdMapped, rs, false, opSymmetric|opRetriveCV); + Comp_ArithTriOp(&Compiler::ADD, rdMapped, rdMapped, rs, false, opSymmetric|opRetriveCV); break; case 0x1: // CMP Comp_CmpOp(2, rdMapped, rs, false); diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index bd01ffb..05c8ec6 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -118,7 +118,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) if (setupRegion) { - MOV(32, R(ABI_PARAM1), R(RCPU)); + MOV(64, R(ABI_PARAM1), R(RCPU)); MOV(32, R(ABI_PARAM2), Imm32(newPC)); CALL((void*)&ARMv5::SetupCodeMem); } diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index e043f58..2b7ccd2 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -4,6 +4,12 @@ #include +#ifdef _WIN32 +#else +#include +#include +#endif + using namespace Gen; namespace ARMJIT @@ -28,9 +34,34 @@ const int RegisterCache::NativeRegsAvailable = int instructionPopularityARM[ARMInstrInfo::ak_Count]; +/* + We'll repurpose this .bss memory + + */ +u8 CodeMemory[1024 * 1024 * 32]; + Compiler::Compiler() { - AllocCodeSpace(1024 * 1024 * 16); +#ifdef _WIN32 +#else + u64 pagesize = sysconf(_SC_PAGE_SIZE); +#endif + + u8* pageAligned = (u8*)(((u64)CodeMemory & ~(pagesize - 1)) + pagesize); + u64 alignedSize = (((u64)CodeMemory + sizeof(CodeMemory)) & ~(pagesize - 1)) - (u64)pageAligned; + +#ifdef _WIN32 +#else + mprotect(pageAligned, alignedSize, PROT_EXEC | PROT_READ | PROT_WRITE); +#endif + + region = pageAligned; + region_size = alignedSize; + total_region_size = region_size; + + ClearCodeSpace(); + + SetCodePtr(pageAligned); memset(instructionPopularityARM, 0, sizeof(instructionPopularityARM)); @@ -187,6 +218,124 @@ Gen::FixupBranch Compiler::CheckCondition(u32 cond) } } +#define F(x) &Compiler::x +const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] = +{ + // AND + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // EOR + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), 
F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // SUB + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // RSB + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // ADD + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // ADC + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // SBC + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // RSC + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // ORR + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // MOV + F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), + F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), + // BIC + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), + // MVN + F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), + F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), + // TST + F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), 
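// Shape of this table: every data-processing opcode occupies 18 consecutive
// slots, the 9 operand encodings (imm, then reg shifted by LSL/LSR/ASR/ROR
// by imm or by reg) without S followed by the same 9 with S; the compare
// opcodes (TST/TEQ/CMP/CMN) exist only in S form, hence 9 slots each. The
// order mirrors the ak_* enumeration in ARMInstrInfo.h.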
F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), + // TEQ + F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), + // CMP + F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), + // CMN + F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), + // Mul + F(A_Comp_MUL_MLA), F(A_Comp_MUL_MLA), NULL, NULL, NULL, F(A_Comp_SMULL_SMLAL), NULL, NULL, NULL, NULL, NULL, + // ARMv5 stuff + F(A_Comp_CLZ), NULL, NULL, NULL, NULL, + // STR + F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), + // STRB + F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), + // LDR + F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), + // LDRB + F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), + // STRH + F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), + // LDRD, STRD never used by anything so they stay interpreted (by anything I mean the 5 games I checked) + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + // LDRH + F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), + // LDRSB + F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), + // LDRSH + F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), + // swap + NULL, NULL, + // LDM/STM + F(A_Comp_LDM_STM), F(A_Comp_LDM_STM), + // Branch + F(A_Comp_BranchImm), F(A_Comp_BranchImm), F(A_Comp_BranchImm), F(A_Comp_BranchXchangeReg), F(A_Comp_BranchXchangeReg), + // system stuff + NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { + // Shift imm + F(T_Comp_ShiftImm), F(T_Comp_ShiftImm), F(T_Comp_ShiftImm), + // Three operand ADD/SUB + F(T_Comp_AddSub_), F(T_Comp_AddSub_), F(T_Comp_AddSub_), F(T_Comp_AddSub_), + // 8 bit imm + F(T_Comp_ALU_Imm8), F(T_Comp_ALU_Imm8), F(T_Comp_ALU_Imm8), F(T_Comp_ALU_Imm8), + // general ALU + F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), + F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), + F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), + F(T_Comp_ALU), F(T_Comp_MUL), F(T_Comp_ALU), F(T_Comp_ALU), + // hi reg + F(T_Comp_ALU_HiReg), F(T_Comp_ALU_HiReg), F(T_Comp_ALU_HiReg), + // pc/sp relative + F(T_Comp_RelAddr), F(T_Comp_RelAddr), F(T_Comp_AddSP), + // LDR pcrel + F(T_Comp_LoadPCRel), + // LDR/STR reg offset + F(T_Comp_MemReg), F(T_Comp_MemReg), F(T_Comp_MemReg), F(T_Comp_MemReg), + // LDR/STR sign extended, half + F(T_Comp_MemRegHalf), F(T_Comp_MemRegHalf), F(T_Comp_MemRegHalf), F(T_Comp_MemRegHalf), + // LDR/STR imm offset + F(T_Comp_MemImm), F(T_Comp_MemImm), F(T_Comp_MemImm), F(T_Comp_MemImm), + // LDR/STR half imm offset + F(T_Comp_MemImmHalf), F(T_Comp_MemImmHalf), + // LDR/STR sp rel + F(T_Comp_MemSPRel), F(T_Comp_MemSPRel), + // PUSH/POP + F(T_Comp_PUSH_POP), F(T_Comp_PUSH_POP), + // 
LDMIA, STMIA + F(T_Comp_LDMIA_STMIA), F(T_Comp_LDMIA_STMIA), + // Branch + F(T_Comp_BCOND), F(T_Comp_BranchXchangeReg), F(T_Comp_BranchXchangeReg), F(T_Comp_B), F(T_Comp_BL_LONG_1), F(T_Comp_BL_LONG_2), + // Unk, SVC + NULL, NULL +}; +#undef F + CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) { if (IsAlmostFull()) @@ -206,7 +355,7 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs bool mergedThumbBL = false; - ABI_PushRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS}, 8, 16); + ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~RSP), 8); MOV(64, R(RCPU), ImmPtr(cpu)); @@ -220,8 +369,10 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs R15 += Thumb ? 2 : 4; CurInstr = instrs[i]; - CompileFunc comp = GetCompFunc(CurInstr.Info.Kind); - + CompileFunc comp = Thumb + ? T_Comp[CurInstr.Info.Kind] + : A_Comp[CurInstr.Info.Kind]; + if (!Thumb) instructionPopularityARM[CurInstr.Info.Kind] += comp == NULL; @@ -318,139 +469,12 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs MOV(32, R(RAX), Imm32(ConstantCycles)); - ABI_PopRegistersAndAdjustStack({ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS}, 8, 16); + ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~RSP), 8); RET(); return res; } -CompileFunc Compiler::GetCompFunc(int kind) -{ - // this might look like waste of space, so many repeatitions, but it's invaluable for debugging. - // see ARMInstrInfo.h for the order - CompileFunc const A_Comp[ARMInstrInfo::ak_Count] = - { - // AND - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // EOR - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // SUB - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // RSB - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // ADD - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // ADC - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // SBC - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // RSC - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, 
A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // ORR - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // MOV - A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, - A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, - // BIC - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, A_Comp_Arith, - // MVN - A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, - A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, A_Comp_MovOp, - // TST - A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, - // TEQ - A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, - // CMP - A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, - // CMN - A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, A_Comp_CmpOp, - // Mul - A_Comp_MUL_MLA, A_Comp_MUL_MLA, NULL, NULL, NULL, A_Comp_SMULL_SMLAL, NULL, NULL, NULL, NULL, NULL, - // ARMv5 stuff - A_Comp_CLZ, NULL, NULL, NULL, NULL, - // STR - A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, - //NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - // STRB - //NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, - // LDR - //NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, - // LDRB - //NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, A_Comp_MemWB, - // STRH - A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, - // LDRD, STRD never used by anything so they stay interpreted (by anything I mean the 5 games I checked) - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - // LDRH - A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, - // LDRSB - A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, - // LDRSH - A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, A_Comp_MemHalf, - // swap - NULL, NULL, - // LDM/STM - A_Comp_LDM_STM, A_Comp_LDM_STM, - // Branch - A_Comp_BranchImm, A_Comp_BranchImm, A_Comp_BranchImm, A_Comp_BranchXchangeReg, A_Comp_BranchXchangeReg, - // system stuff - NULL, NULL, NULL, NULL, NULL, 
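// Net effect of this GetCompFunc removal: the dispatch tables stop being
// function-local arrays (hundreds of member-pointer initializers that, at
// least in unoptimized builds, were re-materialized on every lookup) and
// live on as the namespace-scope const A_Comp/T_Comp tables added earlier in
// this patch, indexed directly from CompileBlock.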
NULL, NULL, - }; - - CompileFunc const T_Comp[ARMInstrInfo::tk_Count] = { - // Shift imm - T_Comp_ShiftImm, T_Comp_ShiftImm, T_Comp_ShiftImm, - // Three operand ADD/SUB - T_Comp_AddSub_, T_Comp_AddSub_, T_Comp_AddSub_, T_Comp_AddSub_, - // 8 bit imm - T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, T_Comp_ALU_Imm8, - // general ALU - T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, - T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, - T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, T_Comp_ALU, - T_Comp_ALU, T_Comp_MUL, T_Comp_ALU, T_Comp_ALU, - // hi reg - T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, T_Comp_ALU_HiReg, - // pc/sp relative - T_Comp_RelAddr, T_Comp_RelAddr, T_Comp_AddSP, - // LDR pcrel - T_Comp_LoadPCRel, - // LDR/STR reg offset - T_Comp_MemReg, T_Comp_MemReg, T_Comp_MemReg, T_Comp_MemReg, - // LDR/STR sign extended, half - T_Comp_MemRegHalf, T_Comp_MemRegHalf, T_Comp_MemRegHalf, T_Comp_MemRegHalf, - // LDR/STR imm offset - T_Comp_MemImm, T_Comp_MemImm, T_Comp_MemImm, T_Comp_MemImm, - // LDR/STR half imm offset - T_Comp_MemImmHalf, T_Comp_MemImmHalf, - // LDR/STR sp rel - T_Comp_MemSPRel, T_Comp_MemSPRel, - // PUSH/POP - T_Comp_PUSH_POP, T_Comp_PUSH_POP, - // LDMIA, STMIA - T_Comp_LDMIA_STMIA, T_Comp_LDMIA_STMIA, - // Branch - T_Comp_BCOND, T_Comp_BranchXchangeReg, T_Comp_BranchXchangeReg, T_Comp_B, T_Comp_BL_LONG_1, T_Comp_BL_LONG_2, - // Unk, SVC - NULL, NULL - }; - - return Thumb ? T_Comp[kind] : A_Comp[kind]; -} - void Compiler::Comp_AddCycles_C(bool forceNonConstant) { s32 cycles = Num ? diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index f9bc227..e04f96a 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -6,8 +6,6 @@ #include "../ARMJIT.h" #include "../ARMJIT_RegisterCache.h" -#include - namespace ARMJIT { @@ -18,9 +16,6 @@ const Gen::X64Reg RSCRATCH = Gen::EAX; const Gen::X64Reg RSCRATCH2 = Gen::EDX; const Gen::X64Reg RSCRATCH3 = Gen::ECX; -class Compiler; - -typedef void (Compiler::*CompileFunc)(); class Compiler : public Gen::X64CodeBlock { @@ -32,8 +27,7 @@ public: void LoadReg(int reg, Gen::X64Reg nativeReg); void SaveReg(int reg, Gen::X64Reg nativeReg); -private: - CompileFunc GetCompFunc(int kind); + typedef void (Compiler::*CompileFunc)(); void Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR = false); void Comp_JumpTo(u32 addr, bool forceNonConstantCycles = false); diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 8fbcafd..15a40f8 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -464,9 +464,6 @@ void printStuff2(u32 a, u32 b) s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode) { - FILE* f; - const u8* start = GetCodePtr(); - int regsCount = regs.Count(); if (decrement) @@ -482,11 +479,12 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc u32 cycles = Num ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); + MOV(32, R(ABI_PARAM4), Imm32(cycles)); if (!store) { MOV(32, R(ABI_PARAM3), Imm32(regsCount)); - SUB(32, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); + SUB(64, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); MOV(64, R(ABI_PARAM2), R(RSP)); CALL(Num == 0 @@ -581,14 +579,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc ? 
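// Annotation on the SUB above / ADD below on RSP: with a 32-bit operand size
// the emitter produces `sub esp, imm`, and every 32-bit register write on
// x86-64 zero-extends into the full 64-bit register, so the upper half of
// the stack pointer is destroyed whenever the stack is mapped above 4 GiB,
// as it is on Linux here. Hence the switch to 64-bit arithmetic:
//
//     SUB(64, R(RSP), Imm8(regsCount * 8));   // preserves all 64 bits of RSP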
MemoryFuncsSeq9[1][preinc] : MemoryFuncsSeq7[1][preinc][CodeRegion == 0x02]); - ADD(32, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); - } - - if (usermode && !store) - { - f= fopen("ldm", "a"); - fwrite(start, GetCodePtr() - start, 1, f); - fclose(f); + ADD(64, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8)); } return offset; diff --git a/src/dolphin/Log.h b/src/dolphin/Log.h index 21e69a5..a7f4b6a 100644 --- a/src/dolphin/Log.h +++ b/src/dolphin/Log.h @@ -4,12 +4,13 @@ #include -#define PanicAlert(msg) \ - do \ - { \ - printf("%s\n", msg); \ - Crash(); \ - } while (false) +#define PanicAlert(fmt, ...) \ + do \ + { \ + printf(fmt "\n", ## __VA_ARGS__); \ + abort(); \ + } while (false) + #define DYNA_REC 0 diff --git a/src/dolphin/MemoryUtil.cpp b/src/dolphin/MemoryUtil.cpp index 01cb897..7273a8a 100644 --- a/src/dolphin/MemoryUtil.cpp +++ b/src/dolphin/MemoryUtil.cpp @@ -6,15 +6,9 @@ #include #include -#define PanicAlert(fmt, ...) \ - do \ - { \ - printf(fmt "\n", ## __VA_ARGS__); \ - abort(); \ - } while (false) - #include "../types.h" #include "CommonFuncs.h" +#include "Log.h" #ifdef _WIN32 #include @@ -39,8 +33,6 @@ namespace Common void* AllocateExecutableMemory(size_t size) { - printf("c\n"); - #if defined(_WIN32) void* ptr = VirtualAlloc(nullptr, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE); #else @@ -50,13 +42,10 @@ void* AllocateExecutableMemory(size_t size) if (ptr == MAP_FAILED) ptr = nullptr; #endif - printf("a\n"); if (ptr == nullptr) PanicAlert("Failed to allocate executable memory"); - printf("b\n"); - return ptr; } -- cgit v1.2.3 From 4a0f6b3b4bd60815d0c8259e4ec2a944bfb716be Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 21 Jul 2019 17:28:16 +0200 Subject: jit: fix thumb hi reg alu and mcr halt + mcr/mrc aren't always, msr_imm is never unk on ARM7 --- src/ARMJIT.cpp | 2 +- src/ARMJIT_x64/ARMJIT_ALU.cpp | 4 +--- src/ARMJIT_x64/ARMJIT_Branch.cpp | 21 ++++++++++++++------- src/ARM_InstrInfo.cpp | 33 ++++++++++++++++++++++++++++----- src/ARM_InstrInfo.h | 1 + 5 files changed, 45 insertions(+), 16 deletions(-) (limited to 'src/ARMJIT_x64/ARMJIT_ALU.cpp') diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index aad14c0..6948eee 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -174,7 +174,7 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr); i++; - } while(!instrs[i - 1].Info.Branches() && i < Config::JIT_MaxBlockSize); + } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize); CompiledBlock block = compiler->CompileBlock(cpu, instrs, i); diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index 013f54c..bdf06f7 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -663,7 +663,7 @@ void Compiler::T_Comp_ALU_HiReg() switch (op) { case 0x0: // ADD - Comp_ArithTriOp(&Compiler::ADD, rdMapped, rdMapped, rs, false, opSymmetric|opRetriveCV); + Comp_ArithTriOp(&Compiler::ADD, rdMapped, rdMapped, rs, false, opSymmetric); break; case 0x1: // CMP Comp_CmpOp(2, rdMapped, rs, false); @@ -671,8 +671,6 @@ void Compiler::T_Comp_ALU_HiReg() case 0x2: // MOV if (rdMapped != rs) MOV(32, rdMapped, rs); - TEST(32, rdMapped, rdMapped); - Comp_RetriveFlags(false, false, false); break; } diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index 6ae4aad..9d4c1e2 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -235,16 +235,23 @@ void Compiler::T_Comp_B() void 
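// Annotation on the EndBlock change above and the ARM_InstrInfo.cpp hunk
// below: blocks now end on Info.EndBlock rather than Info.Branches(). A
// branch still terminates the block, but so does an MCR that can halt the
// CPU, which a pure "writes R15" test could not detect. The matched id packs
// the coprocessor register as (cn<<8)|(cm<<4)|cp, so
//   0x704 -> p15, c7, c0, 4   and   0x782 -> p15, c7, c8, 2
// are presumably the two CP15 wait-for-interrupt writes melonDS treats as a
// halt; and MCR/MRC to anything but p15 on the ARM9 (p14 on the ARM7) is
// demoted back to an undefined instruction.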
Compiler::T_Comp_BranchXchangeReg() { bool link = CurInstr.Instr & (1 << 7); - if (link && Num == 1) - { - printf("BLX unsupported on ARM7!!!\n"); - return; - } - OpArg rn = MapReg(CurInstr.A_Reg(3)); if (link) + { + if (Num == 1) + { + printf("BLX unsupported on ARM7!!!\n"); + return; + } + MOV(32, R(RSCRATCH), MapReg(CurInstr.A_Reg(3))); MOV(32, MapReg(14), Imm32(R15 - 1)); - Comp_JumpTo(rn.GetSimpleReg()); + Comp_JumpTo(RSCRATCH); + } + else + { + OpArg rn = MapReg(CurInstr.A_Reg(3)); + Comp_JumpTo(rn.GetSimpleReg()); + } } void Compiler::T_Comp_BL_LONG_1() diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 5db2471..b70c8dc 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -152,11 +152,11 @@ const u32 A_BX = A_BranchAlways | A_Read0 | ak(ak_BX); const u32 A_BLX_REG = A_BranchAlways | A_Link | A_Read0 | ak(ak_BLX_REG); const u32 A_UNK = A_BranchAlways | A_Link | ak(ak_UNK); -const u32 A_MSR_IMM = A_UnkOnARM7 | ak(ak_MSR_IMM); -const u32 A_MSR_REG = A_Read0 | A_UnkOnARM7 | ak(ak_MSR_REG); -const u32 A_MRS = A_Write12 | A_UnkOnARM7 | ak(ak_MRS); -const u32 A_MCR = A_Read12 | A_UnkOnARM7 | ak(ak_MCR); -const u32 A_MRC = A_Write12 | A_UnkOnARM7 | ak(ak_MRC); +const u32 A_MSR_IMM = ak(ak_MSR_IMM); +const u32 A_MSR_REG = A_Read0 | ak(ak_MSR_REG); +const u32 A_MRS = A_Write12 | ak(ak_MRS); +const u32 A_MCR = A_Read12 | ak(ak_MCR); +const u32 A_MRC = A_Write12 | ak(ak_MRC); const u32 A_SVC = A_BranchAlways | A_Link | ak(ak_SVC); // THUMB @@ -310,6 +310,7 @@ Info Decode(bool thumb, u32 num, u32 instr) res.DstRegs |= 1 << 15; res.Kind = (data >> 16) & 0x3F; + res.EndBlock = res.Branches(); return res; } @@ -324,6 +325,26 @@ Info Decode(bool thumb, u32 num, u32 instr) res.Kind = (data >> 13) & 0x1FF; + if (res.Kind == ak_MCR) + { + u32 cn = (instr >> 16) & 0xF; + u32 cm = instr & 0xF; + u32 cpinfo = (instr >> 5) & 0x7; + u32 id = (cn<<8)|(cm<<4)|cpinfo; + if (id == 0x704 || id == 0x782) + res.EndBlock |= true; + } + if (res.Kind == ak_MCR || res.Kind == ak_MRC) + { + u32 cp = ((instr >> 8) & 0xF); + if ((num == 0 && cp != 15) || (num == 1 && cp != 14)) + { + printf("happens\n"); + data = A_UNK; + res.Kind = ak_UNK; + } + } + if (data & A_Read0) res.SrcRegs |= 1 << (instr & 0xF); if (data & A_Read16) @@ -361,6 +382,8 @@ Info Decode(bool thumb, u32 num, u32 instr) if (res.Kind == ak_LDM) res.DstRegs |= instr & (1 << 15); // this is right + res.EndBlock |= res.Branches(); + return res; } } diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index 51dcfa2..4fe9b10 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -220,6 +220,7 @@ struct Info u16 DstRegs, SrcRegs; u16 Kind; + bool EndBlock; bool Branches() { return DstRegs & (1 << 15); -- cgit v1.2.3 From f31976fed0c0c61e403ccaee5154c1f25d24d60d Mon Sep 17 00:00:00 2001 From: RSDuck Date: Mon, 22 Jul 2019 01:04:42 +0200 Subject: jit: fix RSC --- src/ARMJIT_x64/ARMJIT_ALU.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/ARMJIT_x64/ARMJIT_ALU.cpp') diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index bdf06f7..368fd8b 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -181,7 +181,7 @@ void Compiler::A_Comp_Arith() Comp_ArithTriOp(&Compiler::ADC, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry); break; case 0x6: // SBC - Comp_ArithTriOp(&Compiler::SBB, rd, rn, op2, carryUsed, opSymmetric|sFlag|opRetriveCV|opSyncCarry|opInvertCarry); + Comp_ArithTriOp(&Compiler::SBB, rd, rn, op2, carryUsed, 
sFlag|opRetriveCV|opSyncCarry|opInvertCarry); break; case 0x7: // RSC Comp_ArithTriOpReverse(&Compiler::SBB, rd, rn, op2, carryUsed, sFlag|opRetriveCV|opInvertCarry|opSyncCarry); -- cgit v1.2.3 From 5e443e79625b66daf15350d68921d74673cb5232 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Fri, 16 Aug 2019 23:17:08 +0200 Subject: remove unneeded dolphin code, C++11 static_assert --- src/ARMJIT.cpp | 2 + src/ARMJIT_x64/ARMJIT_ALU.cpp | 4 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 19 ++++---- src/ARMJIT_x64/ARMJIT_Compiler.h | 5 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 2 +- src/dolphin/Assert.h | 47 ------------------- src/dolphin/CodeBlock.h | 91 ------------------------------------- src/dolphin/Compat.h | 63 +++++++++++++++++++++++++ src/dolphin/Intrinsics.h | 72 ----------------------------- src/dolphin/Log.h | 21 --------- src/dolphin/x64CPUDetect.cpp | 1 - src/dolphin/x64Emitter.cpp | 3 +- src/dolphin/x64Emitter.h | 13 +----- 13 files changed, 84 insertions(+), 259 deletions(-) delete mode 100644 src/dolphin/Assert.h delete mode 100644 src/dolphin/CodeBlock.h create mode 100644 src/dolphin/Compat.h delete mode 100644 src/dolphin/Intrinsics.h delete mode 100644 src/dolphin/Log.h (limited to 'src/ARMJIT_x64/ARMJIT_ALU.cpp') diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 6948eee..74554d7 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -188,6 +188,8 @@ CompiledBlock CompileBlock(ARM* cpu) void InvalidateBlockCache() { + printf("Resetting JIT block cache...\n"); + memset(cache.MainRAM, 0, sizeof(cache.MainRAM)); memset(cache.SWRAM, 0, sizeof(cache.SWRAM)); memset(cache.ARM9_BIOS, 0, sizeof(cache.ARM9_BIOS)); diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index 368fd8b..f0bcf8e 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -257,7 +257,7 @@ void Compiler::Comp_MulOp(bool S, bool add, Gen::OpArg rd, Gen::OpArg rm, Gen::O Comp_AddCycles_CI(RSCRATCH, add ? 
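// Annotation on the "fix RSC" change above: SBC computes rn - op2 - !carry,
// which is not commutative, yet opSymmetric licensed Comp_ArithTriOp to swap
// the operands whenever op2 already lived in RSCRATCH, yielding
// op2 - rn - !carry instead. Dropping opSymmetric forces the ordered path.
// Worked example: SBC r0, r1, r2 with C=1 must produce r1 - r2, never r2 - r1.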
2 : 1); } - static_assert(EAX == RSCRATCH); + static_assert(EAX == RSCRATCH, "Someone changed RSCRATCH!"); MOV(32, R(RSCRATCH), rm); if (add) { @@ -383,7 +383,7 @@ OpArg Compiler::Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, b } MOV(32, R(RSCRATCH), rm); - static_assert(RSCRATCH3 == ECX); + static_assert(RSCRATCH3 == ECX, "Someone changed RSCRATCH3"); MOV(32, R(ECX), rs); AND(32, R(ECX), Imm32(0xFF)); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index cb11f73..0fbcfda 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -63,12 +63,11 @@ Compiler::Compiler() mprotect(pageAligned, alignedSize, PROT_EXEC | PROT_READ | PROT_WRITE); #endif - region = pageAligned; - region_size = alignedSize; - total_region_size = region_size; + ResetStart = pageAligned; + CodeMemSize = alignedSize; } - ClearCodeSpace(); + Reset(); for (int i = 0; i < 3; i++) { @@ -169,9 +168,8 @@ Compiler::Compiler() } // move the region forward to prevent overwriting the generated functions - region_size -= GetWritableCodePtr() - region; - total_region_size = region_size; - region = GetWritableCodePtr(); + CodeMemSize -= GetWritableCodePtr() - ResetStart; + ResetStart = GetWritableCodePtr(); } void Compiler::LoadCPSR() @@ -208,7 +206,7 @@ Gen::FixupBranch Compiler::CheckCondition(u32 cond) { if (cond >= 0x8) { - static_assert(RSCRATCH3 == ECX); + static_assert(RSCRATCH3 == ECX, "RSCRATCH has to be equal to ECX!"); MOV(32, R(RSCRATCH3), R(RCPSR)); SHR(32, R(RSCRATCH3), Imm8(28)); MOV(32, R(RSCRATCH), Imm32(1)); @@ -346,12 +344,13 @@ const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { void Compiler::Reset() { - ClearCodeSpace(); + memset(ResetStart, 0xcc, CodeMemSize); + SetCodePtr(ResetStart); } CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) { - if (IsAlmostFull()) + if (CodeMemSize - (GetWritableCodePtr() - ResetStart) < 1024 * 32) // guess... InvalidateBlockCache(); ConstantCycles = 0; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 0ce7d8d..3151cbc 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -17,7 +17,7 @@ const Gen::X64Reg RSCRATCH2 = Gen::EDX; const Gen::X64Reg RSCRATCH3 = Gen::ECX; -class Compiler : public Gen::X64CodeBlock +class Compiler : public Gen::XEmitter { public: Compiler(); @@ -132,6 +132,9 @@ public: return Gen::R(RegCache.Mapping[reg]); } + u8* ResetStart; + u32 CodeMemSize; + void* MemoryFuncs9[3][2]; void* MemoryFuncs7[3][2][2]; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index ee0a7af..6386f8b 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -171,7 +171,7 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) } RET(); - static_assert(RSCRATCH == EAX); + static_assert(RSCRATCH == EAX, "Someone changed RSCRATCH!"); return res; } diff --git a/src/dolphin/Assert.h b/src/dolphin/Assert.h deleted file mode 100644 index 4eb16e0..0000000 --- a/src/dolphin/Assert.h +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright 2015 Dolphin Emulator Project -// Licensed under GPLv2+ -// Refer to the license_dolphin.txt file included. - -#pragma once - -#include - -#define ASSERT_MSG(_t_, _a_, _fmt_, ...) \ - assert(_a_) \ - /*do \ - { \ - if (!(_a_)) \ - { \ - if (!PanicYesNo(_fmt_, ##__VA_ARGS__)) \ - Crash(); \ - } \ - } while (0)*/ - -#define DEBUG_ASSERT_MSG(_t_, _a_, _msg_, ...) 
\ - assert(_a_); \ - /*do \ - { \ - if (MAX_LOGLEVEL >= LogTypes::LOG_LEVELS::LDEBUG && !(_a_)) \ - { \ - ERROR_LOG(_t_, _msg_, ##__VA_ARGS__); \ - if (!PanicYesNo(_msg_, ##__VA_ARGS__)) \ - Crash(); \ - } \ - } while (0)*/ - -#define ASSERT(_a_) \ - assert(_a_) \ - /*do \ - { \ - ASSERT_MSG(MASTER_LOG, _a_, \ - _trans("An error occurred.\n\n Line: %d\n File: %s\n\nIgnore and continue?"), \ - __LINE__, __FILE__); \ - } while (0)*/ - -#define DEBUG_ASSERT(_a_) \ - assert(_a_) \ - /*do \ - { \ - if (MAX_LOGLEVEL >= LogTypes::LOG_LEVELS::LDEBUG) \ - ASSERT(_a_); \ - } while (0)*/ diff --git a/src/dolphin/CodeBlock.h b/src/dolphin/CodeBlock.h deleted file mode 100644 index e71cf6d..0000000 --- a/src/dolphin/CodeBlock.h +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright 2014 Dolphin Emulator Project -// Licensed under GPLv2+ -// Refer to the license_dolphin.txt file included. - -#pragma once - -#include -#include - -#include "Assert.h" -#include "../types.h" - -namespace Common -{ -// Everything that needs to generate code should inherit from this. -// You get memory management for free, plus, you can use all emitter functions without -// having to prefix them with gen-> or something similar. -// Example implementation: -// class JIT : public CodeBlock {} -template -class CodeBlock : public T -{ -private: - // A privately used function to set the executable RAM space to something invalid. - // For debugging usefulness it should be used to set the RAM to a host specific breakpoint - // instruction - virtual void PoisonMemory() = 0; - -protected: - u8* region = nullptr; - // Size of region we can use. - size_t region_size = 0; - // Original size of the region we allocated. - size_t total_region_size = 0; - - bool m_is_child = false; - std::vector m_children; - -public: - CodeBlock() = default; - virtual ~CodeBlock() - { - } - CodeBlock(const CodeBlock&) = delete; - CodeBlock& operator=(const CodeBlock&) = delete; - CodeBlock(CodeBlock&&) = delete; - CodeBlock& operator=(CodeBlock&&) = delete; - - // Always clear code space with breakpoints, so that if someone accidentally executes - // uninitialized, it just breaks into the debugger. - void ClearCodeSpace() - { - PoisonMemory(); - ResetCodePtr(); - } - - bool IsInSpace(const u8* ptr) const { return ptr >= region && ptr < (region + region_size); } - - void ResetCodePtr() { T::SetCodePtr(region); } - size_t GetSpaceLeft() const - { - ASSERT(static_cast(T::GetCodePtr() - region) < region_size); - return region_size - (T::GetCodePtr() - region); - } - - bool IsAlmostFull() const - { - // This should be bigger than the biggest block ever. 
- return GetSpaceLeft() < 0x10000; - } - - bool HasChildren() const { return region_size != total_region_size; } - u8* AllocChildCodeSpace(size_t child_size) - { - ASSERT_MSG(DYNA_REC, child_size < GetSpaceLeft(), "Insufficient space for child allocation."); - u8* child_region = region + region_size - child_size; - region_size -= child_size; - return child_region; - } - void AddChildCodeSpace(CodeBlock* child, size_t child_size) - { - u8* child_region = AllocChildCodeSpace(child_size); - child->m_is_child = true; - child->region = child_region; - child->region_size = child_size; - child->total_region_size = child_size; - child->ResetCodePtr(); - m_children.emplace_back(child); - } -}; -} // namespace Common diff --git a/src/dolphin/Compat.h b/src/dolphin/Compat.h new file mode 100644 index 0000000..f2f52a5 --- /dev/null +++ b/src/dolphin/Compat.h @@ -0,0 +1,63 @@ +// Stubs for Assert.h and Log.h +#pragma once + +#include + +// Assert stub +#define ASSERT_MSG(_t_, _a_, _fmt_, ...) \ + assert(_a_) \ + /*do \ + { \ + if (!(_a_)) \ + { \ + if (!PanicYesNo(_fmt_, ##__VA_ARGS__)) \ + Crash(); \ + } \ + } while (0)*/ + +#define DEBUG_ASSERT_MSG(_t_, _a_, _msg_, ...) \ + assert(_a_); \ + /*do \ + { \ + if (MAX_LOGLEVEL >= LogTypes::LOG_LEVELS::LDEBUG && !(_a_)) \ + { \ + ERROR_LOG(_t_, _msg_, ##__VA_ARGS__); \ + if (!PanicYesNo(_msg_, ##__VA_ARGS__)) \ + Crash(); \ + } \ + } while (0)*/ + +#define ASSERT(_a_) \ + assert(_a_) \ + /*do \ + { \ + ASSERT_MSG(MASTER_LOG, _a_, \ + _trans("An error occurred.\n\n Line: %d\n File: %s\n\nIgnore and continue?"), \ + __LINE__, __FILE__); \ + } while (0)*/ + +#define DEBUG_ASSERT(_a_) \ + assert(_a_) \ + /*do \ + { \ + if (MAX_LOGLEVEL >= LogTypes::LOG_LEVELS::LDEBUG) \ + ASSERT(_a_); \ + } while (0)*/ + +// Log Stub +#include + +#define PanicAlert(fmt, ...) \ + do \ + { \ + printf(fmt "\n", ## __VA_ARGS__); \ + abort(); \ + } while (false) + +#define DYNA_REC 0 + +#define ERROR_LOG(which, fmt, ...) \ + do \ + { \ + printf(fmt "\n", ## __VA_ARGS__); \ + } while (false) diff --git a/src/dolphin/Intrinsics.h b/src/dolphin/Intrinsics.h deleted file mode 100644 index 483f219..0000000 --- a/src/dolphin/Intrinsics.h +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2015 Dolphin Emulator Project -// Licensed under GPLv2+ -// Refer to the license_dolphin.txt file included. - -#pragma once - -#if defined(_M_X86) - -/** - * It is assumed that all compilers used to build Dolphin support intrinsics up to and including - * SSE 4.2 on x86/x64. - */ - -#if defined(__GNUC__) || defined(__clang__) - -/** - * Due to limitations in GCC, SSE intrinsics are only available when compiling with the - * corresponding instruction set enabled. However, using the target attribute, we can compile - * single functions with a different target instruction set, while still creating a generic build. - * - * Since this instruction set is enabled per-function, any callers should verify that the - * instruction set is supported at runtime before calling it, and provide a fallback implementation - * when not supported. - * - * When building with -march=native, or enabling the instruction sets in the compile flags, permit - * usage of the instrinsics without any function attributes. If the command-line architecture does - * not support this instruction set, enable it via function targeting. 
- */ - -#include -#ifndef __SSE4_2__ -#define FUNCTION_TARGET_SSE42 [[gnu::target("sse4.2")]] -#endif -#ifndef __SSE4_1__ -#define FUNCTION_TARGET_SSR41 [[gnu::target("sse4.1")]] -#endif -#ifndef __SSSE3__ -#define FUNCTION_TARGET_SSSE3 [[gnu::target("ssse3")]] -#endif -#ifndef __SSE3__ -#define FUNCTION_TARGET_SSE3 [[gnu::target("sse3")]] -#endif - -#elif defined(_MSC_VER) || defined(__INTEL_COMPILER) - -/** - * MSVC and ICC support intrinsics for any instruction set without any function attributes. - */ -#include - -#endif // defined(_MSC_VER) || defined(__INTEL_COMPILER) - -#endif // _M_X86 - -/** - * Define the FUNCTION_TARGET macros to nothing if they are not needed, or not on an X86 platform. - * This way when a function is defined with FUNCTION_TARGET you don't need to define a second - * version without the macro around a #ifdef guard. Be careful when using intrinsics, as all use - * should still be placed around a #ifdef _M_X86 if the file is compiled on all architectures. - */ -#ifndef FUNCTION_TARGET_SSE42 -#define FUNCTION_TARGET_SSE42 -#endif -#ifndef FUNCTION_TARGET_SSR41 -#define FUNCTION_TARGET_SSR41 -#endif -#ifndef FUNCTION_TARGET_SSSE3 -#define FUNCTION_TARGET_SSSE3 -#endif -#ifndef FUNCTION_TARGET_SSE3 -#define FUNCTION_TARGET_SSE3 -#endif diff --git a/src/dolphin/Log.h b/src/dolphin/Log.h deleted file mode 100644 index a7f4b6a..0000000 --- a/src/dolphin/Log.h +++ /dev/null @@ -1,21 +0,0 @@ -#pragma once - -#include "CommonFuncs.h" - -#include - -#define PanicAlert(fmt, ...) \ - do \ - { \ - printf(fmt "\n", ## __VA_ARGS__); \ - abort(); \ - } while (false) - - -#define DYNA_REC 0 - -#define ERROR_LOG(which, fmt, ...) \ - do \ - { \ - printf(fmt "\n", ## __VA_ARGS__); \ - } while (false) diff --git a/src/dolphin/x64CPUDetect.cpp b/src/dolphin/x64CPUDetect.cpp index 05ee11c..49b51c9 100644 --- a/src/dolphin/x64CPUDetect.cpp +++ b/src/dolphin/x64CPUDetect.cpp @@ -7,7 +7,6 @@ #include "CPUDetect.h" #include "../types.h" -#include "Intrinsics.h" #ifndef _MSVC_VER diff --git a/src/dolphin/x64Emitter.cpp b/src/dolphin/x64Emitter.cpp index 7849624..343f314 100644 --- a/src/dolphin/x64Emitter.cpp +++ b/src/dolphin/x64Emitter.cpp @@ -7,9 +7,10 @@ #include "CPUDetect.h" #include "../types.h" -#include "Log.h" #include "x64Emitter.h" #include "x64Reg.h" +#include "Compat.h" +#include "CommonFuncs.h" namespace Gen { diff --git a/src/dolphin/x64Emitter.h b/src/dolphin/x64Emitter.h index 122850d..869acb6 100644 --- a/src/dolphin/x64Emitter.h +++ b/src/dolphin/x64Emitter.h @@ -12,9 +12,8 @@ #include #include -#include "Assert.h" +#include "Compat.h" #include "BitSet.h" -#include "CodeBlock.h" #include "../types.h" #include "x64ABI.h" @@ -1167,14 +1166,4 @@ public: } }; // class XEmitter -class X64CodeBlock : public Common::CodeBlock -{ -private: - void PoisonMemory() override - { - // x86/64: 0xCC = breakpoint - memset(region, 0xCC, region_size); - } -}; - } // namespace -- cgit v1.2.3 From 5ea91b8a039e0735ac5cb102e2375c26c4f7a150 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 25 Aug 2019 12:28:48 +0200 Subject: optimise away unneeded flag sets - especially useful for thumb code and larger max block sizes - can still be improved upon --- src/ARMJIT.cpp | 24 ++++ src/ARMJIT.h | 1 + src/ARMJIT_x64/ARMJIT_ALU.cpp | 64 +++++++--- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 9 ++ src/ARMJIT_x64/ARMJIT_Compiler.h | 6 +- src/ARM_InstrInfo.cpp | 238 +++++++++++++++++++++++-------------- src/ARM_InstrInfo.h | 13 ++ src/libui_sdl/main.cpp | 2 + 8 files changed, 248 insertions(+), 109 deletions(-) 
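The heart of this commit is a backwards liveness pass over the decoded block:
floodFillSetFlags starts at every instruction that reads flags (or that the
JIT cannot compile and which therefore needs a fully valid CPSR, mask 0xF)
and walks toward the block start, recording in each earlier instruction's
SetFlags nibble only those of NZCV that some later instruction actually
consumes. A short worked trace, assuming a conditional branch on NE reports
ReadFlags = Z:

    subs r0, r0, #1    ; writes NZCV
    bne  loop          ; reads Z

    floodFillSetFlags(instrs, i - 2, 0x4 /* Z */) marks the SUBS with
    SetFlags = Z, so the emitted x64 only has to materialize Z and can skip
    the SETcc/LEA work for C and V entirely.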
(limited to 'src/ARMJIT_x64/ARMJIT_ALU.cpp') diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 949bc1c..3b6bc2e 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -126,6 +126,24 @@ void DeInit() delete compiler; } +void floodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) +{ + for (int j = start; j >= 0; j--) + { + u8 match = instrs[j].Info.WriteFlags & flags; + u8 matchMaybe = (instrs[j].Info.WriteFlags >> 4) & flags; + if (matchMaybe) // writes flags maybe + instrs[j].SetFlags |= matchMaybe; + if (match) + { + instrs[j].SetFlags |= match; + flags &= ~match; + if (!flags) + return; + } + } +} + CompiledBlock CompileBlock(ARM* cpu) { bool thumb = cpu->CPSR & 0x20; @@ -175,8 +193,14 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr); i++; + + bool canCompile = compiler->CanCompile(thumb, instrs[i - 1].Info.Kind); + if (instrs[i - 1].Info.ReadFlags != 0 || !canCompile) + floodFillSetFlags(instrs, i - 2, canCompile ? instrs[i - 1].Info.ReadFlags : 0xF); } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize); + floodFillSetFlags(instrs, i - 1, 0xF); + CompiledBlock block = compiler->CompileBlock(cpu, instrs, i); if (cpu->Num == 0) diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 0fc1c38..6197695 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -28,6 +28,7 @@ struct FetchedInstr return Instr >> 28; } + u8 SetFlags; u32 Instr; u32 NextInstr[2]; diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index f0bcf8e..6a7d711 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -111,6 +111,8 @@ OpArg Compiler::A_Comp_GetALUOp2(bool S, bool& carryUsed) } else { + S = S && (CurInstr.SetFlags & 0x2); + int op = (CurInstr.Instr >> 5) & 0x3; if (CurInstr.Instr & (1 << 4)) { @@ -215,7 +217,8 @@ void Compiler::A_Comp_MovOp() if (S) { - TEST(32, rd, rd); + if (FlagsNZRequired()) + TEST(32, rd, rd); Comp_RetriveFlags(false, false, carryUsed); } @@ -263,12 +266,14 @@ void Compiler::Comp_MulOp(bool S, bool add, Gen::OpArg rd, Gen::OpArg rm, Gen::O { IMUL(32, RSCRATCH, rs); LEA(32, rd.GetSimpleReg(), MRegSum(RSCRATCH, rn.GetSimpleReg())); - TEST(32, rd, rd); + if (S && FlagsNZRequired()) + TEST(32, rd, rd); } else { IMUL(32, RSCRATCH, rs); MOV(32, rd, R(RSCRATCH)); + if (S && FlagsNZRequired()) TEST(32, R(RSCRATCH), R(RSCRATCH)); } @@ -331,7 +336,7 @@ void Compiler::A_Comp_SMULL_SMLAL() else { IMUL(64, RSCRATCH2, R(RSCRATCH3)); - if (S) + if (S && FlagsNZRequired()) TEST(64, R(RSCRATCH2), R(RSCRATCH2)); } @@ -345,9 +350,20 @@ void Compiler::A_Comp_SMULL_SMLAL() void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) { - CPSRDirty = true; + if (CurInstr.SetFlags == 0) + return; + if (retriveCV && !(CurInstr.SetFlags & 0x3)) + retriveCV = false; bool carryOnly = !retriveCV && carryUsed; + if (carryOnly && !(CurInstr.SetFlags & 0x2)) + { + carryUsed = false; + carryOnly = false; + } + + CPSRDirty = true; + if (retriveCV) { SETcc(CC_O, R(RSCRATCH)); @@ -355,19 +371,28 @@ void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) LEA(32, RSCRATCH2, MComplex(RSCRATCH, RSCRATCH3, SCALE_2, 0)); } - SETcc(CC_S, R(RSCRATCH)); - SETcc(CC_Z, R(RSCRATCH3)); - LEA(32, RSCRATCH, MComplex(RSCRATCH3, RSCRATCH, SCALE_2, 0)); - int shiftAmount = 30; - if (retriveCV || carryUsed) + if (FlagsNZRequired()) { - LEA(32, RSCRATCH, MComplex(RSCRATCH2, RSCRATCH, carryOnly ? SCALE_2 : SCALE_4, 0)); - shiftAmount = carryOnly ? 
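// The branchless flag packing above, spelled out: each SETcc leaves 0 or 1
// in a byte register, and each LEA combines two of them as base + index*scale:
//   RSCRATCH2 = V + 2*C                    (from the retriveCV block)
//   RSCRATCH  = Z + 2*N
//   RSCRATCH  = RSCRATCH2 + 4*RSCRATCH   = V | C<<1 | Z<<2 | N<<3
//   SHL 28 -> NZCV land in CPSR bits 31..28
// In the carry-only case RSCRATCH2 holds just C, the scale drops to 2 and
// the shift to 29, leaving C in bit 29; NZ-only keeps scale 2 and shift 30.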
29 : 28; - } - SHL(32, R(RSCRATCH), Imm8(shiftAmount)); + SETcc(CC_S, R(RSCRATCH)); + SETcc(CC_Z, R(RSCRATCH3)); + LEA(32, RSCRATCH, MComplex(RSCRATCH3, RSCRATCH, SCALE_2, 0)); + int shiftAmount = 30; + if (retriveCV || carryUsed) + { + LEA(32, RSCRATCH, MComplex(RSCRATCH2, RSCRATCH, carryOnly ? SCALE_2 : SCALE_4, 0)); + shiftAmount = carryOnly ? 29 : 28; + } + SHL(32, R(RSCRATCH), Imm8(shiftAmount)); - AND(32, R(RCPSR), Imm32(0x3FFFFFFF & ~(carryUsed << 29) & ~((retriveCV ? 3 : 0) << 28))); - OR(32, R(RCPSR), R(RSCRATCH)); + AND(32, R(RCPSR), Imm32(0x3FFFFFFF & ~(carryUsed << 29) & ~((retriveCV ? 3 : 0) << 28))); + OR(32, R(RCPSR), R(RSCRATCH)); + } + else + { + SHL(32, R(RSCRATCH2), Imm8(carryOnly ? 29 : 28)); + AND(32, R(RCPSR), Imm32(0xFFFFFFFF & ~(carryUsed << 29) & ~((retriveCV ? 3 : 0) << 28))); + OR(32, R(RCPSR), R(RSCRATCH2)); + } } // always uses RSCRATCH, RSCRATCH2 only if S == true @@ -523,7 +548,8 @@ void Compiler::T_Comp_ShiftImm() if (shifted != rd) MOV(32, rd, shifted); - TEST(32, rd, rd); + if (FlagsNZRequired()) + TEST(32, rd, rd); Comp_RetriveFlags(false, false, carryUsed); } @@ -557,7 +583,8 @@ void Compiler::T_Comp_ALU_Imm8() { case 0x0: MOV(32, rd, imm); - TEST(32, rd, rd); + if (FlagsNZRequired()) + TEST(32, rd, rd); Comp_RetriveFlags(false, false, false); return; case 0x1: @@ -607,7 +634,8 @@ void Compiler::T_Comp_ALU() int shiftOp = op == 0x7 ? 3 : op - 0x2; bool carryUsed; OpArg shifted = Comp_RegShiftReg(shiftOp, rs, rd, true, carryUsed); - TEST(32, shifted, shifted); + if (FlagsNZRequired()) + TEST(32, shifted, shifted); MOV(32, rd, shifted); Comp_RetriveFlags(false, false, true); } diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index ab13cb6..6abb2bb 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -342,6 +342,11 @@ const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = { }; #undef F +bool Compiler::CanCompile(bool thumb, u16 kind) +{ + return (thumb ? T_Comp[kind] : A_Comp[kind]) != NULL; +} + void Compiler::Reset() { memset(ResetStart, 0xcc, CodeMemSize); @@ -380,11 +385,15 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs // TODO: this is ugly as a whole, do better RegCache = RegisterCache(this, instrs, instrsCount); + printf("block start %d\n", Thumb); + for (int i = 0; i < instrsCount; i++) { R15 += Thumb ? 2 : 4; CurInstr = instrs[i]; + printf("%x %d %d %d\n", CurInstr.Instr, CurInstr.SetFlags, CurInstr.Info.WriteFlags, CurInstr.Info.ReadFlags); + CompileFunc comp = Thumb ? 
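// Key to the SetFlags masks used throughout this patch: the nibble is NZCV
// with N = 0x8, Z = 0x4, C = 0x2, V = 0x1. Hence FlagsNZRequired() tests
// 0xC, the shifter-carry shortcut in A_Comp_GetALUOp2 tests 0x2, and
// Comp_RetriveFlags skips the CV work when (SetFlags & 0x3) == 0.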
T_Comp[CurInstr.Info.Kind] : A_Comp[CurInstr.Info.Kind]; diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 3151cbc..8861884 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -29,6 +29,8 @@ public: void LoadReg(int reg, Gen::X64Reg nativeReg); void SaveReg(int reg, Gen::X64Reg nativeReg); + bool CanCompile(bool thumb, u16 kind); + typedef void (Compiler::*CompileFunc)(); void Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR = false); @@ -64,7 +66,6 @@ public: void A_Comp_BranchImm(); void A_Comp_BranchXchangeReg(); - void T_Comp_ShiftImm(); void T_Comp_AddSub_(); void T_Comp_ALU_Imm8(); @@ -121,6 +122,9 @@ public: void LoadCPSR(); void SaveCPSR(); + bool FlagsNZRequired() + { return CurInstr.SetFlags & 0xC; } + Gen::FixupBranch CheckCondition(u32 cond); Gen::OpArg MapReg(int reg) diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 4813799..ea6d827 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -5,7 +5,7 @@ namespace ARMInstrInfo { -#define ak(x) ((x) << 13) +#define ak(x) ((x) << 18) enum { A_Read0 = 1 << 0, @@ -26,69 +26,81 @@ enum { A_Link = 1 << 10, A_UnkOnARM7 = 1 << 11, + + A_SetNZ = 1 << 12, + A_SetCV = 1 << 13, + A_SetMaybeC = 1 << 14, + A_MulFlags = 1 << 15, + A_ReadC = 1 << 16, + A_RRXReadC = 1 << 17, }; #define A_BIOP A_Read16 #define A_MONOOP 0 -#define A_IMPLEMENT_ALU_OP(x,k) \ - const u32 A_##x##_IMM = A_Write12 | A_##k | ak(ak_##x##_IMM); \ - const u32 A_##x##_REG_LSL_IMM = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSL_IMM); \ - const u32 A_##x##_REG_LSR_IMM = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSR_IMM); \ - const u32 A_##x##_REG_ASR_IMM = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ASR_IMM); \ - const u32 A_##x##_REG_ROR_IMM = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ROR_IMM); \ - const u32 A_##x##_REG_LSL_REG = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG); \ - const u32 A_##x##_REG_LSR_REG = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG); \ - const u32 A_##x##_REG_ASR_REG = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG); \ - const u32 A_##x##_REG_ROR_REG = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG); \ +#define A_ARITH A_SetCV +#define A_LOGIC A_SetMaybeC +#define A_ARITH_IMM A_SetCV +#define A_LOGIC_IMM 0 + +#define A_IMPLEMENT_ALU_OP(x,k,a,c) \ + const u32 A_##x##_IMM = A_Write12 | c | A_##k | ak(ak_##x##_IMM); \ + const u32 A_##x##_REG_LSL_IMM = A_Write12 | c | A_##k | A_Read0 | ak(ak_##x##_REG_LSL_IMM); \ + const u32 A_##x##_REG_LSR_IMM = A_Write12 | c | A_##k | A_Read0 | ak(ak_##x##_REG_LSR_IMM); \ + const u32 A_##x##_REG_ASR_IMM = A_Write12 | c | A_##k | A_Read0 | ak(ak_##x##_REG_ASR_IMM); \ + const u32 A_##x##_REG_ROR_IMM = A_RRXReadC | A_Write12 | c | A_##k | A_Read0 | ak(ak_##x##_REG_ROR_IMM); \ + const u32 A_##x##_REG_LSL_REG = A_Write12 | c | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG); \ + const u32 A_##x##_REG_LSR_REG = A_Write12 | c | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG); \ + const u32 A_##x##_REG_ASR_REG = A_Write12 | c | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG); \ + const u32 A_##x##_REG_ROR_REG = A_Write12 | c | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG); \ \ - const u32 A_##x##_IMM_S = A_Write12 | A_##k | ak(ak_##x##_IMM_S); \ - const u32 A_##x##_REG_LSL_IMM_S = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSL_IMM_S); \ - const u32 A_##x##_REG_LSR_IMM_S = A_Write12 | A_##k | A_Read0 | 
ak(ak_##x##_REG_LSR_IMM_S); \ - const u32 A_##x##_REG_ASR_IMM_S = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ASR_IMM_S); \ - const u32 A_##x##_REG_ROR_IMM_S = A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ROR_IMM_S); \ - const u32 A_##x##_REG_LSL_REG_S = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG_S); \ - const u32 A_##x##_REG_LSR_REG_S = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG_S); \ - const u32 A_##x##_REG_ASR_REG_S = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG_S); \ - const u32 A_##x##_REG_ROR_REG_S = A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG_S); - -A_IMPLEMENT_ALU_OP(AND,BIOP) -A_IMPLEMENT_ALU_OP(EOR,BIOP) -A_IMPLEMENT_ALU_OP(SUB,BIOP) -A_IMPLEMENT_ALU_OP(RSB,BIOP) -A_IMPLEMENT_ALU_OP(ADD,BIOP) -A_IMPLEMENT_ALU_OP(ADC,BIOP) -A_IMPLEMENT_ALU_OP(SBC,BIOP) -A_IMPLEMENT_ALU_OP(RSC,BIOP) -A_IMPLEMENT_ALU_OP(ORR,BIOP) -A_IMPLEMENT_ALU_OP(MOV,MONOOP) -A_IMPLEMENT_ALU_OP(BIC,BIOP) -A_IMPLEMENT_ALU_OP(MVN,MONOOP) + const u32 A_##x##_IMM_S = A_SetNZ | c | A_##a##_IMM | A_Write12 | A_##k | ak(ak_##x##_IMM_S); \ + const u32 A_##x##_REG_LSL_IMM_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSL_IMM_S); \ + const u32 A_##x##_REG_LSR_IMM_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_LSR_IMM_S); \ + const u32 A_##x##_REG_ASR_IMM_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ASR_IMM_S); \ + const u32 A_##x##_REG_ROR_IMM_S = A_RRXReadC | A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | ak(ak_##x##_REG_ROR_IMM_S); \ + const u32 A_##x##_REG_LSL_REG_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG_S); \ + const u32 A_##x##_REG_LSR_REG_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG_S); \ + const u32 A_##x##_REG_ASR_REG_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG_S); \ + const u32 A_##x##_REG_ROR_REG_S = A_SetNZ | c | A_##a | A_Write12 | A_##k | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG_S); + +A_IMPLEMENT_ALU_OP(AND,BIOP,LOGIC,0) +A_IMPLEMENT_ALU_OP(EOR,BIOP,LOGIC,0) +A_IMPLEMENT_ALU_OP(SUB,BIOP,ARITH,0) +A_IMPLEMENT_ALU_OP(RSB,BIOP,ARITH,0) +A_IMPLEMENT_ALU_OP(ADD,BIOP,ARITH,0) +A_IMPLEMENT_ALU_OP(ADC,BIOP,ARITH,A_ReadC) +A_IMPLEMENT_ALU_OP(SBC,BIOP,ARITH,A_ReadC) +A_IMPLEMENT_ALU_OP(RSC,BIOP,ARITH,A_ReadC) +A_IMPLEMENT_ALU_OP(ORR,BIOP,LOGIC,0) +A_IMPLEMENT_ALU_OP(MOV,MONOOP,LOGIC,0) +A_IMPLEMENT_ALU_OP(BIC,BIOP,LOGIC,0) +A_IMPLEMENT_ALU_OP(MVN,MONOOP,LOGIC,0) const u32 A_MOV_REG_LSL_IMM_DBG = A_MOV_REG_LSL_IMM; -#define A_IMPLEMENT_ALU_TEST(x) \ - const u32 A_##x##_IMM = A_Read16 | A_Read0 | ak(ak_##x##_IMM); \ - const u32 A_##x##_REG_LSL_IMM = A_Read16 | A_Read0 | ak(ak_##x##_REG_LSL_IMM); \ - const u32 A_##x##_REG_LSR_IMM = A_Read16 | A_Read0 | ak(ak_##x##_REG_LSR_IMM); \ - const u32 A_##x##_REG_ASR_IMM = A_Read16 | A_Read0 | ak(ak_##x##_REG_ASR_IMM); \ - const u32 A_##x##_REG_ROR_IMM = A_Read16 | A_Read0 | ak(ak_##x##_REG_ROR_IMM); \ - const u32 A_##x##_REG_LSL_REG = A_Read16 | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG); \ - const u32 A_##x##_REG_LSR_REG = A_Read16 | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG); \ - const u32 A_##x##_REG_ASR_REG = A_Read16 | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG); \ - const u32 A_##x##_REG_ROR_REG = A_Read16 | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG); - -A_IMPLEMENT_ALU_TEST(TST) -A_IMPLEMENT_ALU_TEST(TEQ) -A_IMPLEMENT_ALU_TEST(CMP) -A_IMPLEMENT_ALU_TEST(CMN) - 
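A minimal sketch of what the ARITH/LOGIC split above encodes (illustrative only; FlagUse and ClassifyALU are made-up names, and the flag bit values follow the flag_N/flag_Z/flag_C/flag_V enum introduced later in this commit): arithmetic ops produce C and V in the ALU, logical ops can only pick up a carry from the shifter, and ADC/SBC/RSC additionally read the incoming carry.

    #include <cstdint>
    #include <cstdio>

    enum { flagN = 1 << 3, flagZ = 1 << 2, flagC = 1 << 1, flagV = 1 << 0 };

    struct FlagUse { uint8_t reads, writes; };

    // decode-time classification of a data processing instruction:
    // 'arith' stands in for A_ARITH/A_SetCV, 'readsC' for A_ReadC
    FlagUse ClassifyALU(bool sBit, bool arith, bool readsC)
    {
        FlagUse u = {0, 0};
        if (readsC)                        // ADC, SBC, RSC consume the carry
            u.reads |= flagC;
        if (sBit)
        {
            u.writes |= flagN | flagZ;     // every S-suffixed op sets N and Z
            if (arith)
                u.writes |= flagC | flagV; // ADD/SUB family: real C and V
            // logical ops only *might* set C via the shifter (A_SetMaybeC)
        }
        return u;
    }

    int main()
    {
        FlagUse adcs = ClassifyALU(true, true, true);   // ADCS
        FlagUse ands = ClassifyALU(true, false, false); // ANDS
        printf("ADCS reads %x writes %x, ANDS writes %x\n",
               adcs.reads, adcs.writes, ands.writes);
        return 0;
    }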
-const u32 A_MUL = A_Write16 | A_Read0 | A_Read8 | ak(ak_MUL); -const u32 A_MLA = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_MLA); -const u32 A_UMULL = A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_UMULL); -const u32 A_UMLAL = A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_UMLAL); -const u32 A_SMULL = A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_SMULL); -const u32 A_SMLAL = A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLAL); +#define A_IMPLEMENT_ALU_TEST(x,a) \ + const u32 A_##x##_IMM = A_SetNZ | A_Read16 | A_##a | A_Read0 | ak(ak_##x##_IMM); \ + const u32 A_##x##_REG_LSL_IMM = A_SetNZ | A_Read16 | A_##a | A_Read0 | ak(ak_##x##_REG_LSL_IMM); \ + const u32 A_##x##_REG_LSR_IMM = A_SetNZ | A_Read16 | A_##a | A_Read0 | ak(ak_##x##_REG_LSR_IMM); \ + const u32 A_##x##_REG_ASR_IMM = A_SetNZ | A_Read16 | A_##a | A_Read0 | ak(ak_##x##_REG_ASR_IMM); \ + const u32 A_##x##_REG_ROR_IMM = A_RRXReadC | A_SetNZ | A_Read16 | A_##a | A_Read0 | ak(ak_##x##_REG_ROR_IMM); \ + const u32 A_##x##_REG_LSL_REG = A_SetNZ | A_Read16 | A_##a | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSL_REG); \ + const u32 A_##x##_REG_LSR_REG = A_SetNZ | A_Read16 | A_##a | A_Read0 | A_Read8 | ak(ak_##x##_REG_LSR_REG); \ + const u32 A_##x##_REG_ASR_REG = A_SetNZ | A_Read16 | A_##a | A_Read0 | A_Read8 | ak(ak_##x##_REG_ASR_REG); \ + const u32 A_##x##_REG_ROR_REG = A_SetNZ | A_Read16 | A_##a | A_Read0 | A_Read8 | ak(ak_##x##_REG_ROR_REG); + +A_IMPLEMENT_ALU_TEST(TST,LOGIC) +A_IMPLEMENT_ALU_TEST(TEQ,LOGIC) +A_IMPLEMENT_ALU_TEST(CMP,ARITH) +A_IMPLEMENT_ALU_TEST(CMN,ARITH) + +const u32 A_MUL = A_MulFlags | A_Write16 | A_Read0 | A_Read8 | ak(ak_MUL); +const u32 A_MLA = A_MulFlags | A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_MLA); +const u32 A_UMULL = A_MulFlags | A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_UMULL); +const u32 A_UMLAL = A_MulFlags | A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_UMLAL); +const u32 A_SMULL = A_MulFlags | A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_SMULL); +const u32 A_SMLAL = A_MulFlags | A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLAL); const u32 A_SMLAxy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLALxy); const u32 A_SMLAWy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLAWy); const u32 A_SMULWy = A_Write16 | A_Read0 | A_Read8 | ak(ak_SMULWy); @@ -161,7 +173,7 @@ const u32 A_SVC = A_BranchAlways | A_Link | ak(ak_SVC); // THUMB -#define tk(x) ((x) << 16) +#define tk(x) ((x) << 20) enum { T_Read0 = 1 << 0, @@ -183,42 +195,47 @@ enum { T_ReadR14 = 1 << 13, T_WriteR14 = 1 << 14, - T_PopPC = 1 << 15 + T_PopPC = 1 << 15, + + T_SetNZ = 1 << 16, + T_SetCV = 1 << 17, + T_SetMaybeC = 1 << 18, + T_ReadC = 1 << 19 }; -const u32 T_LSL_IMM = T_Write0 | T_Read3 | tk(tk_LSL_IMM); -const u32 T_LSR_IMM = T_Write0 | T_Read3 | tk(tk_LSR_IMM); -const u32 T_ASR_IMM = T_Write0 | T_Read3 | tk(tk_ASR_IMM); - -const u32 T_ADD_REG_ = T_Write0 | T_Read3 | T_Read6 | tk(tk_ADD_REG_); -const u32 T_SUB_REG_ = T_Write0 | T_Read3 | T_Read6 | tk(tk_SUB_REG_); -const u32 T_ADD_IMM_ = T_Write0 | T_Read3 | tk(tk_ADD_IMM_); -const u32 T_SUB_IMM_ = T_Write0 | T_Read3 | tk(tk_SUB_IMM_); - -const u32 T_MOV_IMM = T_Write8 | tk(tk_MOV_IMM); -const u32 T_CMP_IMM = T_Write8 | tk(tk_CMP_IMM); -const u32 T_ADD_IMM = T_Write8 | T_Read8 | tk(tk_ADD_IMM); -const u32 T_SUB_IMM = T_Write8 | T_Read8 | tk(tk_SUB_IMM); - -const u32 T_AND_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_AND_REG); -const u32 T_EOR_REG = 
T_Write0 | T_Read0 | T_Read3 | tk(tk_EOR_REG); -const u32 T_LSL_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_LSL_REG); -const u32 T_LSR_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_LSR_REG); -const u32 T_ASR_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_ASR_REG); -const u32 T_ADC_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_ADC_REG); -const u32 T_SBC_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_SBC_REG); -const u32 T_ROR_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_ROR_REG); -const u32 T_TST_REG = T_Read0 | T_Read3 | tk(tk_TST_REG); -const u32 T_NEG_REG = T_Write0 | T_Read3 | tk(tk_NEG_REG); -const u32 T_CMP_REG = T_Read0 | T_Read3 | tk(tk_CMP_REG); -const u32 T_CMN_REG = T_Read0 | T_Read3 | tk(tk_CMN_REG); -const u32 T_ORR_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_ORR_REG); -const u32 T_MUL_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_MUL_REG); -const u32 T_BIC_REG = T_Write0 | T_Read0 | T_Read3 | tk(tk_BIC_REG); -const u32 T_MVN_REG = T_Write0 | T_Read3 | tk(tk_MVN_REG); +const u32 T_LSL_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_LSL_IMM); +const u32 T_LSR_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_LSR_IMM); +const u32 T_ASR_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_ASR_IMM); + +const u32 T_ADD_REG_ = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | T_Read6 | tk(tk_ADD_REG_); +const u32 T_SUB_REG_ = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | T_Read6 | tk(tk_SUB_REG_); +const u32 T_ADD_IMM_ = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | tk(tk_ADD_IMM_); +const u32 T_SUB_IMM_ = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | tk(tk_SUB_IMM_); + +const u32 T_MOV_IMM = T_SetNZ | T_Write8 | tk(tk_MOV_IMM); +const u32 T_CMP_IMM = T_SetNZ | T_SetCV | T_Write8 | tk(tk_CMP_IMM); +const u32 T_ADD_IMM = T_SetNZ | T_SetCV | T_Write8 | T_Read8 | tk(tk_ADD_IMM); +const u32 T_SUB_IMM = T_SetNZ | T_SetCV | T_Write8 | T_Read8 | tk(tk_SUB_IMM); + +const u32 T_AND_REG = T_SetNZ | T_Write0 | T_Read0 | T_Read3 | tk(tk_AND_REG); +const u32 T_EOR_REG = T_SetNZ | T_Write0 | T_Read0 | T_Read3 | tk(tk_EOR_REG); +const u32 T_LSL_REG = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read0 | T_Read3 | tk(tk_LSL_REG); +const u32 T_LSR_REG = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read0 | T_Read3 | tk(tk_LSR_REG); +const u32 T_ASR_REG = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read0 | T_Read3 | tk(tk_ASR_REG); +const u32 T_ADC_REG = T_ReadC | T_SetNZ | T_SetCV | T_Write0 | T_Read0 | T_Read3 | tk(tk_ADC_REG); +const u32 T_SBC_REG = T_ReadC | T_SetNZ | T_SetCV | T_Write0 | T_Read0 | T_Read3 | tk(tk_SBC_REG); +const u32 T_ROR_REG = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read0 | T_Read3 | tk(tk_ROR_REG); +const u32 T_TST_REG = T_SetNZ | T_Read0 | T_Read3 | tk(tk_TST_REG); +const u32 T_NEG_REG = T_SetNZ | T_SetCV | T_Write0 | T_Read3 | tk(tk_NEG_REG); +const u32 T_CMP_REG = T_SetNZ | T_SetCV | T_Read0 | T_Read3 | tk(tk_CMP_REG); +const u32 T_CMN_REG = T_SetNZ | T_SetCV | T_Read0 | T_Read3 | tk(tk_CMN_REG); +const u32 T_ORR_REG = T_SetNZ | T_Write0 | T_Read0 | T_Read3 | tk(tk_ORR_REG); +const u32 T_MUL_REG = T_SetNZ | T_Write0 | T_Read0 | T_Read3 | tk(tk_MUL_REG); +const u32 T_BIC_REG = T_SetNZ | T_Write0 | T_Read0 | T_Read3 | tk(tk_BIC_REG); +const u32 T_MVN_REG = T_SetNZ | T_Write0 | T_Read3 | tk(tk_MVN_REG); const u32 T_ADD_HIREG = T_WriteHi0 | T_ReadHi0 | T_ReadHi3 | tk(tk_ADD_HIREG); -const u32 T_CMP_HIREG = T_ReadHi0 | T_ReadHi3 | tk(tk_CMP_HIREG); +const u32 T_CMP_HIREG = T_SetNZ | T_SetCV | T_ReadHi0 | T_ReadHi3 | tk(tk_CMP_HIREG); const u32 T_MOV_HIREG = T_WriteHi0 | T_ReadHi3 | tk(tk_MOV_HIREG); const u32 T_ADD_PCREL = 
T_Write8 | tk(tk_ADD_PCREL); @@ -268,10 +285,20 @@ const u32 T_SVC = T_BranchAlways | T_WriteR14 | tk(tk_SVC); Info Decode(bool thumb, u32 num, u32 instr) { + const u8 FlagsReadPerCond[7] = { + flag_Z, + flag_C, + flag_N, + flag_V, + flag_C | flag_Z, + flag_N | flag_V, + flag_Z | flag_N | flag_V}; + Info res = {0}; if (thumb) { u32 data = THUMBInstrTable[(instr >> 6) & 0x3FF]; + res.Kind = (data >> 20) & 0x3F; if (data & T_Read0) res.SrcRegs |= 1 << (instr & 0x7); @@ -309,7 +336,18 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & T_PopPC && instr & (1 << 8)) res.DstRegs |= 1 << 15; - res.Kind = (data >> 16) & 0x3F; + if (data & T_SetNZ) + res.WriteFlags |= flag_N | flag_Z; + if (data & T_SetCV) + res.WriteFlags |= flag_C | flag_V; + if (data & T_SetMaybeC) + res.WriteFlags |= flag_C << 4; + if (data & T_ReadC) + res.ReadFlags |= flag_C; + + if (res.Kind == tk_BCOND) + res.ReadFlags |= FlagsReadPerCond[(instr >> 9) & 0x7]; + res.EndBlock = res.Branches(); return res; @@ -323,7 +361,7 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & A_UnkOnARM7 && num != 0) data = A_UNK; - res.Kind = (data >> 13) & 0x1FF; + res.Kind = (data >> 18) & 0x1FF; if (res.Kind == ak_MCR) { @@ -382,6 +420,26 @@ Info Decode(bool thumb, u32 num, u32 instr) if (res.Kind == ak_LDM) res.DstRegs |= instr & (1 << 15); // this is right + if (data & A_SetNZ) + res.WriteFlags |= flag_N | flag_Z; + if (data & A_SetCV) + res.WriteFlags |= flag_C | flag_V; + if (data & A_SetMaybeC) + res.WriteFlags |= flag_C << 4; + if ((data & A_MulFlags) && (instr & (1 << 20))) + res.WriteFlags |= flag_N | flag_Z; + if (data & A_ReadC) + res.ReadFlags |= flag_C; + if ((data & A_RRXReadC) && !((instr >> 7) & 0x1F)) + res.ReadFlags |= flag_C; + + if ((instr >> 28) < 0xE) + { + // make non conditional flag sets conditional + res.WriteFlags = res.WriteFlags | (res.WriteFlags << 4); + res.ReadFlags |= FlagsReadPerCond[instr >> 29]; + } + res.EndBlock |= res.Branches(); return res; diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index 4fe9b10..5336837 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -215,11 +215,24 @@ enum tk_Count }; +enum +{ + flag_N = 1 << 3, + flag_Z = 1 << 2, + flag_C = 1 << 1, + flag_V = 1 << 0, +}; + struct Info { u16 DstRegs, SrcRegs; u16 Kind; + u8 ReadFlags; + // lower 4 bits - set always + // upper 4 bits - might set flag + u8 WriteFlags; + bool EndBlock; bool Branches() { diff --git a/src/libui_sdl/main.cpp b/src/libui_sdl/main.cpp index 0066668..c3db88d 100644 --- a/src/libui_sdl/main.cpp +++ b/src/libui_sdl/main.cpp @@ -2675,6 +2675,8 @@ void RecreateMainWindow(bool opengl) int main(int argc, char** argv) { + freopen("miauz.txt", "w", stdout); + srand(time(NULL)); printf("melonDS " MELONDS_VERSION "\n"); -- cgit v1.2.3 From ea562d2fec9f4ab73e9ff3f519ff5ecb65736cd7 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sun, 25 Aug 2019 13:06:27 +0200 Subject: fixes for flag optimisation --- src/ARMJIT.cpp | 1 + src/ARMJIT_x64/ARMJIT_ALU.cpp | 2 +- src/ARM_InstrInfo.cpp | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) (limited to 'src/ARMJIT_x64/ARMJIT_ALU.cpp') diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 3b6bc2e..5d92e47 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -163,6 +163,7 @@ CompiledBlock CompileBlock(ARM* cpu) { r15 += thumb ? 
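/* note: r15 deliberately runs ahead here - the instruction being recorded
   sees PC = its own address + 2 instruction widths, matching the ARM
   prefetch pipeline */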
2 : 4; + instrs[i].SetFlags = 0; instrs[i].Instr = nextInstr[0]; instrs[i].NextInstr[0] = nextInstr[0] = nextInstr[1]; diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index 6a7d711..f868ddf 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -387,7 +387,7 @@ void Compiler::Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed) AND(32, R(RCPSR), Imm32(0x3FFFFFFF & ~(carryUsed << 29) & ~((retriveCV ? 3 : 0) << 28))); OR(32, R(RCPSR), R(RSCRATCH)); } - else + else if (carryUsed || retriveCV) { SHL(32, R(RSCRATCH2), Imm8(carryOnly ? 29 : 28)); AND(32, R(RCPSR), Imm32(0xFFFFFFFF & ~(carryUsed << 29) & ~((retriveCV ? 3 : 0) << 28))); diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index ea6d827..3634c35 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -436,7 +436,7 @@ Info Decode(bool thumb, u32 num, u32 instr) if ((instr >> 28) < 0xE) { // make non conditional flag sets conditional - res.WriteFlags = res.WriteFlags | (res.WriteFlags << 4); + res.WriteFlags = (res.WriteFlags | (res.WriteFlags << 4)) & 0xF0; res.ReadFlags |= FlagsReadPerCond[instr >> 29]; } -- cgit v1.2.3 From a687be9879e5cab4ea5d8646c8cf47c214b18856 Mon Sep 17 00:00:00 2001 From: RSDuck Date: Thu, 3 Oct 2019 01:10:59 +0200 Subject: new block cache and much more... - more reliable code invalidation detection - blocks aren't stopped at any branch, but are being followed if possible to get larger blocks - idle loop recognition - optimised literal loads, load/store cycle counting and loads/stores from constant addresses --- src/ARM.cpp | 44 ++- src/ARM.h | 16 +- src/ARMInterpreter.h | 9 + src/ARMJIT.cpp | 755 ++++++++++++++++++++++++++++++------ src/ARMJIT.h | 141 ++----- src/ARMJIT_Internal.h | 198 ++++++++++ src/ARMJIT_RegisterCache.h | 36 +- src/ARMJIT_x64/ARMJIT_ALU.cpp | 16 +- src/ARMJIT_x64/ARMJIT_Branch.cpp | 43 +- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 184 +++++++-- src/ARMJIT_x64/ARMJIT_Compiler.h | 51 ++- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 629 ++++++++++++++---------------- src/ARM_InstrInfo.cpp | 47 ++- src/ARM_InstrInfo.h | 11 +- src/CP15.cpp | 12 +- src/Config.cpp | 2 + src/Config.h | 1 + src/NDS.cpp | 22 +- src/libui_sdl/DlgEmuSettings.cpp | 22 +- 19 files changed, 1550 insertions(+), 689 deletions(-) create mode 100644 src/ARMJIT_Internal.h (limited to 'src/ARMJIT_x64/ARMJIT_ALU.cpp') diff --git a/src/ARM.cpp b/src/ARM.cpp index e404943..423c940 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -580,21 +580,26 @@ void ARMv5::ExecuteJIT() return; } - ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock<0>(instrAddr); - Cycles += (block ? block : ARMJIT::CompileBlock(this))(); + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<0>(instrAddr); + if (block) + Cycles += block(); + else + ARMJIT::CompileBlock(this); + + NDS::ARM9Timestamp += Cycles; + Cycles = 0; + if (IRQ) TriggerIRQ(); if (Halted) { - if (Halted == 1 && NDS::ARM9Timestamp < NDS::ARM9Target) + bool idleLoop = Halted & 0x20; + Halted &= ~0x20; + if ((Halted == 1 || idleLoop) && NDS::ARM9Timestamp < NDS::ARM9Target) { NDS::ARM9Timestamp = NDS::ARM9Target; } break; } - if (IRQ) TriggerIRQ(); - - NDS::ARM9Timestamp += Cycles; - Cycles = 0; } if (Halted == 2) @@ -710,23 +715,28 @@ void ARMv4::ExecuteJIT() printf("ARMv4 PC in non executable region %08X\n", R[15]); return; } - ARMJIT::CompiledBlock block = ARMJIT::LookUpBlock<1>(instrAddr); - Cycles += (block ? 
block : ARMJIT::CompileBlock(this))(); + + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock<1>(instrAddr); + if (block) + Cycles += block(); + else + ARMJIT::CompileBlock(this); + + NDS::ARM7Timestamp += Cycles; + Cycles = 0; // TODO optimize this shit!!! + if (IRQ) TriggerIRQ(); if (Halted) { - if (Halted == 1 && NDS::ARM7Timestamp < NDS::ARM7Target) + bool idleLoop = Halted & 0x20; + Halted &= ~0x20; + if ((Halted == 1 || idleLoop) && NDS::ARM7Timestamp < NDS::ARM7Target) { NDS::ARM7Timestamp = NDS::ARM7Target; } break; } - - if (IRQ) TriggerIRQ(); - - NDS::ARM7Timestamp += Cycles; - Cycles = 0; } if (Halted == 2) @@ -736,6 +746,8 @@ void ARMv4::ExecuteJIT() void ARMv5::FillPipeline() { + SetupCodeMem(R[15]); + if (CPSR & 0x20) { if ((R[15] - 2) & 0x2) @@ -758,6 +770,8 @@ void ARMv5::FillPipeline() void ARMv4::FillPipeline() { + SetupCodeMem(R[15]); + if (CPSR & 0x20) { NextInstr[0] = CodeRead16(R[15] - 2); diff --git a/src/ARM.h b/src/ARM.h index 4d387bc..8a01068 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -299,7 +299,7 @@ public: { *val = NDS::ARM7Read8(addr); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataRead16(u32 addr, u32* val) @@ -308,7 +308,7 @@ public: *val = NDS::ARM7Read16(addr); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataRead32(u32 addr, u32* val) @@ -317,7 +317,7 @@ public: *val = NDS::ARM7Read32(addr); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][2]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } void DataRead32S(u32 addr, u32* val) @@ -325,14 +325,14 @@ public: addr &= ~3; *val = NDS::ARM7Read32(addr); - DataCycles += NDS::ARM7MemTimings[DataRegion][3]; + DataCycles += NDS::ARM7MemTimings[addr >> 15][3]; } void DataWrite8(u32 addr, u8 val) { NDS::ARM7Write8(addr, val); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataWrite16(u32 addr, u16 val) @@ -341,7 +341,7 @@ public: NDS::ARM7Write16(addr, val); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][0]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][0]; } void DataWrite32(u32 addr, u32 val) @@ -350,7 +350,7 @@ public: NDS::ARM7Write32(addr, val); DataRegion = addr >> 24; - DataCycles = NDS::ARM7MemTimings[DataRegion][2]; + DataCycles = NDS::ARM7MemTimings[addr >> 15][2]; } void DataWrite32S(u32 addr, u32 val) @@ -358,7 +358,7 @@ public: addr &= ~3; NDS::ARM7Write32(addr, val); - DataCycles += NDS::ARM7MemTimings[DataRegion][3]; + DataCycles += NDS::ARM7MemTimings[addr >> 15][3]; } diff --git a/src/ARMInterpreter.h b/src/ARMInterpreter.h index 7244238..2bf8167 100644 --- a/src/ARMInterpreter.h +++ b/src/ARMInterpreter.h @@ -28,6 +28,15 @@ namespace ARMInterpreter extern void (*ARMInstrTable[4096])(ARM* cpu); extern void (*THUMBInstrTable[1024])(ARM* cpu); +void A_MSR_IMM(ARM* cpu); +void A_MSR_REG(ARM* cpu); +void A_MRS(ARM* cpu); +void A_MCR(ARM* cpu); +void A_MRC(ARM* cpu); +void A_SVC(ARM* cpu); + +void T_SVC(ARM* cpu); + void A_BLX_IMM(ARM* cpu); // I'm a special one look at me } diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 85cadf3..686bdd6 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -1,122 +1,137 @@ #include "ARMJIT.h" #include <string.h> +#include <assert.h> #include "Config.h" +#include "ARMJIT_Internal.h" #include "ARMJIT_x64/ARMJIT_Compiler.h" +#include "ARMInterpreter_ALU.h" +#include
"ARMInterpreter_LoadStore.h" +#include "ARMInterpreter_Branch.h" +#include "ARMInterpreter.h" + +#include "GPU3D.h" +#include "SPU.h" +#include "Wifi.h" + namespace ARMJIT { +#define JIT_DEBUGPRINT(msg, ...) + Compiler* compiler; -BlockCache cache; -#define DUP2(x) x, x +const u32 ExeMemRegionSizes[] = { + 0x8000, // Unmapped Region (dummy) + 0x8000, // ITCM + 4*1024*1024, // Main RAM + 0x8000, // SWRAM + 0xA4000, // LCDC + 0x8000, // ARM9 BIOS + 0x4000, // ARM7 BIOS + 0x10000, // ARM7 WRAM + 0x40000 // ARM7 WVRAM +}; -static ptrdiff_t JIT_MEM[2][32] = { - //arm9 - { - /* 0X*/ DUP2(offsetof(BlockCache, ARM9_ITCM)), - /* 1X*/ DUP2(offsetof(BlockCache, ARM9_ITCM)), // mirror - /* 2X*/ DUP2(offsetof(BlockCache, MainRAM)), - /* 3X*/ DUP2(offsetof(BlockCache, SWRAM)), - /* 4X*/ DUP2(-1), - /* 5X*/ DUP2(-1), - /* 6X*/ -1, - offsetof(BlockCache, ARM9_LCDC), // Plain ARM9-CPU Access (LCDC mode) (max 656KB) - /* 7X*/ DUP2(-1), - /* 8X*/ DUP2(-1), - /* 9X*/ DUP2(-1), - /* AX*/ DUP2(-1), - /* BX*/ DUP2(-1), - /* CX*/ DUP2(-1), - /* DX*/ DUP2(-1), - /* EX*/ DUP2(-1), - /* FX*/ DUP2(offsetof(BlockCache, ARM9_BIOS)) - }, - //arm7 - { - /* 0X*/ DUP2(offsetof(BlockCache, ARM7_BIOS)), - /* 1X*/ DUP2(-1), - /* 2X*/ DUP2(offsetof(BlockCache, MainRAM)), - /* 3X*/ offsetof(BlockCache, SWRAM), - offsetof(BlockCache, ARM7_WRAM), - /* 4X*/ DUP2(-1), - /* 5X*/ DUP2(-1), - /* 6X*/ DUP2(offsetof(BlockCache, ARM7_WVRAM)), /* contrary to Gbatek, melonDS and itself, - DeSmuME doesn't mirror the 64 MB region at 0x6800000 */ - /* 7X*/ DUP2(-1), - /* 8X*/ DUP2(-1), - /* 9X*/ DUP2(-1), - /* AX*/ DUP2(-1), - /* BX*/ DUP2(-1), - /* CX*/ DUP2(-1), - /* DX*/ DUP2(-1), - /* EX*/ DUP2(-1), - /* FX*/ DUP2(-1) - } +const u32 ExeMemRegionOffsets[] = { + 0, + 0x8000, + 0x10000, + 0x410000, + 0x418000, + 0x4BC000, + 0x4C4000, + 0x4C8000, + 0x4D8000, + 0x518000, }; -static u32 JIT_MASK[2][32] = { +#define DUP2(x) x, x + +const static ExeMemKind JIT_MEM[2][32] = { //arm9 { - /* 0X*/ DUP2(0x00007FFF), - /* 1X*/ DUP2(0x00007FFF), - /* 2X*/ DUP2(0x003FFFFF), - /* 3X*/ DUP2(0x00007FFF), - /* 4X*/ DUP2(0x00000000), - /* 5X*/ DUP2(0x00000000), - /* 6X*/ 0x00000000, - 0x000FFFFF, - /* 7X*/ DUP2(0x00000000), - /* 8X*/ DUP2(0x00000000), - /* 9X*/ DUP2(0x00000000), - /* AX*/ DUP2(0x00000000), - /* BX*/ DUP2(0x00000000), - /* CX*/ DUP2(0x00000000), - /* DX*/ DUP2(0x00000000), - /* EX*/ DUP2(0x00000000), - /* FX*/ DUP2(0x00007FFF) + /* 0X*/ DUP2(exeMem_ITCM), + /* 1X*/ DUP2(exeMem_ITCM), // mirror + /* 2X*/ DUP2(exeMem_MainRAM), + /* 3X*/ DUP2(exeMem_SWRAM), + /* 4X*/ DUP2(exeMem_Unmapped), + /* 5X*/ DUP2(exeMem_Unmapped), + /* 6X*/ exeMem_Unmapped, + exeMem_LCDC, // Plain ARM9-CPU Access (LCDC mode) (max 656KB) + /* 7X*/ DUP2(exeMem_Unmapped), + /* 8X*/ DUP2(exeMem_Unmapped), + /* 9X*/ DUP2(exeMem_Unmapped), + /* AX*/ DUP2(exeMem_Unmapped), + /* BX*/ DUP2(exeMem_Unmapped), + /* CX*/ DUP2(exeMem_Unmapped), + /* DX*/ DUP2(exeMem_Unmapped), + /* EX*/ DUP2(exeMem_Unmapped), + /* FX*/ DUP2(exeMem_ARM9_BIOS) }, //arm7 { - /* 0X*/ DUP2(0x00003FFF), - /* 1X*/ DUP2(0x00000000), - /* 2X*/ DUP2(0x003FFFFF), - /* 3X*/ 0x00007FFF, - 0x0000FFFF, - /* 4X*/ 0x00000000, - 0x0000FFFF, - /* 5X*/ DUP2(0x00000000), - /* 6X*/ DUP2(0x0003FFFF), - /* 7X*/ DUP2(0x00000000), - /* 8X*/ DUP2(0x00000000), - /* 9X*/ DUP2(0x00000000), - /* AX*/ DUP2(0x00000000), - /* BX*/ DUP2(0x00000000), - /* CX*/ DUP2(0x00000000), - /* DX*/ DUP2(0x00000000), - /* EX*/ DUP2(0x00000000), - /* FX*/ DUP2(0x00000000) + /* 0X*/ DUP2(exeMem_ARM7_BIOS), + /* 1X*/ DUP2(exeMem_Unmapped), + /* 2X*/ 
DUP2(exeMem_MainRAM), + /* 3X*/ exeMem_SWRAM, + exeMem_ARM7_WRAM, + /* 4X*/ DUP2(exeMem_Unmapped), + /* 5X*/ DUP2(exeMem_Unmapped), + /* 6X*/ DUP2(exeMem_ARM7_WVRAM), /* contrary to Gbatek, melonDS and itself, + DeSmuME doesn't mirror the 64 MB region at 0x6800000 */ + /* 7X*/ DUP2(exeMem_Unmapped), + /* 8X*/ DUP2(exeMem_Unmapped), + /* 9X*/ DUP2(exeMem_Unmapped), + /* AX*/ DUP2(exeMem_Unmapped), + /* BX*/ DUP2(exeMem_Unmapped), + /* CX*/ DUP2(exeMem_Unmapped), + /* DX*/ DUP2(exeMem_Unmapped), + /* EX*/ DUP2(exeMem_Unmapped), + /* FX*/ DUP2(exeMem_Unmapped) } }; #undef DUP2 +/* + translates address to pseudo physical address + - more compact, eliminates mirroring, everything comes in a row + - we only need one translation table +*/ +u32 AddrTranslate9[0x2000]; +u32 AddrTranslate7[0x4000]; -void Init() +JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2]; +AddressRange CodeRanges[ExeMemSpaceSize / 256]; + +TinyVector<JitBlock*> JitBlocks; +JitBlock* RestoreCandidates[0x1000] = {NULL}; + +u32 HashRestoreCandidate(u32 pseudoPhysicalAddr) { - memset(&cache, 0, sizeof(BlockCache)); + return (u32)(((u64)pseudoPhysicalAddr * 11400714819323198485llu) >> 53); +} +void Init() +{ for (int i = 0; i < 0x2000; i++) - cache.AddrMapping9[i] = JIT_MEM[0][i >> 8] == -1 ? NULL : - (CompiledBlock*)((u8*)&cache + JIT_MEM[0][i >> 8]) - + (((i << 15) & JIT_MASK[0][i >> 8]) >> 1); + { + ExeMemKind kind = JIT_MEM[0][i >> 8]; + u32 size = ExeMemRegionSizes[kind]; + + AddrTranslate9[i] = ExeMemRegionOffsets[kind] + ((i << 15) & (size - 1)); + } for (int i = 0; i < 0x4000; i++) - cache.AddrMapping7[i] = JIT_MEM[1][i >> 9] == -1 ? NULL : - (CompiledBlock*)((u8*)&cache + JIT_MEM[1][i >> 9]) - + (((i << 14) & JIT_MASK[1][i >> 9]) >> 1); + { + ExeMemKind kind = JIT_MEM[1][i >> 9]; + u32 size = ExeMemRegionSizes[kind]; + + AddrTranslate7[i] = ExeMemRegionOffsets[kind] + ((i << 14) & (size - 1)); + } compiler = new Compiler(); } @@ -126,7 +141,7 @@ void DeInit() delete compiler; } -void floodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) +void FloodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) { for (int j = start; j >= 0; j--) { @@ -144,7 +159,154 @@ void floodFillSetFlags(FetchedInstr instrs[], int start, u8 flags) } } -CompiledBlock CompileBlock(ARM* cpu) +bool DecodeBranch(bool thumb, const FetchedInstr& instr, u32& cond, u32& targetAddr) +{ + if (thumb) + { + u32 r15 = instr.Addr + 4; + cond = 0xE; + + if (instr.Info.Kind == ARMInstrInfo::tk_BL_LONG && !(instr.Instr & (1 << 12))) + { + targetAddr = r15 + ((s32)((instr.Instr & 0x7FF) << 21) >> 9); + targetAddr += ((instr.Instr >> 16) & 0x7FF) << 1; + return true; + } + else if (instr.Info.Kind == ARMInstrInfo::tk_B) + { + s32 offset = (s32)((instr.Instr & 0x7FF) << 21) >> 20; + targetAddr = r15 + offset; + return true; + } + else if (instr.Info.Kind == ARMInstrInfo::tk_BCOND) + { + cond = (instr.Instr >> 8) & 0xF; + s32 offset = (s32)(instr.Instr << 24) >> 23; + targetAddr = r15 + offset; + return true; + } + } + else + { + cond = instr.Cond(); + if (instr.Info.Kind == ARMInstrInfo::ak_BL + || instr.Info.Kind == ARMInstrInfo::ak_B) + { + s32 offset = (s32)(instr.Instr << 8) >> 6; + u32 r15 = instr.Addr + 8; + targetAddr = r15 + offset; + return true; + } + } + return false; +} + +bool IsIdleLoop(FetchedInstr* instrs, int instrsCount) +{ + // see https://github.com/dolphin-emu/dolphin/blob/master/Source/Core/Core/PowerPC/PPCAnalyst.cpp#L678 + // it basically checks if one iteration of a loop depends on another + // the rules are quite simple + + u16
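/* worked example (illustrative): a typical idle loop such as
       loop: ldr r0, [r1]
             cmp r0, #0
             beq loop
   passes this check - nothing is stored to memory and no register that an
   earlier instruction reads (r1) is overwritten later, so one iteration
   cannot feed the next and the loop can be fast-forwarded */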
regsWrittenTo = 0; + u16 regsDisallowedToWrite = 0; + for (int i = 0; i < instrsCount; i++) + { + //printf("instr %d %x regs(%x %x) %x %x\n", i, instrs[i].Instr, instrs[i].Info.DstRegs, instrs[i].Info.SrcRegs, regsWrittenTo, regsDisallowedToWrite); + if (instrs[i].Info.SpecialKind == ARMInstrInfo::special_WriteMem) + return false; + if (i < instrsCount - 1 && instrs[i].Info.Branches()) + return false; + + u16 srcRegs = instrs[i].Info.SrcRegs & ~(1 << 15); + u16 dstRegs = instrs[i].Info.DstRegs & ~(1 << 15); + + regsDisallowedToWrite |= srcRegs & ~regsWrittenTo; + + if (dstRegs & regsDisallowedToWrite) + return false; + regsWrittenTo |= dstRegs; + } + return true; +} + +typedef void (*InterpreterFunc)(ARM* cpu); + +#define F(x) &ARMInterpreter::A_##x +#define F_ALU(name, s) \ + F(name##_REG_LSL_IMM##s), F(name##_REG_LSR_IMM##s), F(name##_REG_ASR_IMM##s), F(name##_REG_ROR_IMM##s), \ + F(name##_REG_LSL_REG##s), F(name##_REG_LSR_REG##s), F(name##_REG_ASR_REG##s), F(name##_REG_ROR_REG##s), F(name##_IMM##s) +#define F_MEM_WB(name) \ + F(name##_REG_LSL), F(name##_REG_LSR), F(name##_REG_ASR), F(name##_REG_ROR), F(name##_IMM), \ + F(name##_POST_REG_LSL), F(name##_POST_REG_LSR), F(name##_POST_REG_ASR), F(name##_POST_REG_ROR), F(name##_POST_IMM) +#define F_MEM_HD(name) \ + F(name##_REG), F(name##_IMM), F(name##_POST_REG), F(name##_POST_IMM) +InterpreterFunc InterpretARM[ARMInstrInfo::ak_Count] = +{ + F_ALU(AND,), F_ALU(AND,_S), + F_ALU(EOR,), F_ALU(EOR,_S), + F_ALU(SUB,), F_ALU(SUB,_S), + F_ALU(RSB,), F_ALU(RSB,_S), + F_ALU(ADD,), F_ALU(ADD,_S), + F_ALU(ADC,), F_ALU(ADC,_S), + F_ALU(SBC,), F_ALU(SBC,_S), + F_ALU(RSC,), F_ALU(RSC,_S), + F_ALU(ORR,), F_ALU(ORR,_S), + F_ALU(MOV,), F_ALU(MOV,_S), + F_ALU(BIC,), F_ALU(BIC,_S), + F_ALU(MVN,), F_ALU(MVN,_S), + F_ALU(TST,), + F_ALU(TEQ,), + F_ALU(CMP,), + F_ALU(CMN,), + + F(MUL), F(MLA), F(UMULL), F(UMLAL), F(SMULL), F(SMLAL), F(SMLAxy), F(SMLAWy), F(SMULWy), F(SMLALxy), F(SMULxy), + F(CLZ), F(QADD), F(QDADD), F(QSUB), F(QDSUB), + + F_MEM_WB(STR), + F_MEM_WB(STRB), + F_MEM_WB(LDR), + F_MEM_WB(LDRB), + + F_MEM_HD(STRH), + F_MEM_HD(LDRD), + F_MEM_HD(STRD), + F_MEM_HD(LDRH), + F_MEM_HD(LDRSB), + F_MEM_HD(LDRSH), + + F(SWP), F(SWPB), + F(LDM), F(STM), + + F(B), F(BL), F(BLX_IMM), F(BX), F(BLX_REG), + F(UNK), F(MSR_IMM), F(MSR_REG), F(MRS), F(MCR), F(MRC), F(SVC) +}; +#undef F_ALU +#undef F_MEM_WB +#undef F_MEM_HD +#undef F + +#define F(x) ARMInterpreter::T_##x +InterpreterFunc InterpretTHUMB[ARMInstrInfo::tk_Count] = +{ + F(LSL_IMM), F(LSR_IMM), F(ASR_IMM), + F(ADD_REG_), F(SUB_REG_), F(ADD_IMM_), F(SUB_IMM_), + F(MOV_IMM), F(CMP_IMM), F(ADD_IMM), F(SUB_IMM), + F(AND_REG), F(EOR_REG), F(LSL_REG), F(LSR_REG), F(ASR_REG), + F(ADC_REG), F(SBC_REG), F(ROR_REG), F(TST_REG), F(NEG_REG), + F(CMP_REG), F(CMN_REG), F(ORR_REG), F(MUL_REG), F(BIC_REG), F(MVN_REG), + F(ADD_HIREG), F(CMP_HIREG), F(MOV_HIREG), + F(ADD_PCREL), F(ADD_SPREL), F(ADD_SP), + F(LDR_PCREL), F(STR_REG), F(STRB_REG), F(LDR_REG), F(LDRB_REG), F(STRH_REG), + F(LDRSB_REG), F(LDRH_REG), F(LDRSH_REG), F(STR_IMM), F(LDR_IMM), F(STRB_IMM), + F(LDRB_IMM), F(STRH_IMM), F(LDRH_IMM), F(STR_SPREL), F(LDR_SPREL), + F(PUSH), F(POP), F(LDMIA), F(STMIA), + F(BCOND), F(BX), F(BLX_REG), F(B), F(BL_LONG_1), F(BL_LONG_2), + F(UNK), F(SVC), + NULL // BL_LONG psudo opcode +}; +#undef F + +void CompileBlock(ARM* cpu) { bool thumb = cpu->CPSR & 0x20; @@ -153,17 +315,41 @@ CompiledBlock CompileBlock(ARM* cpu) if (Config::JIT_MaxBlockSize > 32) Config::JIT_MaxBlockSize = 32; + u32 blockAddr = cpu->R[15] - (thumb ? 
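/* R[15] has already been advanced past the fetched instruction by the
   pipeline, so the first instruction of the block sits one instruction
   width behind the current PC */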
2 : 4); + if (!(cpu->Num == 0 + ? IsMapped<0>(blockAddr) + : IsMapped<1>(blockAddr))) + { + printf("Trying to compile a block in unmapped memory: %x\n", blockAddr); + } + + u32 pseudoPhysicalAddr = cpu->Num == 0 + ? TranslateAddr<0>(blockAddr) + : TranslateAddr<1>(blockAddr); + FetchedInstr instrs[Config::JIT_MaxBlockSize]; int i = 0; - u32 blockAddr = cpu->R[15] - (thumb ? 2 : 4); u32 r15 = cpu->R[15]; + + u32 addresseRanges[32] = {}; + u32 numAddressRanges = 0; + cpu->FillPipeline(); u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; u32 nextInstrAddr[2] = {blockAddr, r15}; + + JIT_DEBUGPRINT("start block %x (%x) %p %p (region invalidates %dx)\n", + blockAddr, pseudoPhysicalAddr, FastBlockAccess[pseudoPhysicalAddr / 2], + cpu->Num == 0 ? LookUpBlock<0>(blockAddr) : LookUpBlock<1>(blockAddr), + CodeRanges[pseudoPhysicalAddr / 256].TimesInvalidated); + + u32 lastSegmentStart = blockAddr; + do { r15 += thumb ? 2 : 4; + instrs[i].BranchFlags = 0; instrs[i].SetFlags = 0; instrs[i].Instr = nextInstr[0]; instrs[i].NextInstr[0] = nextInstr[0] = nextInstr[1]; @@ -171,6 +357,25 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].Addr = nextInstrAddr[0]; nextInstrAddr[0] = nextInstrAddr[1]; nextInstrAddr[1] = r15; + JIT_DEBUGPRINT("instr %08x %x\n", instrs[i].Instr & (thumb ? 0xFFFF : ~0), instrs[i].Addr); + + u32 translatedAddr = (cpu->Num == 0 + ? TranslateAddr<0>(instrs[i].Addr) + : TranslateAddr<1>(instrs[i].Addr)) & ~0xFF; + if (i == 0 || translatedAddr != addresseRanges[numAddressRanges - 1]) + { + bool returning = false; + for (int j = 0; j < numAddressRanges; j++) + { + if (addresseRanges[j] == translatedAddr) + { + returning = true; + break; + } + } + if (!returning) + addresseRanges[numAddressRanges++] = translatedAddr; + } if (cpu->Num == 0) { @@ -198,6 +403,34 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i].NextInstr[1] = nextInstr[1]; instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr); + cpu->R[15] = r15; + cpu->CurInstr = instrs[i].Instr; + cpu->CodeCycles = instrs[i].CodeCycles; + + if (thumb) + { + InterpretTHUMB[instrs[i].Info.Kind](cpu); + } + else + { + if (cpu->Num == 0 && instrs[i].Info.Kind == ARMInstrInfo::ak_BLX_IMM) + { + ARMInterpreter::A_BLX_IMM(cpu); + } + else + { + u32 icode = ((instrs[i].Instr >> 4) & 0xF) | ((instrs[i].Instr >> 16) & 0xFF0); + assert(InterpretARM[instrs[i].Info.Kind] == ARMInterpreter::ARMInstrTable[icode] || instrs[i].Info.Kind == ARMInstrInfo::ak_MOV_REG_LSL_IMM); + if (cpu->CheckCondition(instrs[i].Cond())) + InterpretARM[instrs[i].Info.Kind](cpu); + else + cpu->AddCycles_C(); + } + } + + instrs[i].DataCycles = cpu->DataCycles; + instrs[i].DataRegion = cpu->DataRegion; + if (thumb && instrs[i].Info.Kind == ARMInstrInfo::tk_BL_LONG_2 && i > 0 && instrs[i - 1].Info.Kind == ARMInstrInfo::tk_BL_LONG_1) { @@ -208,40 +441,340 @@ CompiledBlock CompileBlock(ARM* cpu) instrs[i - 1].Info.EndBlock = true; i--; } - i++; + if (instrs[i].Info.Branches() && Config::JIT_BrancheOptimisations) + { + bool hasBranched = cpu->R[15] != r15; + + u32 cond, target; + bool staticBranch = DecodeBranch(thumb, instrs[i], cond, target); + JIT_DEBUGPRINT("branch cond %x target %x (%d)\n", cond, target, hasBranched); + + if (staticBranch) + { + bool isBackJump = false; + if (hasBranched) + { + for (int j = 0; j < i; j++) + { + if (instrs[i].Addr == target) + { + isBackJump = true; + break; + } + } + } + + if (cond < 0xE && target < instrs[i].Addr && target >= lastSegmentStart) + { + // we might have an idle loop + u32 offset = (target - blockAddr) / 
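/* byte distance converted into an index into instrs[], so the candidate
   loop body (target up to the current instruction) can be re-scanned by
   IsIdleLoop() */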
(thumb ? 2 : 4); + if (IsIdleLoop(instrs + offset, i - offset + 1)) + { + instrs[i].BranchFlags |= branch_IdleBranch; + JIT_DEBUGPRINT("found %s idle loop %d in block %x\n", thumb ? "thumb" : "arm", cpu->Num, blockAddr); + } + } + else if (hasBranched && (!thumb || cond == 0xE) && !isBackJump && i + 1 < Config::JIT_MaxBlockSize) + { + u32 targetPseudoPhysical = cpu->Num == 0 + ? TranslateAddr<0>(target) + : TranslateAddr<1>(target); + + r15 = target + (thumb ? 2 : 4); + assert(r15 == cpu->R[15]); + + JIT_DEBUGPRINT("block lengthened by static branch (target %x)\n", target); + + nextInstr[0] = cpu->NextInstr[0]; + nextInstr[1] = cpu->NextInstr[1]; + + nextInstrAddr[0] = target; + nextInstrAddr[1] = r15; + + lastSegmentStart = target; + + instrs[i].Info.EndBlock = false; + + if (cond < 0xE) + instrs[i].BranchFlags |= branch_FollowCondTaken; + } + } + + if (!hasBranched && cond < 0xE && i + 1 < Config::JIT_MaxBlockSize) + { + instrs[i].Info.EndBlock = false; + instrs[i].BranchFlags |= branch_FollowCondNotTaken; + } + } + + i++; bool canCompile = compiler->CanCompile(thumb, instrs[i - 1].Info.Kind); - if (instrs[i - 1].Info.ReadFlags != 0 || !canCompile) - floodFillSetFlags(instrs, i - 2, canCompile ? instrs[i - 1].Info.ReadFlags : 0xF); - } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize); + bool secondaryFlagReadCond = !canCompile || (instrs[i - 1].BranchFlags & (branch_FollowCondTaken | branch_FollowCondNotTaken)); + if (instrs[i - 1].Info.ReadFlags != 0 || secondaryFlagReadCond) + FloodFillSetFlags(instrs, i - 2, !secondaryFlagReadCond ? instrs[i - 1].Info.ReadFlags : 0xF); + } while(!instrs[i - 1].Info.EndBlock && i < Config::JIT_MaxBlockSize && !cpu->Halted); - floodFillSetFlags(instrs, i - 1, 0xF); + u32 restoreSlot = HashRestoreCandidate(pseudoPhysicalAddr); + JitBlock* prevBlock = RestoreCandidates[restoreSlot]; + bool mayRestore = true; + if (prevBlock && prevBlock->PseudoPhysicalAddr == pseudoPhysicalAddr) + { + RestoreCandidates[restoreSlot] = NULL; + if (prevBlock->NumInstrs == i) + { + for (int j = 0; j < i; j++) + { + if (prevBlock->Instrs()[j] != instrs[j].Instr) + { + mayRestore = false; + break; + } + } + } + else + mayRestore = false; - CompiledBlock block = compiler->CompileBlock(cpu, instrs, i); + if (prevBlock->NumAddresses == numAddressRanges) + { + for (int j = 0; j < numAddressRanges; j++) + { + if (prevBlock->AddressRanges()[j] != addresseRanges[j]) + { + mayRestore = false; + break; + } + } + } + else + mayRestore = false; + } + else + { + mayRestore = false; + prevBlock = NULL; + } - if (cpu->Num == 0) - InsertBlock<0>(blockAddr, block); + JitBlock* block; + if (!mayRestore) + { + if (prevBlock) + delete prevBlock; + + block = new JitBlock(i, numAddressRanges); + for (int j = 0; j < i; j++) + block->Instrs()[j] = instrs[j].Instr; + for (int j = 0; j < numAddressRanges; j++) + block->AddressRanges()[j] = addresseRanges[j]; + + block->StartAddr = blockAddr; + block->PseudoPhysicalAddr = pseudoPhysicalAddr; + + FloodFillSetFlags(instrs, i - 1, 0xF); + + block->EntryPoint = compiler->CompileBlock(cpu, thumb, instrs, i); + } else - InsertBlock<1>(blockAddr, block); + { + JIT_DEBUGPRINT("restored! 
%p\n", prevBlock); + block = prevBlock; + } + + for (int j = 0; j < numAddressRanges; j++) + { + assert(addresseRanges[j] == block->AddressRanges()[j]); + CodeRanges[addresseRanges[j] / 256].Blocks.Add(block); + } + + FastBlockAccess[block->PseudoPhysicalAddr / 2] = block->EntryPoint; - return block; + JitBlocks.Add(block); } -void InvalidateBlockCache() +void InvalidateByAddr(u32 pseudoPhysical) { - printf("Resetting JIT block cache...\n"); + JIT_DEBUGPRINT("invalidating by addr %x\n", pseudoPhysical); + AddressRange* range = &CodeRanges[pseudoPhysical / 256]; + int startLength = range->Blocks.Length; + for (int i = 0; i < range->Blocks.Length; i++) + { + assert(range->Blocks.Length == startLength); + JitBlock* block = range->Blocks[i]; + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + if ((addr / 256) != (pseudoPhysical / 256)) + { + AddressRange* otherRange = &CodeRanges[addr / 256]; + assert(otherRange != range); + assert(otherRange->Blocks.RemoveByValue(block)); + } + } + + assert(JitBlocks.RemoveByValue(block)); + + FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; - memset(cache.MainRAM, 0, sizeof(cache.MainRAM)); - memset(cache.SWRAM, 0, sizeof(cache.SWRAM)); - memset(cache.ARM9_BIOS, 0, sizeof(cache.ARM9_BIOS)); - memset(cache.ARM9_ITCM, 0, sizeof(cache.ARM9_ITCM)); - memset(cache.ARM9_LCDC, 0, sizeof(cache.ARM9_LCDC)); - memset(cache.ARM7_BIOS, 0, sizeof(cache.ARM7_BIOS)); - memset(cache.ARM7_WRAM, 0, sizeof(cache.ARM7_WRAM)); - memset(cache.ARM7_WVRAM, 0, sizeof(cache.ARM7_WVRAM)); + u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); + if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) + delete RestoreCandidates[slot]; + + RestoreCandidates[slot] = block; + } + if ((range->TimesInvalidated + 1) > range->TimesInvalidated) + range->TimesInvalidated++; + + range->Blocks.Clear(); +} + +void InvalidateByAddr7(u32 addr) +{ + u32 pseudoPhysical = TranslateAddr<1>(addr); + if (__builtin_expect(CodeRanges[pseudoPhysical / 256].Blocks.Length > 0, false)) + InvalidateByAddr(pseudoPhysical); +} + +void InvalidateITCM(u32 addr) +{ + u32 pseudoPhysical = addr + ExeMemRegionOffsets[exeMem_ITCM]; + if (CodeRanges[pseudoPhysical / 256].Blocks.Length > 0) + InvalidateByAddr(pseudoPhysical); +} + +void InvalidateAll() +{ + JIT_DEBUGPRINT("invalidating all %x\n", JitBlocks.Length); + for (int i = 0; i < JitBlocks.Length; i++) + { + JitBlock* block = JitBlocks[i]; + + FastBlockAccess[block->PseudoPhysicalAddr / 2] = NULL; + + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + AddressRange* range = &CodeRanges[addr / 256]; + range->Blocks.Clear(); + if (range->TimesInvalidated + 1 > range->TimesInvalidated) + range->TimesInvalidated++; + } + + u32 slot = HashRestoreCandidate(block->PseudoPhysicalAddr); + if (RestoreCandidates[slot] && RestoreCandidates[slot] != block) + delete RestoreCandidates[slot]; + + RestoreCandidates[slot] = block; + } + + JitBlocks.Clear(); +} + +void ResetBlockCache() +{ + printf("Resetting JIT block cache...\n"); + + memset(FastBlockAccess, 0, sizeof(FastBlockAccess)); + for (int i = 0; i < sizeof(RestoreCandidates)/sizeof(RestoreCandidates[0]); i++) + { + if (RestoreCandidates[i]) + { + delete RestoreCandidates[i]; + RestoreCandidates[i] = NULL; + } + } + for (int i = 0; i < JitBlocks.Length; i++) + { + JitBlock* block = JitBlocks[i]; + for (int j = 0; j < block->NumAddresses; j++) + { + u32 addr = block->AddressRanges()[j]; + CodeRanges[addr / 
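/* code ranges are tracked at 256 byte granularity, hence the /256; every
   range the block overlapped is cleared together with its invalidation
   counter */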
256].Blocks.Clear(); + CodeRanges[addr / 256].TimesInvalidated = 0; + } + delete block; + } + JitBlocks.Clear(); compiler->Reset(); } +void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) +{ + if (cpu->Num == 0) + { + if ((addr & 0xFF000000) == 0x04000000) + { + /* + unfortunately we can't map GPU2D this way + since it's hidden inside an object + + though GPU3D registers are accessed much more intensive + */ + if (addr >= 0x04000320 && addr < 0x040006A4) + { + switch (size | store) + { + case 8: return (void*)GPU3D::Read8; + case 9: return (void*)GPU3D::Write8; + case 16: return (void*)GPU3D::Read16; + case 17: return (void*)GPU3D::Write16; + case 32: return (void*)GPU3D::Read32; + case 33: return (void*)GPU3D::Write32; + } + } + + switch (size | store) + { + case 8: return (void*)NDS::ARM9IORead8; + case 9: return (void*)NDS::ARM9IOWrite8; + case 16: return (void*)NDS::ARM9IORead16; + case 17: return (void*)NDS::ARM9IOWrite16; + case 32: return (void*)NDS::ARM9IORead32; + case 33: return (void*)NDS::ARM9IOWrite32; + } + } + } + else + { + switch (addr & 0xFF800000) + { + case 0x04000000: + if (addr >= 0x04000400 && addr < 0x04000520) + { + switch (size | store) + { + case 8: return (void*)SPU::Read8; + case 9: return (void*)SPU::Write8; + case 16: return (void*)SPU::Read16; + case 17: return (void*)SPU::Write16; + case 32: return (void*)SPU::Read32; + case 33: return (void*)SPU::Write32; + } + } + + switch (size | store) + { + case 8: return (void*)NDS::ARM7IORead8; + case 9: return (void*)NDS::ARM7IOWrite8; + case 16: return (void*)NDS::ARM7IORead16; + case 17: return (void*)NDS::ARM7IOWrite16; + case 32: return (void*)NDS::ARM7IORead32; + case 33: return (void*)NDS::ARM7IOWrite32; + } + break; + case 0x04800000: + if (addr < 0x04810000 && size == 16) + { + if (store) + return (void*)Wifi::Write; + else + return (void*)Wifi::Read; + } + break; + } + } + return NULL; +} + } \ No newline at end of file diff --git a/src/ARMJIT.h b/src/ARMJIT.h index 7e448ef..1db4d66 100644 --- a/src/ARMJIT.h +++ b/src/ARMJIT.h @@ -9,142 +9,67 @@ namespace ARMJIT { -typedef u32 (*CompiledBlock)(); - -struct FetchedInstr +enum ExeMemKind { - u32 A_Reg(int pos) const - { - return (Instr >> pos) & 0xF; - } - - u32 T_Reg(int pos) const - { - return (Instr >> pos) & 0x7; - } - - u32 Cond() const - { - return Instr >> 28; - } - - u8 SetFlags; - u32 Instr; - u32 NextInstr[2]; - u32 Addr; - - u8 CodeCycles; - - ARMInstrInfo::Info Info; + exeMem_Unmapped = 0, + exeMem_ITCM, + exeMem_MainRAM, + exeMem_SWRAM, + exeMem_LCDC, + exeMem_ARM9_BIOS, + exeMem_ARM7_BIOS, + exeMem_ARM7_WRAM, + exeMem_ARM7_WVRAM, + exeMem_Count }; -/* - Copied from DeSmuME - Some names where changed to match the nomenclature of melonDS +extern const u32 ExeMemRegionOffsets[]; +extern const u32 ExeMemRegionSizes[]; - Since it's nowhere explained and atleast I needed some time to get behind it, - here's a summary on how it works: - more or less all memory locations from which code can be executed are - represented by an array of function pointers, which point to null or - a function which executes a block instructions starting from there. +typedef u32 (*JitBlockEntry)(); - The most significant 4 bits of each address is ignored. This 28 bit space is - divided into 0x2000 32 KB for ARM9 and 0x4000 16 KB for ARM7, each of which - a pointer to the relevant place inside the afore mentioned arrays. 32 and 16 KB - are the sizes of the smallest contigous memory region mapped to the respective CPU. 
- Because ARM addresses are always aligned to 4 bytes and Thumb to a 2 byte boundary, - we only need every second half word to be adressable. +extern u32 AddrTranslate9[0x2000]; +extern u32 AddrTranslate7[0x4000]; - In case a memory write hits mapped memory, the function block at this - address is set to null, so it's recompiled the next time it's executed. - - This method has disadvantages, namely that only writing to the - first instruction of a block marks it as invalid and that memory remapping - (SWRAM and VRAM) isn't taken into account. -*/ - -struct BlockCache -{ - CompiledBlock* AddrMapping9[0x2000] = {0}; - CompiledBlock* AddrMapping7[0x4000] = {0}; - - CompiledBlock MainRAM[4*1024*1024/2]; - CompiledBlock SWRAM[0x8000/2]; // Shared working RAM - CompiledBlock ARM9_ITCM[0x8000/2]; - CompiledBlock ARM9_LCDC[0xA4000/2]; - CompiledBlock ARM9_BIOS[0x8000/2]; - CompiledBlock ARM7_BIOS[0x4000/2]; - CompiledBlock ARM7_WRAM[0x10000/2]; // dedicated ARM7 WRAM - CompiledBlock ARM7_WVRAM[0x40000/2]; // VRAM allocated as Working RAM -}; - -extern BlockCache cache; +const u32 ExeMemSpaceSize = 0x518000; // I hate you C++, sometimes I really hate you... +extern JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2]; template <u32 num> inline bool IsMapped(u32 addr) { if (num == 0) - return cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15]; + return AddrTranslate9[(addr & 0xFFFFFFF) >> 15] >= ExeMemRegionSizes[exeMem_Unmapped]; else - return cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14]; + return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] >= ExeMemRegionSizes[exeMem_Unmapped]; } template <u32 num> -inline CompiledBlock LookUpBlock(u32 addr) +inline u32 TranslateAddr(u32 addr) { if (num == 0) - return cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1]; + return AddrTranslate9[(addr & 0xFFFFFFF) >> 15] + (addr & 0x7FFF); else - return cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1]; + return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] + (addr & 0x3FFF); } template <u32 num> -inline void Invalidate16(u32 addr) +inline JitBlockEntry LookUpBlock(u32 addr) { - if (IsMapped<num>(addr)) - { - if (num == 0) - cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1] = NULL; - else - cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = NULL; - } -} - -template <u32 num> -inline void Invalidate32(u32 addr) -{ - if (IsMapped<num>(addr)) - { - if (num == 0) - { - CompiledBlock* page = cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15]; - page[(addr & 0x7FFF) >> 1] = NULL; - page[((addr + 2) & 0x7FFF) >> 1] = NULL; - } - else - { - CompiledBlock* page = cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14]; - page[(addr & 0x3FFF) >> 1] = NULL; - page[((addr + 2) & 0x3FFF) >> 1] = NULL; - } - } -} - -template <u32 num> -inline void InsertBlock(u32 addr, CompiledBlock func) -{ - if (num == 0) - cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1] = func; - else - cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = func; + return FastBlockAccess[TranslateAddr<num>(addr) / 2]; } void Init(); void DeInit(); -CompiledBlock CompileBlock(ARM* cpu); +void InvalidateByAddr(u32 pseudoPhysical); +void InvalidateAll(); + +void InvalidateITCM(u32 addr); +void InvalidateByAddr7(u32 addr); + +void CompileBlock(ARM* cpu); -void InvalidateBlockCache(); +void ResetBlockCache(); } diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h new file mode 100644 index 0000000..4acb488 --- /dev/null +++ b/src/ARMJIT_Internal.h @@ -0,0 +1,198 @@ +#ifndef ARMJIT_INTERNAL_H +#define ARMJIT_INTERNAL_H + +#include "types.h" +#include <stdint.h> + 
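A short usage sketch of the pseudo physical translation declared in ARMJIT.h above (illustrative only: TranslateMainRAM9 is a made-up helper that hard-codes the main RAM case, with the 0x10000 offset and 4 MB size taken from the region tables earlier in this commit). The point is that every 4 MB mirror of main RAM collapses onto the same pseudo physical address, so a single flat array can back block lookup:

    #include <cstdint>
    #include <cstdio>

    const uint32_t mainRAMOffset = 0x10000;     // ExeMemRegionOffsets[exeMem_MainRAM]
    const uint32_t mainRAMSize   = 4*1024*1024; // ExeMemRegionSizes[exeMem_MainRAM]

    // the same computation AddrTranslate9 is filled with, restricted to
    // main RAM buckets: a per-32KB-bucket base plus the offset inside it
    uint32_t TranslateMainRAM9(uint32_t addr)
    {
        uint32_t bucket = (addr & 0xFFFFFFF) >> 15;
        uint32_t base = mainRAMOffset + ((bucket << 15) & (mainRAMSize - 1));
        return base + (addr & 0x7FFF);
    }

    int main()
    {
        // the canonical address and its 4 MB mirror land on the same spot
        printf("%08x\n", TranslateMainRAM9(0x02000004)); // 00010004
        printf("%08x\n", TranslateMainRAM9(0x02400004)); // 00010004
        return 0;
    }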
+#include "ARMJIT.h" + +// here lands everything which doesn't fit into ARMJIT.h +// where it would be included by pretty much everything +namespace ARMJIT +{ + +enum +{ + branch_IdleBranch = 1 << 0, + branch_FollowCondTaken = 1 << 1, + branch_FollowCondNotTaken = 1 << 2 +}; + +struct FetchedInstr +{ + u32 A_Reg(int pos) const + { + return (Instr >> pos) & 0xF; + } + + u32 T_Reg(int pos) const + { + return (Instr >> pos) & 0x7; + } + + u32 Cond() const + { + return Instr >> 28; + } + + u8 BranchFlags; + u8 SetFlags; + u32 Instr; + u32 NextInstr[2]; + u32 Addr; + + u8 CodeCycles; + u8 DataCycles; + u8 DataRegion; + + ARMInstrInfo::Info Info; +}; + +/* + TinyVector + - because reinventing the wheel is the best! + + - meant to be used very often, with not so many elements + max 1 << 16 elements + - doesn't allocate while no elements are inserted + - not stl confirmant of course + - probably only works with POD types + - remove operations don't preserve order, but O(1)! +*/ +template <typename T> +struct __attribute__((packed)) TinyVector +{ + T* Data = NULL; + u16 Capacity = 0; + u32 Length = 0; // make it 32 bit so we don't need movzx + + ~TinyVector() + { + delete[] Data; + } + + void MakeCapacity(u32 capacity) + { + assert(capacity <= UINT16_MAX); + assert(capacity > Capacity); + T* newMem = new T[capacity]; + if (Data != NULL) + memcpy(newMem, Data, sizeof(Data) * Length); + + T* oldData = Data; + Data = newMem; + if (oldData != NULL) + delete[] oldData; + + Capacity = capacity; + } + + void Clear() + { + Length = 0; + } + + void Add(T element) + { + assert(Length + 1 <= UINT16_MAX); + if (Length + 1 > Capacity) + MakeCapacity(((Capacity + 4) * 3) / 2); + + Data[Length++] = element; + } + + void Remove(int index) + { + assert(index >= 0 && index < Length); + + Length--; + Data[index] = Data[Length]; + /*for (int i = index; i < Length; i++) + Data[i] = Data[i + 1];*/ + } + + int Find(T needle) + { + for (int i = 0; i < Length; i++) + { + if (Data[i] == needle) + return i; + } + return -1; + } + + bool RemoveByValue(T needle) + { + for (int i = 0; i < Length; i++) + { + if (Data[i] == needle) + { + Remove(i); + return true; + } + } + return false; + } + + T& operator[](int index) + { + assert(index >= 0 && index < Length); + return Data[index]; + } +}; + +class JitBlock +{ +public: + JitBlock(u32 numInstrs, u32 numAddresses) + { + NumInstrs = numInstrs; + NumAddresses = numAddresses; + Data = new u32[numInstrs + numAddresses]; + } + + ~JitBlock() + { + delete[] Data; + } + + u32 StartAddr; + u32 PseudoPhysicalAddr; + + u32 NumInstrs; + u32 NumAddresses; + + JitBlockEntry EntryPoint; + + u32* Instrs() + { return Data; } + u32* AddressRanges() + { return Data + NumInstrs; } + +private: + /* + 0..<NumInstrs> Instrs(), NumInstrs..<NumInstrs+NumAddresses> AddressRanges() + */ + u32* Data; +}; + +struct __attribute__((packed)) AddressRange +{ + TinyVector<JitBlock*>
Blocks; + u16 TimesInvalidated; +}; + +extern AddressRange CodeRanges[ExeMemSpaceSize / 256]; + +typedef void (*InterpreterFunc)(ARM* cpu); +extern InterpreterFunc InterpretARM[]; +extern InterpreterFunc InterpretTHUMB[]; + +void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size); + +} + +#endif \ No newline at end of file diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h index fe2f203..ed6a2b7 100644 --- a/src/ARMJIT_RegisterCache.h +++ b/src/ARMJIT_RegisterCache.h @@ -60,15 +60,46 @@ public: assert("Welp!"); } + void PutLiteral(int reg, u32 val) + { + LiteralsLoaded |= (1 << reg); + LiteralValues[reg] = val; + } + + void UnloadLiteral(int reg) + { + LiteralsLoaded &= ~(1 << reg); + } + + bool IsLiteral(int reg) + { + return LiteralsLoaded & (1 << reg); + } + + void PrepareExit() + { + BitSet16 dirtyRegs(DirtyRegs); + for (int reg : dirtyRegs) + Compiler->SaveReg(reg, Mapping[reg]); + } + void Flush() { BitSet16 loadedSet(LoadedRegs); for (int reg : loadedSet) UnloadRegister(reg); + LiteralsLoaded = 0; } void Prepare(bool thumb, int i) { + if (LoadedRegs & (1 << 15)) + UnloadRegister(15); + + BitSet16 invalidedLiterals(LiteralsLoaded & Instrs[i].Info.DstRegs); + for (int reg : invalidedLiterals) + UnloadLiteral(reg); + u16 futureNeeded = 0; int ranking[16]; for (int j = 0; j < 16; j++) @@ -86,7 +117,7 @@ public: for (int reg : neverNeededAgain) UnloadRegister(reg); - FetchedInstr Instr = Instrs[i]; + FetchedInstr Instr = Instrs[i]; u16 necessaryRegs = (Instr.Info.SrcRegs & ~(1 << 15)) | Instr.Info.DstRegs; BitSet16 needToBeLoaded(necessaryRegs & ~LoadedRegs); if (needToBeLoaded != BitSet16(0)) @@ -125,6 +156,9 @@ public: static const int NativeRegsAvailable; Reg Mapping[16]; + u32 LiteralValues[16]; + + u16 LiteralsLoaded = 0; u32 NativeRegsUsed = 0; u16 LoadedRegs = 0; u16 DirtyRegs = 0; diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index f868ddf..14c223b 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -213,7 +213,13 @@ void Compiler::A_Comp_MovOp() MOV(32, rd, op2); if (((CurInstr.Instr >> 21) & 0xF) == 0xF) + { NOT(32, rd); + if (op2.IsImm() && CurInstr.Cond() == 0xE) + RegCache.PutLiteral(CurInstr.A_Reg(12), ~op2.Imm32()); + } + else if (op2.IsImm() && CurInstr.Cond() == 0xE) + RegCache.PutLiteral(CurInstr.A_Reg(12), op2.Imm32()); if (S) { @@ -564,7 +570,13 @@ void Compiler::T_Comp_AddSub_() Comp_AddCycles_C(); - if (op & 1) + // special case for thumb mov being alias to add rd, rn, #0 + if (CurInstr.SetFlags == 0 && rn.IsImm() && rn.Imm32() == 0) + { + if (rd != rs) + MOV(32, rd, rs); + } + else if (op & 1) Comp_ArithTriOp(&Compiler::SUB, rd, rs, rn, false, opSetsFlags|opInvertCarry|opRetriveCV); else Comp_ArithTriOp(&Compiler::ADD, rd, rs, rn, false, opSetsFlags|opSymmetric|opRetriveCV); @@ -614,7 +626,7 @@ void Compiler::T_Comp_ALU() u32 op = (CurInstr.Instr >> 6) & 0xF; if ((op >= 0x2 && op < 0x4) || op == 0x7) - Comp_AddCycles_CI(1); + Comp_AddCycles_CI(1); // shift by reg else Comp_AddCycles_C(); diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index cc7a3c4..0dedb3f 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -16,9 +16,6 @@ int squeezePointer(T* ptr) void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) { // we can simplify constant branches by a lot - // it's not completely safe to assume stuff like, which instructions to preload - // we'll see how it works out - IrregularCycles = true; u32 newPC; @@ 
-39,18 +36,12 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) { ARMv5* cpu9 = (ARMv5*)CurCPU; - u32 oldregion = R15 >> 24; - u32 newregion = addr >> 24; - u32 regionCodeCycles = cpu9->MemTimings[addr >> 12][0]; u32 compileTimeCodeCycles = cpu9->RegionCodeCycles; cpu9->RegionCodeCycles = regionCodeCycles; - MOV(32, MDisp(RCPU, offsetof(ARMv5, RegionCodeCycles)), Imm32(regionCodeCycles)); - - bool setupRegion = newregion != oldregion; - if (setupRegion) - cpu9->SetupCodeMem(addr); + if (Exit) + MOV(32, MDisp(RCPU, offsetof(ARMv5, RegionCodeCycles)), Imm32(regionCodeCycles)); if (addr & 0x1) { @@ -83,12 +74,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) cycles += cpu9->CodeCycles; } - MOV(64, MDisp(RCPU, offsetof(ARM, CodeMem.Mem)), Imm32(squeezePointer(cpu9->CodeMem.Mem))); - MOV(32, MDisp(RCPU, offsetof(ARM, CodeMem.Mask)), Imm32(cpu9->CodeMem.Mask)); - cpu9->RegionCodeCycles = compileTimeCodeCycles; - if (setupRegion) - cpu9->SetupCodeMem(R15); } else { @@ -100,8 +86,11 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) cpu7->CodeRegion = codeRegion; cpu7->CodeCycles = codeCycles; - MOV(32, MDisp(RCPU, offsetof(ARM, CodeRegion)), Imm32(codeRegion)); - MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(codeCycles)); + if (Exit) + { + MOV(32, MDisp(RCPU, offsetof(ARM, CodeRegion)), Imm32(codeRegion)); + MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(codeCycles)); + } if (addr & 0x1) { @@ -133,7 +122,8 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) cpu7->CodeCycles = addr >> 15; } - MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(newPC)); + if (Exit) + MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(newPC)); if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles) ConstantCycles += cycles; else @@ -219,10 +209,23 @@ void Compiler::T_Comp_BCOND() s32 offset = (s32)(CurInstr.Instr << 24) >> 23; Comp_JumpTo(R15 + offset + 1, true); + Comp_SpecialBranchBehaviour(); + FixupBranch skipFailed = J(); SetJumpTarget(skipExecute); + + if (CurInstr.BranchFlags & branch_FollowCondTaken) + { + RegCache.PrepareExit(); + SaveCPSR(false); + + MOV(32, R(RAX), Imm32(ConstantCycles)); + ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); + RET(); + } + Comp_AddCycles_C(true); - SetJumpTarget(skipFailed); + SetJumpTarget(skipFailed); } void Compiler::T_Comp_B() diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index d8ce1aa..25c55a3 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -72,12 +72,15 @@ Compiler::Compiler() for (int i = 0; i < 3; i++) { for (int j = 0; j < 2; j++) - { MemoryFuncs9[i][j] = Gen_MemoryRoutine9(j, 8 << i); - MemoryFuncs7[i][j][0] = Gen_MemoryRoutine7(j, false, 8 << i); - MemoryFuncs7[i][j][1] = Gen_MemoryRoutine7(j, true, 8 << i); - } } + MemoryFuncs7[0][0] = (void*)NDS::ARM7Read8; + MemoryFuncs7[0][1] = (void*)NDS::ARM7Write8; + MemoryFuncs7[1][0] = (void*)NDS::ARM7Read16; + MemoryFuncs7[1][1] = (void*)NDS::ARM7Write16; + MemoryFuncs7[2][0] = (void*)NDS::ARM7Read32; + MemoryFuncs7[2][1] = (void*)NDS::ARM7Write32; + for (int i = 0; i < 2; i++) for (int j = 0; j < 2; j++) { @@ -179,12 +182,13 @@ void Compiler::LoadCPSR() MOV(32, R(RCPSR), MDisp(RCPU, offsetof(ARM, CPSR))); } -void Compiler::SaveCPSR() +void Compiler::SaveCPSR(bool flagClean) { if (CPSRDirty) { MOV(32, MDisp(RCPU, offsetof(ARM, CPSR)), R(RCPSR)); - CPSRDirty = false; + if (flagClean) + CPSRDirty 
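/* flagClean == false: write the cached CPSR back for a side exit but keep
   it marked dirty, because the main fall-through path still needs its own
   write-back later */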
= false; } } @@ -204,6 +208,9 @@ void Compiler::SaveReg(int reg, X64Reg nativeReg) // invalidates RSCRATCH and RSCRATCH3 Gen::FixupBranch Compiler::CheckCondition(u32 cond) { + // hack, ldm/stm can get really big TODO: make this better + bool ldmStm = !Thumb && + (CurInstr.Info.Kind == ARMInstrInfo::ak_LDM || CurInstr.Info.Kind == ARMInstrInfo::ak_STM); if (cond >= 0x8) { static_assert(RSCRATCH3 == ECX, "RSCRATCH has to be equal to ECX!"); @@ -213,14 +220,14 @@ Gen::FixupBranch Compiler::CheckCondition(u32 cond) SHL(32, R(RSCRATCH), R(RSCRATCH3)); TEST(32, R(RSCRATCH), Imm32(ARM::ConditionTable[cond])); - return J_CC(CC_Z); + return J_CC(CC_Z, ldmStm); } else { // could have used a LUT, but then where would be the fun? TEST(32, R(RCPSR), Imm32(1 << (28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1))))); - return J_CC(cond & 1 ? CC_NZ : CC_Z); + return J_CC(cond & 1 ? CC_NZ : CC_Z, ldmStm); } } @@ -354,25 +361,34 @@ void Compiler::Reset() SetCodePtr(ResetStart); } -CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount) +void Compiler::Comp_SpecialBranchBehaviour() +{ + if (CurInstr.BranchFlags & branch_IdleBranch) + OR(32, MDisp(RCPU, offsetof(ARM, Halted)), Imm8(0x20)); + + if (CurInstr.BranchFlags & branch_FollowCondNotTaken) + { + RegCache.PrepareExit(); + SaveCPSR(false); + + MOV(32, R(RAX), Imm32(ConstantCycles)); + ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); + RET(); + } +} + +JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount) { if (CodeMemSize - (GetWritableCodePtr() - ResetStart) < 1024 * 32) // guess... - InvalidateBlockCache(); + ResetBlockCache(); ConstantCycles = 0; - Thumb = cpu->CPSR & 0x20; + Thumb = thumb; Num = cpu->Num; - CodeRegion = cpu->CodeRegion; + CodeRegion = instrs[0].Addr >> 24; CurCPU = cpu; - CompiledBlock res = (CompiledBlock)GetWritableCodePtr(); - - if (!(Num == 0 - ? IsMapped<0>(instrs[0].Addr - (Thumb ? 2 : 4)) - : IsMapped<1>(instrs[0].Addr - (Thumb ? 2 : 4)))) - { - printf("Trying to compile a block in unmapped memory\n"); - } + JitBlockEntry res = (JitBlockEntry)GetWritableCodePtr(); ABI_PushRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); @@ -380,7 +396,6 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs LoadCPSR(); - // TODO: this is ugly as a whole, do better RegCache = RegisterCache(this, instrs, instrsCount); for (int i = 0; i < instrsCount; i++) @@ -388,21 +403,25 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs CurInstr = instrs[i]; R15 = CurInstr.Addr + (Thumb ? 4 : 8); + Exit = i == instrsCount - 1 || (CurInstr.BranchFlags & branch_FollowCondNotTaken); + CompileFunc comp = Thumb ? T_Comp[CurInstr.Info.Kind] : A_Comp[CurInstr.Info.Kind]; bool isConditional = Thumb ? 
CurInstr.Info.Kind == ARMInstrInfo::tk_BCOND : CurInstr.Cond() < 0xE; - if (comp == NULL || (i == instrsCount - 1 && (!CurInstr.Info.Branches() || isConditional))) + if (comp == NULL || (CurInstr.BranchFlags & branch_FollowCondTaken) || (i == instrsCount - 1 && (!CurInstr.Info.Branches() || isConditional))) { MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(R15)); - MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurInstr.CodeCycles)); - MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurInstr.Instr)); - if (comp == NULL) + { + MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurInstr.CodeCycles)); + MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurInstr.Instr)); + SaveCPSR(); + } } - + if (comp != NULL) RegCache.Prepare(Thumb, i); else @@ -410,12 +429,11 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs if (Thumb) { - u32 icode = (CurInstr.Instr >> 6) & 0x3FF; if (comp == NULL) { MOV(64, R(ABI_PARAM1), R(RCPU)); - ABI_CallFunction(ARMInterpreter::THUMBInstrTable[icode]); + ABI_CallFunction(InterpretTHUMB[CurInstr.Info.Kind]); } else (this->*comp)(); @@ -434,7 +452,9 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs } } else if (cond == 0xF) + { Comp_AddCycles_C(); + } else { IrregularCycles = false; @@ -443,25 +463,36 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs if (cond < 0xE) skipExecute = CheckCondition(cond); - u32 icode = ((CurInstr.Instr >> 4) & 0xF) | ((CurInstr.Instr >> 16) & 0xFF0); if (comp == NULL) { MOV(64, R(ABI_PARAM1), R(RCPU)); - ABI_CallFunction(ARMInterpreter::ARMInstrTable[icode]); + ABI_CallFunction(InterpretARM[CurInstr.Info.Kind]); } else (this->*comp)(); + Comp_SpecialBranchBehaviour(); + if (CurInstr.Cond() < 0xE) { - if (IrregularCycles) + if (IrregularCycles || (CurInstr.BranchFlags & branch_FollowCondTaken)) { FixupBranch skipFailed = J(); SetJumpTarget(skipExecute); Comp_AddCycles_C(true); + if (CurInstr.BranchFlags & branch_FollowCondTaken) + { + RegCache.PrepareExit(); + SaveCPSR(false); + + MOV(32, R(RAX), Imm32(ConstantCycles)); + ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); + RET(); + } + SetJumpTarget(skipFailed); } else @@ -483,6 +514,12 @@ CompiledBlock Compiler::CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrs ABI_PopRegistersAndAdjustStack(BitSet32(ABI_ALL_CALLEE_SAVED & ABI_ALL_GPRS & ~BitSet32({RSP})), 8); RET(); + /*FILE* codeout = fopen("codeout", "a"); + fprintf(codeout, "beginning block argargarg__ %x!!!", instrs[0].Addr); + fwrite((u8*)res, GetWritableCodePtr() - (u8*)res, 1, codeout); + + fclose(codeout);*/ + return res; } @@ -528,4 +565,89 @@ void Compiler::Comp_AddCycles_CI(Gen::X64Reg i, int add) } } +void Compiler::Comp_AddCycles_CDI() +{ + if (Num == 0) + Comp_AddCycles_CD(); + else + { + IrregularCycles = true; + + s32 cycles; + + s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 
0 : 2]; + s32 numD = CurInstr.DataCycles; + + if (CurInstr.DataRegion == 0x02) // mainRAM + { + if (CodeRegion == 0x02) + cycles = numC + numD; + else + { + numC++; + cycles = std::max(numC + numD - 3, std::max(numC, numD)); + } + } + else if (CodeRegion == 0x02) + { + numD++; + cycles = std::max(numC + numD - 3, std::max(numC, numD)); + } + else + { + cycles = numC + numD + 1; + } + + printf("%x: %d %d cycles cdi (%d)\n", CurInstr.Instr, Num, CurInstr.DataCycles, cycles); + + if (!Thumb && CurInstr.Cond() < 0xE) + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + else + ConstantCycles += cycles; + } +} + +void Compiler::Comp_AddCycles_CD() +{ + u32 cycles = 0; + if (Num == 0) + { + s32 numC = (R15 & 0x2) ? 0 : CurInstr.CodeCycles; + s32 numD = CurInstr.DataCycles; + + //if (DataRegion != CodeRegion) + cycles = std::max(numC + numD - 6, std::max(numC, numD)); + + IrregularCycles = cycles != numC; + } + else + { + s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]; + s32 numD = CurInstr.DataCycles; + + if (CurInstr.DataRegion == 0x02) + { + if (CodeRegion == 0x02) + cycles += numC + numD; + else + cycles += std::max(numC + numD - 3, std::max(numC, numD)); + } + else if (CodeRegion == 0x02) + { + cycles += std::max(numC + numD - 3, std::max(numC, numD)); + } + else + { + cycles += numC + numD; + } + + IrregularCycles = true; + } + + if (!Thumb && CurInstr.Cond() < 0xE) + ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); + else + ConstantCycles += cycles; +} + } \ No newline at end of file diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index fcb2380..792ff66 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -4,6 +4,7 @@ #include "../dolphin/x64Emitter.h" #include "../ARMJIT.h" +#include "../ARMJIT_Internal.h" #include "../ARMJIT_RegisterCache.h" namespace ARMJIT @@ -16,6 +17,32 @@ const Gen::X64Reg RSCRATCH = Gen::EAX; const Gen::X64Reg RSCRATCH2 = Gen::EDX; const Gen::X64Reg RSCRATCH3 = Gen::ECX; +struct ComplexOperand +{ + ComplexOperand() + {} + + ComplexOperand(u32 imm) + : IsImm(true), Imm(imm) + {} + ComplexOperand(int reg, int op, int amount) + : IsImm(false) + { + Reg.Reg = reg; + Reg.Op = op; + Reg.Amount = amount; + } + + bool IsImm; + union + { + struct + { + int Reg, Op, Amount; + } Reg; + u32 Imm; + }; +}; class Compiler : public Gen::XEmitter { @@ -24,7 +51,7 @@ public: void Reset(); - CompiledBlock CompileBlock(ARM* cpu, FetchedInstr instrs[], int instrsCount); + JitBlockEntry CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount); void LoadReg(int reg, Gen::X64Reg nativeReg); void SaveReg(int reg, Gen::X64Reg nativeReg); @@ -39,6 +66,8 @@ public: void Comp_AddCycles_C(bool forceNonConstant = false); void Comp_AddCycles_CI(u32 i); void Comp_AddCycles_CI(Gen::X64Reg i, int add); + void Comp_AddCycles_CDI(); + void Comp_AddCycles_CD(); enum { @@ -92,8 +121,17 @@ public: void T_Comp_BL_LONG_2(); void T_Comp_BL_Merged(); - void Comp_MemAccess(Gen::OpArg rd, bool signExtend, bool store, int size); + enum + { + memop_Writeback = 1 << 0, + memop_Post = 1 << 1, + memop_SignExtend = 1 << 2, + memop_Store = 1 << 3, + memop_SubtractOffset = 1 << 4 + }; + void Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags); s32 Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode); + void Comp_MemLoadLiteral(int size, int rd, u32 addr); void Comp_ArithTriOp(void (Compiler::*op)(int, const Gen::OpArg&, 
const Gen::OpArg&), Gen::OpArg rd, Gen::OpArg rn, Gen::OpArg op2, bool carryUsed, int opFlags); @@ -105,8 +143,9 @@ public: void Comp_RetriveFlags(bool sign, bool retriveCV, bool carryUsed); + void Comp_SpecialBranchBehaviour(); + void* Gen_MemoryRoutine9(bool store, int size); - void* Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size); void* Gen_MemoryRoutineSeq9(bool store, bool preinc); void* Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM); @@ -117,10 +156,9 @@ public: Gen::OpArg Comp_RegShiftReg(int op, Gen::OpArg rs, Gen::OpArg rm, bool S, bool& carryUsed); Gen::OpArg A_Comp_GetALUOp2(bool S, bool& carryUsed); - Gen::OpArg A_Comp_GetMemWBOffset(); void LoadCPSR(); - void SaveCPSR(); + void SaveCPSR(bool flagClean = true); bool FlagsNZRequired() { return CurInstr.SetFlags & 0xC; } @@ -139,10 +177,11 @@ public: u8* ResetStart; u32 CodeMemSize; + bool Exit; bool IrregularCycles; void* MemoryFuncs9[3][2]; - void* MemoryFuncs7[3][2][2]; + void* MemoryFuncs7[3][2]; void* MemoryFuncsSeq9[2][2]; void* MemoryFuncsSeq7[2][2][2]; diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index bf8280d..13ca415 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -27,51 +27,7 @@ int squeezePointer(T* ptr) /* address - ABI_PARAM1 (a.k.a. ECX = RSCRATCH3 on Windows) store value - ABI_PARAM2 (a.k.a. RDX = RSCRATCH2 on Windows) - code cycles - ABI_PARAM3 */ - -#define CALC_CYCLES_9(numC, numD, scratch) \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, -6)); \ - CMP(32, R(numC), R(numD)); \ - CMOVcc(32, numD, R(numC), CC_G); \ - CMP(32, R(numD), R(scratch)); \ - CMOVcc(32, scratch, R(numD), CC_G); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); -#define CALC_CYCLES_7_DATA_MAIN_RAM(numC, numD, scratch) \ - if (codeMainRAM) \ - { \ - LEA(32, scratch, MRegSum(numD, numC)); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } \ - else \ - { \ - if (!store) \ - ADD(32, R(numC), Imm8(1)); \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, -3)); \ - CMP(32, R(numD), R(numC)); \ - CMOVcc(32, numC, R(numD), CC_G); \ - CMP(32, R(numC), R(scratch)); \ - CMOVcc(32, scratch, R(numC), CC_G); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } -#define CALC_CYCLES_7_DATA_NON_MAIN_RAM(numC, numD, scratch) \ - if (codeMainRAM) \ - { \ - if (!store) \ - ADD(32, R(numD), Imm8(1)); \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, -3)); \ - CMP(32, R(numD), R(numC)); \ - CMOVcc(32, numC, R(numD), CC_G); \ - CMP(32, R(numC), R(scratch)); \ - CMOVcc(32, scratch, R(numC), CC_G); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } \ - else \ - { \ - LEA(32, scratch, MComplex(numD, numC, SCALE_1, store ? 0 : 1)); \ - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \ - } - void* Compiler::Gen_MemoryRoutine9(bool store, int size) { u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 1 : 0)); @@ -86,12 +42,6 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) CMP(32, R(ABI_PARAM1), MDisp(RCPU, offsetof(ARMv5, ITCMSize))); FixupBranch insideITCM = J_CC(CC_B); - // cycle counting! - MOV(32, R(ABI_PARAM4), R(ABI_PARAM1)); - SHR(32, R(ABI_PARAM4), Imm8(12)); - MOVZX(32, 8, ABI_PARAM4, MComplex(RCPU, ABI_PARAM4, SCALE_4, offsetof(ARMv5, MemTimings) + (size == 32 ? 
2 : 1))); - CALC_CYCLES_9(ABI_PARAM3, ABI_PARAM4, RSCRATCH) - if (store) { if (size > 8) @@ -127,7 +77,6 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) } SetJumpTarget(insideDTCM); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM3)); AND(32, R(RSCRATCH), Imm32(0x3FFF & addressMask)); if (store) MOV(size, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)), R(ABI_PARAM2)); @@ -146,16 +95,22 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) RET(); SetJumpTarget(insideITCM); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM3)); MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); // free up ECX AND(32, R(ABI_PARAM3), Imm32(0x7FFF & addressMask)); if (store) { MOV(size, MComplex(RCPU, ABI_PARAM3, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM2)); - XOR(32, R(RSCRATCH), R(RSCRATCH)); - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.ARM9_ITCM)), R(RSCRATCH)); - if (size == 32) - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), R(RSCRATCH)); + + // if CodeRanges[pseudoPhysical/256].Blocks.Length > 0 we're writing into code! + static_assert(sizeof(AddressRange) == 16); + LEA(32, ABI_PARAM1, MDisp(ABI_PARAM3, ExeMemRegionOffsets[exeMem_ITCM])); + MOV(32, R(RSCRATCH), R(ABI_PARAM1)); + SHR(32, R(RSCRATCH), Imm8(8)); + SHL(32, R(RSCRATCH), Imm8(4)); + CMP(32, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); + FixupBranch noCode = J_CC(CC_Z); + JMP((u8*)InvalidateByAddr, true); + SetJumpTarget(noCode); } else { @@ -176,83 +131,6 @@ void* Compiler::Gen_MemoryRoutine9(bool store, int size) return res; } -void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size) -{ - u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 1 : 0)); - AlignCode4(); - void* res = GetWritableCodePtr(); - - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(15)); - MOVZX(32, 8, ABI_PARAM4, MScaled(RSCRATCH, SCALE_4, (size == 32 ? 
2 : 0) + squeezePointer(NDS::ARM7MemTimings))); - - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - AND(32, R(RSCRATCH), Imm32(0xFF000000)); - CMP(32, R(RSCRATCH), Imm32(0x02000000)); - FixupBranch outsideMainRAM = J_CC(CC_NE); - CALC_CYCLES_7_DATA_MAIN_RAM(ABI_PARAM3, ABI_PARAM4, RSCRATCH) - MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); - AND(32, R(ABI_PARAM3), Imm32((MAIN_RAM_SIZE - 1) & addressMask)); - if (store) - { - MOV(size, MDisp(ABI_PARAM3, squeezePointer(NDS::MainRAM)), R(ABI_PARAM2)); - XOR(32, R(RSCRATCH), R(RSCRATCH)); - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.MainRAM)), R(RSCRATCH)); - if (size == 32) - MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.MainRAM) + 8), R(RSCRATCH)); - } - else - { - MOVZX(32, size, RSCRATCH, MDisp(ABI_PARAM3, squeezePointer(NDS::MainRAM))); - if (size == 32) - { - if (ABI_PARAM1 != ECX) - MOV(32, R(ECX), R(ABI_PARAM1)); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); - } - } - RET(); - - SetJumpTarget(outsideMainRAM); - CALC_CYCLES_7_DATA_NON_MAIN_RAM(ABI_PARAM3, ABI_PARAM4, RSCRATCH) - if (store) - { - if (size > 8) - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - switch (size) - { - case 32: JMP((u8*)NDS::ARM7Write32, true); break; - case 16: JMP((u8*)NDS::ARM7Write16, true); break; - case 8: JMP((u8*)NDS::ARM7Write8, true); break; - } - } - else - { - if (size == 32) - { - ABI_PushRegistersAndAdjustStack({ABI_PARAM1}, 8); - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - ABI_CallFunction(NDS::ARM7Read32); - ABI_PopRegistersAndAdjustStack({ECX}, 8); - AND(32, R(ECX), Imm8(3)); - SHL(32, R(ECX), Imm8(3)); - ROR_(32, R(RSCRATCH), R(ECX)); - RET(); - } - else if (size == 16) - { - AND(32, R(ABI_PARAM1), Imm32(addressMask)); - JMP((u8*)NDS::ARM7Read16, true); - } - else - JMP((u8*)NDS::ARM7Read8, true); - } - - return res; -} - #define MEMORY_SEQ_WHILE_COND \ if (!store) \ MOV(32, currentElement, R(EAX));\ @@ -266,24 +144,13 @@ void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size) ABI_PARAM1 address ABI_PARAM2 address where registers are stored ABI_PARAM3 how many values to read/write - ABI_PARAM4 code cycles Dolphin x64CodeEmitter is my favourite assembler */ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) { - const u8* zero = GetCodePtr(); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM4)); - RET(); - void* res = (void*)GetWritableCodePtr(); - TEST(32, R(ABI_PARAM3), R(ABI_PARAM3)); - J_CC(CC_Z, zero); - - PUSH(ABI_PARAM3); - PUSH(ABI_PARAM4); // we need you later - const u8* repeat = GetCodePtr(); if (preinc) @@ -311,12 +178,7 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(12)); - MOVZX(32, 8, ABI_PARAM2, MComplex(RCPU, RSCRATCH, SCALE_4, 2 + offsetof(ARMv5, MemTimings))); - MOVZX(32, 8, RSCRATCH, MComplex(RCPU, RSCRATCH, SCALE_4, 3 + offsetof(ARMv5, MemTimings))); - - FixupBranch finishIt1 = J(); + RET(); SetJumpTarget(insideDTCM); AND(32, R(RSCRATCH), Imm32(0x3FFF & ~3)); @@ -329,9 +191,7 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM))); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), Imm32(1)); // sequential access time - MOV(32, R(ABI_PARAM2), Imm32(1)); // non sequential - FixupBranch finishIt2 = J(); + RET(); SetJumpTarget(insideITCM); MOV(32, R(RSCRATCH), R(ABI_PARAM1)); @@ 
-340,31 +200,23 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) { MOV(32, R(ABI_PARAM4), currentElement); MOV(32, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM4)); - XOR(32, R(ABI_PARAM4), R(ABI_PARAM4)); - MOV(64, MScaled(RSCRATCH, SCALE_4, squeezePointer(cache.ARM9_ITCM)), R(ABI_PARAM4)); - MOV(64, MScaled(RSCRATCH, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), R(ABI_PARAM4)); + + ADD(32, R(RSCRATCH), Imm32(ExeMemRegionOffsets[exeMem_ITCM])); + MOV(32, R(ABI_PARAM4), R(RSCRATCH)); + SHR(32, R(RSCRATCH), Imm8(8)); + SHL(32, R(RSCRATCH), Imm8(4)); + CMP(32, MDisp(RSCRATCH, squeezePointer(CodeRanges) + offsetof(AddressRange, Blocks.Length)), Imm8(0)); + FixupBranch noCode = J_CC(CC_Z); + ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); + MOV(32, R(ABI_PARAM1), R(ABI_PARAM4)); + CALL((u8*)InvalidateByAddr); + ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); + SetJumpTarget(noCode); } else MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM))); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), Imm32(1)); - MOV(32, R(ABI_PARAM2), Imm32(1)); - - SetJumpTarget(finishIt1); - SetJumpTarget(finishIt2); - - POP(ABI_PARAM4); - POP(ABI_PARAM3); - - CMP(32, R(ABI_PARAM3), Imm8(1)); - FixupBranch skipSequential = J_CC(CC_E); - SUB(32, R(ABI_PARAM3), Imm8(1)); - IMUL(32, RSCRATCH, R(ABI_PARAM3)); - ADD(32, R(ABI_PARAM2), R(RSCRATCH)); - SetJumpTarget(skipSequential); - - CALC_CYCLES_9(ABI_PARAM4, ABI_PARAM2, RSCRATCH) RET(); return res; @@ -372,18 +224,8 @@ void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc) void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM) { - const u8* zero = GetCodePtr(); - ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM4)); - RET(); - void* res = (void*)GetWritableCodePtr(); - TEST(32, R(ABI_PARAM3), R(ABI_PARAM3)); - J_CC(CC_Z, zero); - - PUSH(ABI_PARAM3); - PUSH(ABI_PARAM4); // we need you later - const u8* repeat = GetCodePtr(); if (preinc) @@ -403,59 +245,227 @@ void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM) ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8); MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - SHR(32, R(RSCRATCH), Imm8(15)); - MOVZX(32, 8, ABI_PARAM2, MScaled(RSCRATCH, SCALE_4, 2 + squeezePointer(NDS::ARM7MemTimings))); - MOVZX(32, 8, RSCRATCH, MScaled(RSCRATCH, SCALE_4, 3 + squeezePointer(NDS::ARM7MemTimings))); + RET(); - POP(ABI_PARAM4); - POP(ABI_PARAM3); + return res; +} - // TODO: optimise this - CMP(32, R(ABI_PARAM3), Imm8(1)); - FixupBranch skipSequential = J_CC(CC_E); - SUB(32, R(ABI_PARAM3), Imm8(1)); - IMUL(32, RSCRATCH, R(ABI_PARAM3)); - ADD(32, R(ABI_PARAM2), R(RSCRATCH)); - SetJumpTarget(skipSequential); +#undef MEMORY_SEQ_WHILE_COND - MOV(32, R(RSCRATCH), R(ABI_PARAM1)); - AND(32, R(RSCRATCH), Imm32(0xFF000000)); - CMP(32, R(RSCRATCH), Imm32(0x02000000)); - FixupBranch outsideMainRAM = J_CC(CC_NE); - CALC_CYCLES_7_DATA_MAIN_RAM(ABI_PARAM4, ABI_PARAM2, RSCRATCH) - RET(); +void Compiler::Comp_MemLoadLiteral(int size, int rd, u32 addr) +{ + u32 val; + // make sure arm7 bios is accessible + u32 tmpR15 = CurCPU->R[15]; + CurCPU->R[15] = R15; + if (size == 32) + { + CurCPU->DataRead32(addr & ~0x3, &val); + val = ROR(val, (addr & 0x3) << 3); + } + else if (size == 16) + CurCPU->DataRead16(addr & ~0x1, &val); + else + CurCPU->DataRead8(addr, &val); + CurCPU->R[15] = tmpR15; - SetJumpTarget(outsideMainRAM); - 
CALC_CYCLES_7_DATA_NON_MAIN_RAM(ABI_PARAM4, ABI_PARAM2, RSCRATCH) - RET(); + MOV(32, MapReg(rd), Imm32(val)); - return res; + if (Thumb || CurInstr.Cond() == 0xE) + RegCache.PutLiteral(rd, val); + + Comp_AddCycles_CDI(); } -#undef CALC_CYCLES_9 -#undef MEMORY_SEQ_WHILE_COND +void fault(u32 a, u32 b) +{ + printf("actually not static! %x %x\n", a, b); +} -void Compiler::Comp_MemAccess(OpArg rd, bool signExtend, bool store, int size) +void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int size, int flags) { - IrregularCycles = true; + if (flags & memop_Store) + { + Comp_AddCycles_CD(); + } + else + { + Comp_AddCycles_CDI(); + } - if (store) - MOV(32, R(ABI_PARAM2), rd); - u32 cycles = Num - ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] - : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); - MOV(32, R(ABI_PARAM3), Imm32(cycles)); - CALL(Num == 0 - ? MemoryFuncs9[size >> 4][store] - : MemoryFuncs7[size >> 4][store][CodeRegion == 0x02]); + u32 addressMask = ~0; + if (size == 32) + addressMask = ~3; + if (size == 16) + addressMask = ~1; - if (!store) + if (rn == 15 && rd != 15 && op2.IsImm && !(flags & (memop_Post|memop_Store|memop_Writeback))) { - if (signExtend) - MOVSX(32, size, rd.GetSimpleReg(), R(RSCRATCH)); + Comp_MemLoadLiteral(size, rd, + R15 + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1)); + } + else + { + OpArg rdMapped = MapReg(rd); + OpArg rnMapped = MapReg(rn); + + bool inlinePreparation = Num == 1; + u32 constLocalROR32 = 4; + + void* memoryFunc = Num == 0 + ? MemoryFuncs9[size >> 4][!!(flags & memop_Store)] + : MemoryFuncs7[size >> 4][!!((flags & memop_Store))]; + + if ((rd != 15 || (flags & memop_Store)) && op2.IsImm && RegCache.IsLiteral(rn)) + { + u32 addr = RegCache.LiteralValues[rn] + op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1); + + /*MOV(32, R(ABI_PARAM1), Imm32(CurInstr.Instr)); + MOV(32, R(ABI_PARAM1), Imm32(R15)); + MOV_sum(32, RSCRATCH, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? -1 : 1))); + CMP(32, R(RSCRATCH), Imm32(addr)); + FixupBranch eq = J_CC(CC_E); + CALL((void*)fault); + SetJumpTarget(eq);*/ + + NDS::MemRegion region; + region.Mem = NULL; + if (Num == 0) + { + ARMv5* cpu5 = (ARMv5*)CurCPU; + + // stupid dtcm... + if (addr >= cpu5->DTCMBase && addr < (cpu5->DTCMBase + cpu5->DTCMSize)) + { + region.Mem = cpu5->DTCM; + region.Mask = 0x3FFF; + } + else + { + NDS::ARM9GetMemRegion(addr, flags & memop_Store, ®ion); + } + } + else + NDS::ARM7GetMemRegion(addr, flags & memop_Store, ®ion); + + if (region.Mem != NULL) + { + void* ptr = ®ion.Mem[addr & addressMask & region.Mask]; + + if (flags & memop_Store) + { + MOV(size, M(ptr), MapReg(rd)); + } + else + { + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), M(ptr)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), M(ptr)); + + if (size == 32 && addr & ~0x3) + { + ROR_(32, rdMapped, Imm8((addr & 0x3) << 3)); + } + } + + return; + } + + void* specialFunc = GetFuncForAddr(CurCPU, addr, flags & memop_Store, size); + if (specialFunc) + { + memoryFunc = specialFunc; + inlinePreparation = true; + constLocalROR32 = addr & 0x3; + } + } + + X64Reg finalAddr = ABI_PARAM1; + if (flags & memop_Post) + { + MOV(32, R(ABI_PARAM1), rnMapped); + + finalAddr = rnMapped.GetSimpleReg(); + } + + if (op2.IsImm) + { + MOV_sum(32, finalAddr, rnMapped, Imm32(op2.Imm * ((flags & memop_SubtractOffset) ? 
-1 : 1))); + } else - MOVZX(32, size, rd.GetSimpleReg(), R(RSCRATCH)); + { + OpArg rm = MapReg(op2.Reg.Reg); + + if (!(flags & memop_SubtractOffset) && rm.IsSimpleReg() && rnMapped.IsSimpleReg() + && op2.Reg.Op == 0 && op2.Reg.Amount > 0 && op2.Reg.Amount <= 3) + { + LEA(32, finalAddr, + MComplex(rnMapped.GetSimpleReg(), rm.GetSimpleReg(), 1 << op2.Reg.Amount, 0)); + } + else + { + bool throwAway; + OpArg offset = + Comp_RegShiftImm(op2.Reg.Op, op2.Reg.Amount, rm, false, throwAway); + + if (flags & memop_SubtractOffset) + { + MOV(32, R(finalAddr), rnMapped); + if (!offset.IsZero()) + SUB(32, R(finalAddr), offset); + } + else + MOV_sum(32, finalAddr, rnMapped, offset); + } + } + + if ((flags & memop_Writeback) && !(flags & memop_Post)) + MOV(32, rnMapped, R(finalAddr)); + + if (flags & memop_Store) + MOV(32, R(ABI_PARAM2), rdMapped); + + if (!(flags & memop_Store) && inlinePreparation && constLocalROR32 == 4 && size == 32) + MOV(32, rdMapped, R(ABI_PARAM1)); + + if (inlinePreparation && size > 8) + AND(32, R(ABI_PARAM1), Imm8(addressMask)); + + CALL(memoryFunc); + + if (!(flags & memop_Store)) + { + if (inlinePreparation && size == 32) + { + if (constLocalROR32 == 4) + { + static_assert(RSCRATCH3 == ECX); + MOV(32, R(ECX), rdMapped); + AND(32, R(ECX), Imm8(3)); + SHL(32, R(ECX), Imm8(3)); + ROR_(32, R(RSCRATCH), R(ECX)); + } + else if (constLocalROR32 != 0) + ROR_(32, R(RSCRATCH), Imm8(constLocalROR32 << 3)); + } + + if (flags & memop_SignExtend) + MOVSX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + else + MOVZX(32, size, rdMapped.GetSimpleReg(), R(RSCRATCH)); + } + + if (!(flags & memop_Store) && rd == 15) + { + if (size < 32) + printf("!!! LDR <32 bit PC %08X %x\n", R15, CurInstr.Instr); + { + if (Num == 1) + AND(32, rdMapped, Imm8(0xFE)); // immediate is sign extended + Comp_JumpTo(rdMapped.GetSimpleReg()); + } + } } } @@ -475,16 +485,13 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc s32 offset = (regsCount * 4) * (decrement ? -1 : 1); - u32 cycles = Num - ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2] - : (R15 & 0x2 ? 0 : CurInstr.CodeCycles); - // we need to make sure that the stack stays aligned to 16 bytes u32 stackAlloc = ((regsCount + 1) & ~1) * 8; - MOV(32, R(ABI_PARAM4), Imm32(cycles)); if (!store) { + Comp_AddCycles_CDI(); + MOV(32, R(ABI_PARAM3), Imm32(regsCount)); SUB(64, R(RSP), stackAlloc <= INT8_MAX ? Imm8(stackAlloc) : Imm32(stackAlloc)); MOV(64, R(ABI_PARAM2), R(RSP)); @@ -548,6 +555,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc } else { + Comp_AddCycles_CD(); + if (regsCount & 1) PUSH(RSCRATCH); @@ -594,81 +603,45 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc return offset; } -OpArg Compiler::A_Comp_GetMemWBOffset() -{ - if (!(CurInstr.Instr & (1 << 25))) - { - u32 imm = CurInstr.Instr & 0xFFF; - return Imm32(imm); - } - else - { - int op = (CurInstr.Instr >> 5) & 0x3; - int amount = (CurInstr.Instr >> 7) & 0x1F; - OpArg rm = MapReg(CurInstr.A_Reg(0)); - bool carryUsed; - - return Comp_RegShiftImm(op, amount, rm, false, carryUsed); - } -} void Compiler::A_Comp_MemWB() { - OpArg rn = MapReg(CurInstr.A_Reg(16)); - OpArg rd = MapReg(CurInstr.A_Reg(12)); bool load = CurInstr.Instr & (1 << 20); bool byte = CurInstr.Instr & (1 << 22); int size = byte ? 
8 : 32; + + int flags = 0; + if (!load) + flags |= memop_Store; + if (!(CurInstr.Instr & (1 << 24))) + flags |= memop_Post; + if (CurInstr.Instr & (1 << 21)) + flags |= memop_Writeback; + if (!(CurInstr.Instr & (1 << 23))) + flags |= memop_SubtractOffset; - if (CurInstr.Instr & (1 << 24)) + ComplexOperand offset; + if (!(CurInstr.Instr & (1 << 25))) { - OpArg offset = A_Comp_GetMemWBOffset(); - if (CurInstr.Instr & (1 << 23)) - MOV_sum(32, ABI_PARAM1, rn, offset); - else - { - MOV(32, R(ABI_PARAM1), rn); - SUB(32, R(ABI_PARAM1), offset); - } - - if (CurInstr.Instr & (1 << 21)) - MOV(32, rn, R(ABI_PARAM1)); + offset = ComplexOperand(CurInstr.Instr & 0xFFF); } else - MOV(32, R(ABI_PARAM1), rn); - - if (!(CurInstr.Instr & (1 << 24))) { - OpArg offset = A_Comp_GetMemWBOffset(); + int op = (CurInstr.Instr >> 5) & 0x3; + int amount = (CurInstr.Instr >> 7) & 0x1F; + int rm = CurInstr.A_Reg(0); - if (CurInstr.Instr & (1 << 23)) - ADD(32, rn, offset); - else - SUB(32, rn, offset); + offset = ComplexOperand(rm, op, amount); } - Comp_MemAccess(rd, false, !load, byte ? 8 : 32); - if (load && CurInstr.A_Reg(12) == 15) - { - if (byte) - printf("!!! LDRB PC %08X\n", R15); - else - { - if (Num == 1) - AND(32, rd, Imm8(0xFE)); // immediate is sign extended - Comp_JumpTo(rd.GetSimpleReg()); - } - } + Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, size, flags); } void Compiler::A_Comp_MemHalf() { - OpArg rn = MapReg(CurInstr.A_Reg(16)); - OpArg rd = MapReg(CurInstr.A_Reg(12)); - - OpArg offset = CurInstr.Instr & (1 << 22) - ? Imm32(CurInstr.Instr & 0xF | ((CurInstr.Instr >> 4) & 0xF0)) - : MapReg(CurInstr.A_Reg(0)); + ComplexOperand offset = CurInstr.Instr & (1 << 22) + ? ComplexOperand(CurInstr.Instr & 0xF | ((CurInstr.Instr >> 4) & 0xF0)) + : ComplexOperand(CurInstr.A_Reg(0), 0, 0); int op = (CurInstr.Instr >> 5) & 0x3; bool load = CurInstr.Instr & (1 << 20); @@ -689,49 +662,29 @@ void Compiler::A_Comp_MemHalf() if (size == 32 && Num == 1) return; // NOP - if (CurInstr.Instr & (1 << 24)) - { - if (CurInstr.Instr & (1 << 23)) - MOV_sum(32, ABI_PARAM1, rn, offset); - else - { - MOV(32, R(ABI_PARAM1), rn); - SUB(32, R(ABI_PARAM1), offset); - } - - if (CurInstr.Instr & (1 << 21)) - MOV(32, rn, R(ABI_PARAM1)); - } - else - MOV(32, R(ABI_PARAM1), rn); - + int flags = 0; + if (signExtend) + flags |= memop_SignExtend; + if (!load) + flags |= memop_Store; if (!(CurInstr.Instr & (1 << 24))) - { - if (CurInstr.Instr & (1 << 23)) - ADD(32, rn, offset); - else - SUB(32, rn, offset); - } + flags |= memop_Post; + if (!(CurInstr.Instr & (1 << 23))) + flags |= memop_SubtractOffset; + if (CurInstr.Instr & (1 << 21)) + flags |= memop_Writeback; - Comp_MemAccess(rd, signExtend, !load, size); - - if (load && CurInstr.A_Reg(12) == 15) - printf("!!! MemHalf op PC %08X\n", R15);; + Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, size, flags); } void Compiler::T_Comp_MemReg() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - OpArg ro = MapReg(CurInstr.T_Reg(6)); - int op = (CurInstr.Instr >> 10) & 0x3; bool load = op & 0x2; bool byte = op & 0x1; - MOV_sum(32, ABI_PARAM1, rb, ro); - - Comp_MemAccess(rd, false, !load, byte ? 8 : 32); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(CurInstr.T_Reg(6), 0, 0), + byte ? 8 : 32, load ? 
0 : memop_Store); } void Compiler::A_Comp_LDM_STM() @@ -758,67 +711,55 @@ void Compiler::A_Comp_LDM_STM() void Compiler::T_Comp_MemImm() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - int op = (CurInstr.Instr >> 11) & 0x3; bool load = op & 0x1; bool byte = op & 0x2; u32 offset = ((CurInstr.Instr >> 6) & 0x1F) * (byte ? 1 : 4); - LEA(32, ABI_PARAM1, MDisp(rb.GetSimpleReg(), offset)); - - Comp_MemAccess(rd, false, !load, byte ? 8 : 32); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(offset), + byte ? 8 : 32, load ? 0 : memop_Store); } void Compiler::T_Comp_MemRegHalf() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - OpArg ro = MapReg(CurInstr.T_Reg(6)); - int op = (CurInstr.Instr >> 10) & 0x3; bool load = op != 0; int size = op != 1 ? 16 : 8; bool signExtend = op & 1; - MOV_sum(32, ABI_PARAM1, rb, ro); + int flags = 0; + if (signExtend) + flags |= memop_SignExtend; + if (!load) + flags |= memop_Store; - Comp_MemAccess(rd, signExtend, !load, size); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(CurInstr.T_Reg(6), 0, 0), + size, flags); } void Compiler::T_Comp_MemImmHalf() { - OpArg rd = MapReg(CurInstr.T_Reg(0)); - OpArg rb = MapReg(CurInstr.T_Reg(3)); - u32 offset = (CurInstr.Instr >> 5) & 0x3E; bool load = CurInstr.Instr & (1 << 11); - LEA(32, ABI_PARAM1, MDisp(rb.GetSimpleReg(), offset)); - - Comp_MemAccess(rd, false, !load, 16); + Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), ComplexOperand(offset), 16, + load ? 0 : memop_Store); } void Compiler::T_Comp_LoadPCRel() { - OpArg rd = MapReg(CurInstr.T_Reg(8)); u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2); - // hopefully this doesn't break - u32 val; CurCPU->DataRead32(addr, &val); - MOV(32, rd, Imm32(val)); + Comp_MemLoadLiteral(32, CurInstr.T_Reg(8), addr); } void Compiler::T_Comp_MemSPRel() { u32 offset = (CurInstr.Instr & 0xFF) * 4; - OpArg rd = MapReg(CurInstr.T_Reg(8)); bool load = CurInstr.Instr & (1 << 11); - LEA(32, ABI_PARAM1, MDisp(MapReg(13).GetSimpleReg(), offset)); - - Comp_MemAccess(rd, false, !load, 32); + Comp_MemAccess(CurInstr.T_Reg(8), 13, ComplexOperand(offset), 32, + load ? 
0 : memop_Store); } void Compiler::T_Comp_PUSH_POP() diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 9239e29..0fbde26 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -36,7 +36,7 @@ enum { A_StaticShiftSetC = 1 << 18, A_SetC = 1 << 19, - A_WriteMemory = 1 << 20, + A_WriteMem = 1 << 20 }; #define A_BIOP A_Read16 @@ -109,7 +109,7 @@ const u32 A_UMULL = A_MulFlags | A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak( const u32 A_UMLAL = A_MulFlags | A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_UMLAL); const u32 A_SMULL = A_MulFlags | A_Write16 | A_Write12 | A_Read0 | A_Read8 | ak(ak_SMULL); const u32 A_SMLAL = A_MulFlags | A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLAL); -const u32 A_SMLAxy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLALxy); +const u32 A_SMLAxy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLAxy); const u32 A_SMLAWy = A_Write16 | A_Read0 | A_Read8 | A_Read12 | ak(ak_SMLAWy); const u32 A_SMULWy = A_Write16 | A_Read0 | A_Read8 | ak(ak_SMULWy); const u32 A_SMLALxy = A_Write16 | A_Write12 | A_Read16 | A_Read12 | A_Read0 | A_Read8 | ak(ak_SMLALxy); @@ -123,7 +123,7 @@ const u32 A_QDADD = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDADD); const u32 A_QDSUB = A_Write12 | A_Read0 | A_Read16 | A_UnkOnARM7 | ak(ak_QDSUB); #define A_LDR A_Write12 -#define A_STR A_Read12 | A_WriteMemory +#define A_STR A_Read12 | A_WriteMem #define A_IMPLEMENT_WB_LDRSTR(x,k) \ const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \ @@ -144,7 +144,7 @@ A_IMPLEMENT_WB_LDRSTR(LDR,LDR) A_IMPLEMENT_WB_LDRSTR(LDRB,LDR) #define A_LDRD A_Write12Double -#define A_STRD A_Read12Double | A_WriteMemory +#define A_STRD A_Read12Double | A_WriteMem #define A_IMPLEMENT_HD_LDRSTR(x,k) \ const u32 A_##x##_IMM = A_##k | A_Read16 | A_MemWriteback | ak(ak_##x##_IMM); \ @@ -159,11 +159,11 @@ A_IMPLEMENT_HD_LDRSTR(LDRH,LDR) A_IMPLEMENT_HD_LDRSTR(LDRSB,LDR) A_IMPLEMENT_HD_LDRSTR(LDRSH,LDR) -const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | A_WriteMemory | ak(ak_SWP); -const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | A_WriteMemory | ak(ak_SWPB); +const u32 A_SWP = A_Write12 | A_Read16 | A_Read0 | A_WriteMem | ak(ak_SWP); +const u32 A_SWPB = A_Write12 | A_Read16 | A_Read0 | A_WriteMem | ak(ak_SWPB); const u32 A_LDM = A_Read16 | A_MemWriteback | ak(ak_LDM); -const u32 A_STM = A_Read16 | A_MemWriteback | A_WriteMemory | ak(ak_STM); +const u32 A_STM = A_Read16 | A_MemWriteback | A_WriteMem | ak(ak_STM); const u32 A_B = A_BranchAlways | ak(ak_B); const u32 A_BL = A_BranchAlways | A_Link | ak(ak_BL); @@ -181,7 +181,7 @@ const u32 A_SVC = A_BranchAlways | A_Link | ak(ak_SVC); // THUMB -#define tk(x) ((x) << 21) +#define tk(x) ((x) << 22) enum { T_Read0 = 1 << 0, @@ -210,6 +210,8 @@ enum { T_SetMaybeC = 1 << 18, T_ReadC = 1 << 19, T_SetC = 1 << 20, + + T_WriteMem = 1 << 21, }; const u32 T_LSL_IMM = T_SetNZ | T_SetMaybeC | T_Write0 | T_Read3 | tk(tk_LSL_IMM); @@ -253,30 +255,30 @@ const u32 T_ADD_SP = T_WriteR13 | T_ReadR13 | tk(tk_ADD_SP); const u32 T_LDR_PCREL = T_Write8 | tk(tk_LDR_PCREL); -const u32 T_STR_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STR_REG); -const u32 T_STRB_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STRB_REG); +const u32 T_STR_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STR_REG); +const u32 T_STRB_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STRB_REG); const u32 T_LDR_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDR_REG); const u32 T_LDRB_REG = T_Write0 | T_Read3 
| T_Read6 | tk(tk_LDRB_REG); -const u32 T_STRH_REG = T_Read0 | T_Read3 | T_Read6 | tk(tk_STRH_REG); +const u32 T_STRH_REG = T_Read0 | T_Read3 | T_Read6 | T_WriteMem | tk(tk_STRH_REG); const u32 T_LDRSB_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSB_REG); const u32 T_LDRH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRH_REG); const u32 T_LDRSH_REG = T_Write0 | T_Read3 | T_Read6 | tk(tk_LDRSH_REG); -const u32 T_STR_IMM = T_Read0 | T_Read3 | tk(tk_STR_IMM); +const u32 T_STR_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STR_IMM); const u32 T_LDR_IMM = T_Write0 | T_Read3 | tk(tk_LDR_IMM); -const u32 T_STRB_IMM = T_Read0 | T_Read3 | tk(tk_STRB_IMM); +const u32 T_STRB_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STRB_IMM); const u32 T_LDRB_IMM = T_Write0 | T_Read3 | tk(tk_LDRB_IMM); -const u32 T_STRH_IMM = T_Read0 | T_Read3 | tk(tk_STRH_IMM); +const u32 T_STRH_IMM = T_Read0 | T_Read3 | T_WriteMem | tk(tk_STRH_IMM); const u32 T_LDRH_IMM = T_Write0 | T_Read3 | tk(tk_LDRH_IMM); -const u32 T_STR_SPREL = T_Read8 | T_ReadR13 | tk(tk_STR_SPREL); +const u32 T_STR_SPREL = T_Read8 | T_ReadR13 | T_WriteMem | tk(tk_STR_SPREL); const u32 T_LDR_SPREL = T_Write8 | T_ReadR13 | tk(tk_LDR_SPREL); -const u32 T_PUSH = T_ReadR13 | T_WriteR13 | tk(tk_PUSH); +const u32 T_PUSH = T_ReadR13 | T_WriteR13 | T_WriteMem | tk(tk_PUSH); const u32 T_POP = T_PopPC | T_ReadR13 | T_WriteR13 | tk(tk_POP); const u32 T_LDMIA = T_Read8 | T_Write8 | tk(tk_LDMIA); -const u32 T_STMIA = T_Read8 | T_Write8 | tk(tk_STMIA); +const u32 T_STMIA = T_Read8 | T_Write8 | T_WriteMem | tk(tk_STMIA); const u32 T_BCOND = T_BranchAlways | tk(tk_BCOND); const u32 T_BX = T_BranchAlways | T_ReadHi3 | tk(tk_BX); @@ -307,7 +309,7 @@ Info Decode(bool thumb, u32 num, u32 instr) if (thumb) { u32 data = THUMBInstrTable[(instr >> 6) & 0x3FF]; - res.Kind = (data >> 21) & 0x3F; + res.Kind = (data >> 22) & 0x3F; if (data & T_Read0) res.SrcRegs |= 1 << (instr & 0x7); @@ -356,6 +358,9 @@ Info Decode(bool thumb, u32 num, u32 instr) if (data & T_SetC) res.WriteFlags |= flag_C; + if (data & T_WriteMem) + res.SpecialKind = special_WriteMem; + res.EndBlock |= res.Branches(); if (res.Kind == tk_BCOND) @@ -382,6 +387,9 @@ Info Decode(bool thumb, u32 num, u32 instr) u32 id = (cn<<8)|(cm<<4)|cpinfo; if (id == 0x704 || id == 0x782 || id == 0x750 || id == 0x751 || id == 0x752) res.EndBlock |= true; + + if (id == 0x704 || id == 0x782) + res.SpecialKind = special_WaitForInterrupt; } if (res.Kind == ak_MCR || res.Kind == ak_MRC) { @@ -449,6 +457,9 @@ Info Decode(bool thumb, u32 num, u32 instr) if ((data & A_SetC) || (data & A_StaticShiftSetC) && ((instr >> 7) & 0x1F)) res.WriteFlags |= flag_C; + if (data & A_WriteMem) + res.SpecialKind = special_WriteMem; + if ((instr >> 28) < 0xE) { // make non conditional flag sets conditional diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h index d01c600..d02f168 100644 --- a/src/ARM_InstrInfo.h +++ b/src/ARM_InstrInfo.h @@ -226,18 +226,27 @@ enum flag_V = 1 << 0, }; +enum +{ + special_NotSpecialAtAll = 0, + special_WriteMem, + special_WaitForInterrupt +}; + struct Info { u16 DstRegs, SrcRegs; u16 Kind; + u8 SpecialKind; + u8 ReadFlags; // lower 4 bits - set always // upper 4 bits - might set flag u8 WriteFlags; bool EndBlock; - bool Branches() + bool Branches() const { return DstRegs & (1 << 15); } diff --git a/src/CP15.cpp b/src/CP15.cpp index e6e91c3..10c3b1b 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -561,9 +561,11 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x750: + ARMJIT::InvalidateAll(); ICacheInvalidateAll(); return; case 
0x751: + ARMJIT::InvalidateByAddr(ARMJIT::TranslateAddr<0>(val)); ICacheInvalidateByAddr(val); return; case 0x752: @@ -813,7 +815,7 @@ void ARMv5::DataWrite8(u32 addr, u8 val) DataCycles = 1; *(u8*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } @@ -837,7 +839,7 @@ void ARMv5::DataWrite16(u32 addr, u16 val) DataCycles = 1; *(u16*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } @@ -861,8 +863,7 @@ void ARMv5::DataWrite32(u32 addr, u32 val) DataCycles = 1; *(u32*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; - ARMJIT::cache.ARM9_ITCM[((addr + 2) & 0x7FFF) >> 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } @@ -886,8 +887,7 @@ void ARMv5::DataWrite32S(u32 addr, u32 val) DataCycles += 1; *(u32*)&ITCM[addr & 0x7FFF] = val; #ifdef JIT_ENABLED - ARMJIT::cache.ARM9_ITCM[(addr & 0x7FFF) >> 1] = NULL; - ARMJIT::cache.ARM9_ITCM[((addr & 0x7FFF) >> 1) + 1] = NULL; + ARMJIT::InvalidateITCM(addr & 0x7FFF); #endif return; } diff --git a/src/Config.cpp b/src/Config.cpp index 3cff0ed..63d61a3 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -37,6 +37,7 @@ int GL_Antialias; #ifdef JIT_ENABLED bool JIT_Enable = false; int JIT_MaxBlockSize = 12; +bool JIT_BrancheOptimisations = true; #endif ConfigEntry ConfigFile[] = @@ -50,6 +51,7 @@ ConfigEntry ConfigFile[] = #ifdef JIT_ENABLED {"JIT_Enable", 0, &JIT_Enable, 0, NULL, 0}, {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 10, NULL, 0}, + {"JIT_BrancheOptimisations", 0, &JIT_BrancheOptimisations, 1, NULL, 0}, #endif {"", -1, NULL, 0, NULL, 0} diff --git a/src/Config.h b/src/Config.h index c13eae3..0fcefc3 100644 --- a/src/Config.h +++ b/src/Config.h @@ -49,6 +49,7 @@ extern int GL_Antialias; #ifdef JIT_ENABLED extern bool JIT_Enable; extern int JIT_MaxBlockSize; +extern bool JIT_BrancheOptimisations; #endif } diff --git a/src/NDS.cpp b/src/NDS.cpp index 1baa308..e9e6795 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -536,7 +536,7 @@ void Reset() RCnt = 0; #ifdef JIT_ENABLED - ARMJIT::InvalidateBlockCache(); + ARMJIT::ResetBlockCache(); #endif NDSCart::Reset(); @@ -757,7 +757,7 @@ bool DoSavestate(Savestate* file) #ifdef JIT_ENABLED if (!file->Saving) { - ARMJIT::InvalidateBlockCache(); + ARMJIT::ResetBlockCache(); } #endif @@ -1870,10 +1870,6 @@ u32 ARM9Read32(u32 addr) void ARM9Write8(u32 addr, u8 val) { -#ifdef JIT_ENABLED - ARMJIT::Invalidate16<0>(addr); -#endif - switch (addr & 0xFF000000) { case 0x02000000: @@ -1924,10 +1920,6 @@ void ARM9Write8(u32 addr, u8 val) void ARM9Write16(u32 addr, u16 val) { -#ifdef JIT_ENABLED - ARMJIT::Invalidate16<0>(addr); -#endif - switch (addr & 0xFF000000) { case 0x02000000: @@ -1994,10 +1986,6 @@ void ARM9Write16(u32 addr, u16 val) void ARM9Write32(u32 addr, u32 val) { -#ifdef JIT_ENABLED - ARMJIT::Invalidate32<0>(addr); -#endif - switch (addr & 0xFF000000) { case 0x02000000: @@ -2292,7 +2280,7 @@ u32 ARM7Read32(u32 addr) void ARM7Write8(u32 addr, u8 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate16<1>(addr); + ARMJIT::InvalidateByAddr7(addr); #endif switch (addr & 0xFF800000) @@ -2355,7 +2343,7 @@ void ARM7Write8(u32 addr, u8 val) void ARM7Write16(u32 addr, u16 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate16<1>(addr); + ARMJIT::InvalidateByAddr7(addr); #endif switch (addr & 0xFF800000) @@ -2428,7 +2416,7 @@ void ARM7Write16(u32 
addr, u16 val) void ARM7Write32(u32 addr, u32 val) { #ifdef JIT_ENABLED - ARMJIT::Invalidate32<1>(addr); + ARMJIT::InvalidateByAddr7(addr); #endif switch (addr & 0xFF800000) diff --git a/src/libui_sdl/DlgEmuSettings.cpp b/src/libui_sdl/DlgEmuSettings.cpp index 09ea8eb..45e8e0c 100644 --- a/src/libui_sdl/DlgEmuSettings.cpp +++ b/src/libui_sdl/DlgEmuSettings.cpp @@ -42,6 +42,7 @@ uiCheckbox* cbDirectBoot; #ifdef JIT_ENABLED uiCheckbox* cbJITEnabled; uiEntry* enJITMaxBlockSize; +uiCheckbox* cbJITBranchOptimisations; #endif int OnCloseWindow(uiWindow* window, void* blarg) @@ -64,13 +65,15 @@ void OnOk(uiButton* btn, void* blarg) bool enableJit = uiCheckboxChecked(cbJITEnabled); char* maxBlockSizeStr = uiEntryText(enJITMaxBlockSize); long blockSize = strtol(maxBlockSizeStr, NULL, 10); + bool branchOptimisations = uiCheckboxChecked(cbJITBranchOptimisations); uiFreeText(maxBlockSizeStr); if (blockSize < 1) blockSize = 1; if (blockSize > 32) blockSize = 32; - if (enableJit != Config::JIT_Enable || blockSize != Config::JIT_MaxBlockSize) + if (enableJit != Config::JIT_Enable || blockSize != Config::JIT_MaxBlockSize || + branchOptimisations != Config::JIT_BrancheOptimisations) { if (RunningSomething && !uiMsgBoxConfirm(win, "Reset emulator", @@ -79,6 +82,7 @@ void OnOk(uiButton* btn, void* blarg) Config::JIT_Enable = enableJit; Config::JIT_MaxBlockSize = blockSize; + Config::JIT_BrancheOptimisations = uiCheckboxChecked(cbJITBranchOptimisations); restart = true; } @@ -101,9 +105,15 @@ void OnOk(uiButton* btn, void* blarg) void OnJITStateChanged(uiCheckbox* cb, void* blarg) { if (uiCheckboxChecked(cb)) + { uiControlEnable(uiControl(enJITMaxBlockSize)); + uiControlEnable(uiControl(cbJITBranchOptimisations)); + } else + { uiControlDisable(uiControl(enJITMaxBlockSize)); + uiControlDisable(uiControl(cbJITBranchOptimisations)); + } } #endif @@ -159,6 +169,14 @@ void Open() enJITMaxBlockSize = uiNewEntry(); uiBoxAppend(row, uiControl(enJITMaxBlockSize), 0); } + + { + uiBox* row = uiNewHorizontalBox(); + uiBoxAppend(in_ctrl, uiControl(row), 0); + + cbJITBranchOptimisations = uiNewCheckbox("Branch optimisations (breaks games in rare cases!)"); + uiBoxAppend(row, uiControl(cbJITBranchOptimisations), 0); + } } #endif @@ -194,6 +212,8 @@ void Open() uiEntrySetText(enJITMaxBlockSize, maxBlockSizeStr); } OnJITStateChanged(cbJITEnabled, NULL); + + uiCheckboxSetChecked(cbJITBranchOptimisations, Config::JIT_BrancheOptimisations); #endif uiControlShow(uiControl(win)); -- cgit v1.2.3 From 68d552074bf2c1989d96a8c28cc3f6fe1e6c8b8e Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 25 Apr 2020 14:42:37 +0200 Subject: compile UMULLs and some fixes --- src/ARMJIT_x64/ARMJIT_ALU.cpp | 33 +++++++++++++++++++++++++-------- src/ARMJIT_x64/ARMJIT_Compiler.cpp | 4 ++-- src/ARMJIT_x64/ARMJIT_Compiler.h | 2 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 3 ++- 4 files changed, 30 insertions(+), 12 deletions(-) (limited to 'src/ARMJIT_x64/ARMJIT_ALU.cpp') diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp index 14c223b..43b94b6 100644 --- a/src/ARMJIT_x64/ARMJIT_ALU.cpp +++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp @@ -301,10 +301,11 @@ void Compiler::A_Comp_MUL_MLA() Comp_MulOp(S, add, rd, rm, rs, rn); } -void Compiler::A_Comp_SMULL_SMLAL() +void Compiler::A_Comp_Mul_Long() { bool S = CurInstr.Instr & (1 << 20); bool add = CurInstr.Instr & (1 << 21); + bool sign = CurInstr.Instr & (1 << 22); OpArg rd = MapReg(CurInstr.A_Reg(16)); OpArg rm = MapReg(CurInstr.A_Reg(0)); OpArg rs = MapReg(CurInstr.A_Reg(8)); @@ -318,18 +319,34 @@ 
void Compiler::A_Comp_SMULL_SMLAL() MOV(32, R(RSCRATCH3), rs); TEST(32, R(RSCRATCH3), R(RSCRATCH3)); FixupBranch zeroBSR = J_CC(CC_Z); - BSR(32, RSCRATCH2, R(RSCRATCH3)); - NOT(32, R(RSCRATCH3)); - BSR(32, RSCRATCH, R(RSCRATCH3)); - CMP(32, R(RSCRATCH2), R(RSCRATCH)); - CMOVcc(32, RSCRATCH, R(RSCRATCH2), CC_L); + if (sign) + { + BSR(32, RSCRATCH2, R(RSCRATCH3)); + NOT(32, R(RSCRATCH3)); + BSR(32, RSCRATCH, R(RSCRATCH3)); + CMP(32, R(RSCRATCH2), R(RSCRATCH)); + CMOVcc(32, RSCRATCH, R(RSCRATCH2), CC_L); + } + else + { + BSR(32, RSCRATCH, R(RSCRATCH3)); + } + SHR(32, R(RSCRATCH), Imm8(3)); SetJumpTarget(zeroBSR); // fortunately that's even right Comp_AddCycles_CI(RSCRATCH, 2); } - MOVSX(64, 32, RSCRATCH2, rm); - MOVSX(64, 32, RSCRATCH3, rs); + if (sign) + { + MOVSX(64, 32, RSCRATCH2, rm); + MOVSX(64, 32, RSCRATCH3, rs); + } + else + { + MOV(32, R(RSCRATCH2), rm); + MOV(32, R(RSCRATCH3), rs); + } if (add) { MOV(32, R(RSCRATCH), rd); diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index be3709e..1b2d312 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -300,7 +300,7 @@ const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] = // CMN F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), // Mul - F(A_Comp_MUL_MLA), F(A_Comp_MUL_MLA), NULL, NULL, NULL, F(A_Comp_SMULL_SMLAL), NULL, NULL, NULL, NULL, NULL, + F(A_Comp_MUL_MLA), F(A_Comp_MUL_MLA), F(A_Comp_Mul_Long), F(A_Comp_Mul_Long), F(A_Comp_Mul_Long), F(A_Comp_Mul_Long), NULL, NULL, NULL, NULL, NULL, // ARMv5 stuff F(A_Comp_CLZ), NULL, NULL, NULL, NULL, // STR @@ -628,7 +628,7 @@ void Compiler::Comp_AddCycles_CI(Gen::X64Reg i, int add) } else { - ConstantCycles += i + cycles; + ConstantCycles += cycles; SUB(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(i)); } } diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index b428c33..a448b6d 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -89,7 +89,7 @@ public: void A_Comp_CmpOp(); void A_Comp_MUL_MLA(); - void A_Comp_SMULL_SMLAL(); + void A_Comp_Mul_Long(); void A_Comp_CLZ(); diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 4cafc1c..7f6fa53 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -423,7 +423,8 @@ void Compiler::Comp_MemAccess(int rd, int rn, const ComplexOperand& op2, int siz if (flags & memop_SubtractOffset) { - MOV(32, R(finalAddr), rnMapped); + if (R(finalAddr) != rnMapped) + MOV(32, R(finalAddr), rnMapped); if (!offset.IsZero()) SUB(32, R(finalAddr), offset); } -- cgit v1.2.3
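
Notes on the techniques in the patches above (illustrative sketches, not part of any commit). The refactored load/store path funnels every addressing mode through the new Comp_MemAccess, with the instruction bits folded into memop_* flags. A condensed, self-contained restatement of the decoding done in A_Comp_MemWB — the flag values are the ones from the header diff; the function name is made up for illustration:

    #include <cstdint>

    enum
    {
        memop_Writeback      = 1 << 0,
        memop_Post           = 1 << 1,
        memop_SignExtend     = 1 << 2,
        memop_Store          = 1 << 3,
        memop_SubtractOffset = 1 << 4
    };

    // P/U/W/L bits of an ARM single data transfer mapped onto memop flags,
    // mirroring A_Comp_MemWB above.
    int DecodeMemopFlags(uint32_t instr)
    {
        int flags = 0;
        if (!(instr & (1 << 20))) flags |= memop_Store;          // L = 0: store
        if (!(instr & (1 << 24))) flags |= memop_Post;           // P = 0: post-indexed
        if (instr & (1 << 21))    flags |= memop_Writeback;      // W = 1: writeback
        if (!(instr & (1 << 23))) flags |= memop_SubtractOffset; // U = 0: subtract offset
        return flags;
    }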
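
Comp_MemLoadLiteral exploits the fact that a PC-relative load with an immediate offset has a compile-time-constant address: the JIT performs the read once while compiling and emits a plain constant move, so the generated code never touches memory for that load. Boiled down to the 32-bit case (the EmuRead32/EmitMovImm helpers are assumed stand-ins for DataRead32 and the emitter call):

    #include <cstdint>

    uint32_t EmuRead32(uint32_t addr);     // stand-in for CurCPU->DataRead32
    void EmitMovImm(int rd, uint32_t val); // stand-in for MOV(32, MapReg(rd), Imm32(val))

    static uint32_t ROR32(uint32_t x, unsigned r)
    {
        return r ? (x >> r) | (x << (32 - r)) : x;
    }

    void CompileLoadLiteral32(int rd, uint32_t addr)
    {
        // an unaligned LDR rotates the loaded word, just like the patch does
        uint32_t val = ROR32(EmuRead32(addr & ~3u), (addr & 3u) << 3);
        EmitMovImm(rd, val);
    }

For unconditional instructions the value is additionally remembered through RegCache.PutLiteral, which is what later lets Comp_MemAccess fold an entire address calculation into a constant when the base register holds a known literal.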
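
The overlapping-waitstate formula in Comp_AddCycles_CD is easier to read outside of emitter code. A minimal sketch of its ARM7 path (function and parameter names are illustrative): when exactly one of the code/data accesses goes to main RAM (region 0x02), the waitstates overlap and up to three cycles are saved, but the total can never be shorter than the longer of the two accesses; otherwise the accesses simply add up.

    #include <algorithm>
    #include <cstdint>

    int32_t MergedCycles7(int32_t numC, int32_t numD,
                          uint32_t codeRegion, uint32_t dataRegion)
    {
        bool codeMainRAM = codeRegion == 0x02;
        bool dataMainRAM = dataRegion == 0x02;

        if (codeMainRAM != dataMainRAM) // exactly one access hits main RAM
            return std::max(numC + numD - 3, std::max(numC, numD));
        return numC + numD;             // otherwise: plain sum
    }

Comp_AddCycles_CDI applies the same idea with small corrections (an extra code or data cycle) for loads.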
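
The new ITCM store handling replaces the old cleared-pointer scheme with a lookup into CodeRanges: an AddressRange entry is 16 bytes and covers a 256-byte block of the pseudo-physical address space, so the emitted SHR 8 / SHL 4 pair turns an address into the byte offset of its entry. A rough C equivalent — the struct layout below is a stand-in; only the 16-byte size and the Blocks.Length test come from the patch:

    #include <cstdint>

    struct AddressRange          // stand-in; the real definition lives in ARMJIT_Internal.h
    {
        struct { uint32_t Length; } Blocks;
        uint8_t pad[12];
    };
    static_assert(sizeof(AddressRange) == 16, "SHR 8 / SHL 4 in the emitted code relies on this");

    extern AddressRange CodeRanges[];
    extern const uint32_t ExeMemRegionOffsets[];
    enum { exeMem_ITCM = 0 };    // index assumed for illustration
    void InvalidateByAddr(uint32_t pseudoPhysical);

    void OnITCMStore(uint32_t itcmOffset)
    {
        uint32_t pseudoPhys = itcmOffset + ExeMemRegionOffsets[exeMem_ITCM];
        if (CodeRanges[pseudoPhys >> 8].Blocks.Length > 0) // JIT code compiled from this block?
            InvalidateByAddr(pseudoPhys);
    }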
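
Finally, the BSR sequence added for long multiplies models ARM's early-out multiplier: execution time grows with the number of significant bytes in the second operand, and for the signed variants a run of leading ones terminates as early as a run of leading zeros. A plain C version of the estimate A_Comp_Mul_Long computes (using the GCC/Clang builtin in place of x86 BSR; treating rs == 0 as zero extra cycles is an assumption here — the real code takes the TEST/J_CC path for it — and a fixed base of 2 cycles is added separately through Comp_AddCycles_CI):

    #include <algorithm>
    #include <cstdint>

    static int SignificantBit(uint32_t x) { return 31 - __builtin_clz(x); } // == BSR

    int MulEarlyOutCycles(uint32_t rs, bool sign)
    {
        if (rs == 0)
            return 0;                    // zero operand: no extra cycles assumed
        int highest = SignificantBit(rs);
        if (sign && ~rs != 0)            // leading ones terminate just as early
            highest = std::min(highest, SignificantBit(~rs));
        return highest >> 3;             // one extra internal cycle per significant byte
    }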