author     RSDuck <rsduck@users.noreply.github.com>  2020-06-14 21:04:25 +0200
committer  RSDuck <rsduck@users.noreply.github.com>  2020-06-16 12:11:19 +0200
commit     e335a8ca7615c702cfa2dcdb71deb69468088fd8 (patch)
tree       c09dcec016d87e7d82a6aec377f8eb3fa9949026 /src/ARMJIT_A64
parent     fea9f95bba7475b2cd3b624a3ccd6cdee00a33f1 (diff)
first steps in bringing over the JIT refactor/fastmem
Diffstat (limited to 'src/ARMJIT_A64')
-rw-r--r--  src/ARMJIT_A64/ARMJIT_ALU.cpp        | 123
-rw-r--r--  src/ARMJIT_A64/ARMJIT_Branch.cpp     | 99
-rw-r--r--  src/ARMJIT_A64/ARMJIT_Compiler.cpp   | 383
-rw-r--r--  src/ARMJIT_A64/ARMJIT_Compiler.h     | 71
-rw-r--r--  src/ARMJIT_A64/ARMJIT_Linkage.s      | 68
-rw-r--r--  src/ARMJIT_A64/ARMJIT_LoadStore.cpp  | 790
6 files changed, 902 insertions, 632 deletions
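The headline change here is fastmem: guest memory is mirrored at a fixed base in the host address space, so compiled loads and stores become single host instructions, and accesses that fault get patched over to slow-path calls after the fact. A minimal sketch of the idea in illustrative C++ (FastMemBase and SlowRead32 are stand-in names, not identifiers from this commit):

    #include <cstdint>

    // The guest address space is mirrored contiguously at a fixed host base
    // (FastMem9Start/FastMem7Start in the diff below), so a guest word load
    // compiles down to a single host load off that base.
    extern uint8_t* FastMemBase;

    inline uint32_t FastRead32(uint32_t guestAddr)
    {
        return *reinterpret_cast<uint32_t*>(FastMemBase + (guestAddr & ~3u));
    }

    // Unmapped regions (e.g. MMIO) fault instead; the fault handler then
    // rewrites the faulting instruction into a call to a slow-path thunk.
    uint32_t SlowRead32(uint32_t guestAddr); // stand-in for SlowRead9/SlowRead7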
diff --git a/src/ARMJIT_A64/ARMJIT_ALU.cpp b/src/ARMJIT_A64/ARMJIT_ALU.cpp
index 0fe6a97..5f021a0 100644
--- a/src/ARMJIT_A64/ARMJIT_ALU.cpp
+++ b/src/ARMJIT_A64/ARMJIT_ALU.cpp
@@ -243,7 +243,7 @@ void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2
if (S && !CurInstr.SetFlags)
S = false;
- bool CVInGP = false;
+ bool CVInGPR = false;
switch (op)
{
case 0x2: // SUB
@@ -306,7 +306,7 @@ void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2
UBFX(W2, RCPSR, 29, 1);
if (S)
{
- CVInGP = true;
+ CVInGPR = true;
ADDS(W1, rn, W2);
CSET(W2, CC_CS);
CSET(W3, CC_VS);
@@ -335,7 +335,7 @@ void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2
ORN(W1, WZR, op2.Reg.Rm, op2.ToArithOption());
if (S)
{
- CVInGP = true;
+ CVInGPR = true;
ADDS(W1, W2, W1);
CSET(W2, CC_CS);
CSET(W3, CC_VS);
@@ -355,7 +355,7 @@ void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2
MVN(W1, rn);
if (S)
{
- CVInGP = true;
+ CVInGPR = true;
ADDS(W1, W2, W1);
CSET(W2, CC_CS);
CSET(W3, CC_VS);
@@ -379,12 +379,12 @@ void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2
if (S)
{
- if (CVInGP)
+ if (CVInGPR)
{
BFI(RCPSR, W2, 29, 1);
BFI(RCPSR, W3, 28, 1);
}
- Comp_RetriveFlags(!CVInGP);
+ Comp_RetriveFlags(!CVInGPR);
}
}
@@ -501,7 +501,23 @@ void Compiler::A_Comp_ALUMovOp()
MOVI2R(rd, op2.Imm);
}
else
- MOV(rd, op2.Reg.Rm, op2.ToArithOption());
+ {
+ // ORR with a shifted operand has extra cycles of latency
+ if (op2.Reg.ShiftAmount > 0)
+ {
+ switch (op2.Reg.ShiftType)
+ {
+ case ST_LSL: LSL(rd, op2.Reg.Rm, op2.Reg.ShiftAmount); break;
+ case ST_LSR: LSR(rd, op2.Reg.Rm, op2.Reg.ShiftAmount); break;
+ case ST_ASR: ASR(rd, op2.Reg.Rm, op2.Reg.ShiftAmount); break;
+ case ST_ROR: ROR_(rd, op2.Reg.Rm, op2.Reg.ShiftAmount); break;
+ }
+ }
+ else
+ {
+ MOV(rd, op2.Reg.Rm, op2.ToArithOption());
+ }
+ }
}
if (S)
@@ -558,10 +574,7 @@ void Compiler::Comp_Mul_Mla(bool S, bool mla, ARM64Reg rd, ARM64Reg rm, ARM64Reg
}
else
{
- CLZ(W0, rs);
- CLS(W1, rs);
- CMP(W0, W1);
- CSEL(W0, W0, W1, CC_GT);
+ CLS(W0, rs);
Comp_AddCycles_CI(mla ? 1 : 0, W0, ArithOption(W0, ST_LSR, 3));
}
@@ -594,10 +607,10 @@ void Compiler::A_Comp_Mul_Long()
}
else
{
- CLZ(W0, rs);
- CLS(W1, rs);
- CMP(W0, W1);
- CSEL(W0, W0, W1, CC_GT);
+ if (sign)
+ CLS(W0, rs);
+ else
+ CLZ(W0, rs);
Comp_AddCycles_CI(0, W0, ArithOption(W0, ST_LSR, 3));
}
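The old CLZ/CLS dance gets simplified: the multiplier's early-out depends on how many leading bytes of rs are redundant, which means leading zeroes for unsigned multiplies and leading sign-bit copies for signed ones, so a single CLS or CLZ chosen by signedness suffices. My reading of the timing rule, as a hedged C sketch (not code from the commit):

    #include <cstdint>

    int MulInternalCycles(uint32_t rs, bool sign)
    {
        // CLS counts leading redundant sign bits, CLZ leading zeroes;
        // the | 1 only guards __builtin_clz's undefined behaviour on 0.
        int redundant = sign ? __builtin_clrsb((int32_t)rs)
                             : __builtin_clz(rs | 1);
        return redundant / 8;  // the LSR #3 fed to Comp_AddCycles_CI above
    }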
@@ -628,6 +641,86 @@ void Compiler::A_Comp_Mul_Long()
Comp_RetriveFlags(false);
}
+void Compiler::A_Comp_Mul_Short()
+{
+ ARM64Reg rd = MapReg(CurInstr.A_Reg(16));
+ ARM64Reg rm = MapReg(CurInstr.A_Reg(0));
+ ARM64Reg rs = MapReg(CurInstr.A_Reg(8));
+ u32 op = (CurInstr.Instr >> 21) & 0xF;
+
+ bool x = CurInstr.Instr & (1 << 5);
+ bool y = CurInstr.Instr & (1 << 6);
+
+ SBFX(W1, rs, y ? 16 : 0, 16);
+
+ if (op == 0b1000)
+ {
+ // SMLAxy
+
+ SBFX(W0, rm, x ? 16 : 0, 16);
+
+ MUL(W0, W0, W1);
+
+ ORRI2R(W1, RCPSR, 0x08000000);
+
+ ARM64Reg rn = MapReg(CurInstr.A_Reg(12));
+ ADDS(rd, W0, rn);
+
+ CSEL(RCPSR, W1, RCPSR, CC_VS);
+
+ CPSRDirty = true;
+
+ Comp_AddCycles_C();
+ }
+ else if (op == 0b1011)
+ {
+ // SMULxy
+
+ SBFX(W0, rm, x ? 16 : 0, 16);
+
+ MUL(rd, W0, W1);
+
+ Comp_AddCycles_C();
+ }
+ else if (op == 0b1010)
+ {
+ // SMLALxy
+
+ ARM64Reg rn = MapReg(CurInstr.A_Reg(12));
+
+ MOV(W2, rn);
+ BFI(X2, rd, 32, 32);
+
+ SBFX(W0, rm, x ? 16 : 0, 16);
+
+ SMADDL(EncodeRegTo64(rn), W0, W1, X2);
+
+ UBFX(EncodeRegTo64(rd), EncodeRegTo64(rn), 32, 32);
+
+ Comp_AddCycles_CI(1);
+ }
+ else if (op == 0b1001)
+ {
+ // SMLAWy/SMULWy
+ SMULL(X0, rm, W1);
+ ASR(x ? EncodeRegTo64(rd) : X0, X0, 16);
+
+ if (!x)
+ {
+ ORRI2R(W1, RCPSR, 0x08000000);
+
+ ARM64Reg rn = MapReg(CurInstr.A_Reg(12));
+ ADDS(rd, W0, rn);
+
+ CSEL(RCPSR, W1, RCPSR, CC_VS);
+
+ CPSRDirty = true;
+ }
+
+ Comp_AddCycles_C();
+ }
+}
+
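For reference, the halfword multiplies this new function compiles boil down to the following C semantics (my paraphrase of the ARM ARM, not code from the commit); the ORRI2R/CSEL pair above is the sticky Q flag (CPSR bit 27, hence the 0x08000000 constant) being set on accumulator overflow:

    #include <cstdint>

    int32_t SMULxy(uint32_t rm, uint32_t rs, bool x, bool y)
    {
        int32_t a = (int16_t)(rm >> (x ? 16 : 0)); // top or bottom half of rm
        int32_t b = (int16_t)(rs >> (y ? 16 : 0)); // top or bottom half of rs
        return a * b;                // 16x16 never overflows 32 bits
    }

    int32_t SMLAxy(uint32_t rm, uint32_t rs, uint32_t rn,
                   bool x, bool y, bool& q)
    {
        int64_t wide = (int64_t)SMULxy(rm, rs, x, y) + (int32_t)rn;
        int32_t res = (int32_t)wide;
        if (wide != res)
            q = true;                // the ADDS/CC_VS check above
        return res;
    }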
void Compiler::A_Comp_Mul()
{
ARM64Reg rd = MapReg(CurInstr.A_Reg(16));
diff --git a/src/ARMJIT_A64/ARMJIT_Branch.cpp b/src/ARMJIT_A64/ARMJIT_Branch.cpp
index 542f0b7..f130938 100644
--- a/src/ARMJIT_A64/ARMJIT_Branch.cpp
+++ b/src/ARMJIT_A64/ARMJIT_Branch.cpp
@@ -143,7 +143,7 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles)
if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles)
ConstantCycles += cycles;
else
- ADD(RCycles, RCycles, cycles);
+ SUB(RCycles, RCycles, cycles);
}
@@ -152,23 +152,19 @@ void* Compiler::Gen_JumpTo9(int kind)
AlignCode16();
void* res = GetRXPtr();
- MOVI2R(W2, kCodeCacheTiming);
- // W1 - code cycles non branch
- // W2 - branch code cycles
LSR(W1, W0, 12);
- LSL(W1, W1, 2);
ADDI2R(W1, W1, offsetof(ARMv5, MemTimings), W2);
LDRB(W1, RCPU, W1);
- LDR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARMv5, ITCMSize));
+ LDR(INDEX_UNSIGNED, W2, RCPU, offsetof(ARMv5, ITCMSize));
STR(INDEX_UNSIGNED, W1, RCPU, offsetof(ARMv5, RegionCodeCycles));
- CMP(W0, W3);
- FixupBranch outsideITCM = B(CC_LO);
- MOVI2R(W1, 1);
- MOVI2R(W2, 1);
- SetJumpTarget(outsideITCM);
+ CMP(W1, 0xFF);
+ MOVI2R(W3, kCodeCacheTiming);
+ CSEL(W1, W3, W1, CC_EQ);
+ CMP(W0, W2);
+ CSINC(W1, W1, WZR, CC_HS);
FixupBranch switchToThumb;
if (kind == 0)
@@ -176,40 +172,36 @@ void* Compiler::Gen_JumpTo9(int kind)
if (kind == 0 || kind == 1)
{
- ANDI2R(W0, W0, ~3);
-
+ // ARM
if (kind == 0)
ANDI2R(RCPSR, RCPSR, ~0x20);
- ADD(W3, W0, 4);
- STR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARM, R[15]));
-
- ADD(W1, W1, W2);
- ADD(RCycles, RCycles, W1);
+ ANDI2R(W0, W0, ~3);
+ ADD(W0, W0, 4);
+ STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARMv5, R[15]));
+ ADD(W1, W1, W1);
+ SUB(RCycles, RCycles, W1);
RET();
}
+
if (kind == 0 || kind == 2)
{
+ // Thumb
if (kind == 0)
{
SetJumpTarget(switchToThumb);
-
ORRI2R(RCPSR, RCPSR, 0x20);
}
ANDI2R(W0, W0, ~1);
+ ADD(W0, W0, 2);
+ STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARMv5, R[15]));
- ADD(W3, W0, 2);
- STR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARM, R[15]));
-
- FixupBranch halfwordLoc = TBZ(W0, 1);
- ADD(W1, W1, W2);
- ADD(RCycles, RCycles, W1);
- RET();
-
- SetJumpTarget(halfwordLoc);
- ADD(RCycles, RCycles, W2);
+ ADD(W2, W1, W1);
+ TSTI2R(W0, 0x2);
+ CSEL(W1, W1, W2, CC_EQ);
+ SUB(RCycles, RCycles, W1);
RET();
}
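The rewrite above trades the old forward branches for a CMP/CSEL/CSINC chain; my reading of what it computes, as plain C (illustrative only):

    #include <cstdint>

    uint32_t CodeCycles9(uint32_t addr, uint32_t regionTiming,
                         uint32_t itcmSize, uint32_t kCodeCacheTiming)
    {
        uint32_t t = regionTiming;
        if (t == 0xFF)
            t = kCodeCacheTiming;  // the CMP #0xFF / CSEL pair
        if (addr < itcmSize)
            t = 1;                 // the CMP / CSINC (WZR + 1 inside ITCM)
        return t;
    }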
@@ -237,7 +229,7 @@ void* Compiler::Gen_JumpTo7(int kind)
UBFX(W2, W3, 0, 8);
UBFX(W3, W3, 8, 8);
ADD(W2, W3, W2);
- ADD(RCycles, RCycles, W2);
+ SUB(RCycles, RCycles, W2);
ANDI2R(W0, W0, ~3);
@@ -261,7 +253,7 @@ void* Compiler::Gen_JumpTo7(int kind)
UBFX(W2, W3, 16, 8);
UBFX(W3, W3, 24, 8);
ADD(W2, W3, W2);
- ADD(RCycles, RCycles, W2);
+ SUB(RCycles, RCycles, W2);
ANDI2R(W0, W0, ~1);
@@ -287,22 +279,11 @@ void Compiler::Comp_JumpTo(Arm64Gen::ARM64Reg addr, bool switchThumb, bool resto
}
else
{
- BitSet16 hiRegsLoaded(RegCache.DirtyRegs & 0xFF00);
- bool previouslyDirty = CPSRDirty;
+
+ bool cpsrDirty = CPSRDirty;
SaveCPSR();
-
- if (restoreCPSR)
- {
- if (Thumb || CurInstr.Cond() >= 0xE)
- RegCache.Flush();
- else
- {
- // the ugly way...
- // we only save them, to load and save them again
- for (int reg : hiRegsLoaded)
- SaveReg(reg, RegCache.Mapping[reg]);
- }
- }
+ SaveCycles();
+ PushRegs(restoreCPSR);
if (switchThumb)
MOV(W1, addr);
@@ -319,16 +300,12 @@ void Compiler::Comp_JumpTo(Arm64Gen::ARM64Reg addr, bool switchThumb, bool resto
QuickCallFunction(X3, jumpToTrampoline<ARMv5>);
else
QuickCallFunction(X3, jumpToTrampoline<ARMv4>);
-
- if (!Thumb && restoreCPSR && CurInstr.Cond() < 0xE)
- {
- for (int reg : hiRegsLoaded)
- LoadReg(reg, RegCache.Mapping[reg]);
- }
- if (previouslyDirty)
- LoadCPSR();
- CPSRDirty = previouslyDirty;
+ PopRegs(restoreCPSR);
+ LoadCycles();
+ LoadCPSR();
+ if (CurInstr.Cond() < 0xE)
+ CPSRDirty = cpsrDirty;
}
}
@@ -368,21 +345,13 @@ void Compiler::T_Comp_BCOND()
s32 offset = (s32)(CurInstr.Instr << 24) >> 23;
Comp_JumpTo(R15 + offset + 1, true);
- Comp_BranchSpecialBehaviour();
+ Comp_BranchSpecialBehaviour(true);
FixupBranch skipFailed = B();
SetJumpTarget(skipExecute);
Comp_AddCycles_C(true);
- if (CurInstr.BranchFlags & branch_FollowCondTaken)
- {
- SaveCPSR(false);
- RegCache.PrepareExit();
-
- ADD(W0, RCycles, ConstantCycles);
- ABI_PopRegisters(SavedRegs);
- RET();
- }
+ Comp_BranchSpecialBehaviour(false);
SetJumpTarget(skipFailed);
}
diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.cpp b/src/ARMJIT_A64/ARMJIT_Compiler.cpp
index a67f357..42435ed 100644
--- a/src/ARMJIT_A64/ARMJIT_Compiler.cpp
+++ b/src/ARMJIT_A64/ARMJIT_Compiler.cpp
@@ -1,9 +1,3 @@
-#include "ARMJIT_Compiler.h"
-
-#include "../ARMInterpreter.h"
-
-#include "../ARMJIT_Internal.h"
-
#ifdef __SWITCH__
#include "../switch/compat_switch.h"
@@ -13,10 +7,17 @@ extern char __start__;
#include <unistd.h>
#endif
+#include "ARMJIT_Compiler.h"
+
+#include "../ARMJIT_Internal.h"
+#include "../ARMInterpreter.h"
+#include "../Config.h"
+
#include <malloc.h>
using namespace Arm64Gen;
+extern "C" void ARM_Ret();
namespace ARMJIT
{
@@ -28,7 +29,10 @@ namespace ARMJIT
like x64. At one hand you can translate a lot of instructions directly.
But at the same time, there are a ton of exceptions, like for
example ADD and SUB can't have a RORed second operand on ARMv8.
- */
+
+ While writing a JIT, when an instruction is recompiled into multiple ones,
+ take care not to write back until you've read all the other operands!
+*/
template <>
const ARM64Reg RegisterCache<Compiler, ARM64Reg>::NativeRegAllocOrder[] =
@@ -46,6 +50,132 @@ void Compiler::MovePC()
ADD(MapReg(15), MapReg(15), Thumb ? 2 : 4);
}
+void Compiler::A_Comp_MRS()
+{
+ Comp_AddCycles_C();
+
+ ARM64Reg rd = MapReg(CurInstr.A_Reg(12));
+
+ if (CurInstr.Instr & (1 << 22))
+ {
+ ANDI2R(W5, RCPSR, 0x1F);
+ MOVI2R(W3, 0);
+ MOVI2R(W1, 15 - 8);
+ BL(ReadBanked);
+ MOV(rd, W3);
+ }
+ else
+ MOV(rd, RCPSR);
+}
+
+void Compiler::A_Comp_MSR()
+{
+ Comp_AddCycles_C();
+
+ ARM64Reg val;
+ if (CurInstr.Instr & (1 << 25))
+ {
+ val = W0;
+ MOVI2R(val, ROR((CurInstr.Instr & 0xFF), ((CurInstr.Instr >> 7) & 0x1E)));
+ }
+ else
+ {
+ val = MapReg(CurInstr.A_Reg(0));
+ }
+
+ u32 mask = 0;
+ if (CurInstr.Instr & (1<<16)) mask |= 0x000000FF;
+ if (CurInstr.Instr & (1<<17)) mask |= 0x0000FF00;
+ if (CurInstr.Instr & (1<<18)) mask |= 0x00FF0000;
+ if (CurInstr.Instr & (1<<19)) mask |= 0xFF000000;
+
+ if (CurInstr.Instr & (1 << 22))
+ {
+ ANDI2R(W5, RCPSR, 0x1F);
+ MOVI2R(W3, 0);
+ MOVI2R(W1, 15 - 8);
+ BL(ReadBanked);
+
+ MOVI2R(W1, mask);
+ MOVI2R(W2, mask & 0xFFFFFF00);
+ ANDI2R(W5, RCPSR, 0x1F);
+ CMP(W5, 0x10);
+ CSEL(W1, W2, W1, CC_EQ);
+
+ BIC(W3, W3, W1);
+ AND(W0, val, W1);
+ ORR(W3, W3, W0);
+
+ MOVI2R(W1, 15 - 8);
+
+ BL(WriteBanked);
+ }
+ else
+ {
+ mask &= 0xFFFFFFDF;
+ CPSRDirty = true;
+
+ if ((mask & 0xFF) == 0)
+ {
+ ANDI2R(RCPSR, RCPSR, ~mask);
+ ANDI2R(W0, val, mask);
+ ORR(RCPSR, RCPSR, W0);
+ }
+ else
+ {
+ MOVI2R(W2, mask);
+ MOVI2R(W3, mask & 0xFFFFFF00);
+ ANDI2R(W1, RCPSR, 0x1F);
+ // W1 = first argument
+ CMP(W1, 0x10);
+ CSEL(W2, W3, W2, CC_EQ);
+
+ BIC(RCPSR, RCPSR, W2);
+ AND(W0, val, W2);
+ ORR(RCPSR, RCPSR, W0);
+
+ MOV(W2, RCPSR);
+ MOV(X0, RCPU);
+
+ PushRegs(true);
+
+ QuickCallFunction(X3, (void*)&ARM::UpdateMode);
+
+ PopRegs(true);
+ }
+ }
+}
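The mask juggling in the CPSR path amounts to the following, in plain C (my paraphrase of the MSR semantics, not code from the commit); the SPSR path does the equivalent through ReadBanked/WriteBanked:

    #include <cstdint>

    uint32_t ApplyMSR(uint32_t psr, uint32_t val, uint32_t mask, bool userMode)
    {
        mask &= 0xFFFFFFDF;          // never let MSR flip the T bit (0x20)
        if (userMode)                // the CMP #0x10 / CSEL pair above
            mask &= 0xFFFFFF00;      // user mode may only touch the flags
        return (psr & ~mask) | (val & mask);
    }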
+
+void Compiler::PushRegs(bool saveHiRegs)
+{
+ if (saveHiRegs)
+ {
+ if (Thumb || CurInstr.Cond() == 0xE)
+ {
+ BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00);
+ for (int reg : hiRegsLoaded)
+ RegCache.UnloadRegister(reg);
+ }
+ else
+ {
+ BitSet16 hiRegsDirty(RegCache.LoadedRegs & 0x7F00);
+ for (int reg : hiRegsDirty)
+ SaveReg(reg, RegCache.Mapping[reg]);
+ }
+ }
+}
+
+void Compiler::PopRegs(bool saveHiRegs)
+{
+ if (saveHiRegs)
+ {
+ BitSet16 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00);
+
+ for (int reg : hiRegsLoaded)
+ LoadReg(reg, RegCache.Mapping[reg]);
+ }
+}
+
Compiler::Compiler()
{
#ifdef __SWITCH__
@@ -80,8 +210,7 @@ Compiler::Compiler()
assert(succeded);
SetCodeBase((u8*)JitRWStart, (u8*)JitRXStart);
- JitMemUseableSize = JitMemSize;
- Reset();
+ JitMemMainSize = JitMemSize;
#else
u64 pageSize = sysconf(_SC_PAGE_SIZE);
u8* pageAligned = (u8*)(((u64)JitMem & ~(pageSize - 1)) + pageSize);
@@ -90,31 +219,8 @@ Compiler::Compiler()
SetCodeBase(pageAligned, pageAligned);
JitMemUseableSize = alignedSize;
- Reset();
#endif
-
- for (int i = 0; i < 3; i++)
- {
- for (int j = 0; j < 2; j++)
- {
- MemFunc9[i][j] = Gen_MemoryRoutine9(8 << i, j);
- }
- }
- MemFunc7[0][0] = (void*)NDS::ARM7Read8;
- MemFunc7[1][0] = (void*)NDS::ARM7Read16;
- MemFunc7[2][0] = (void*)NDS::ARM7Read32;
- MemFunc7[0][1] = (void*)NDS::ARM7Write8;
- MemFunc7[1][1] = (void*)NDS::ARM7Write16;
- MemFunc7[2][1] = (void*)NDS::ARM7Write32;
-
- for (int i = 0; i < 2; i++)
- {
- for (int j = 0; j < 2; j++)
- {
- MemFuncsSeq9[i][j] = Gen_MemoryRoutine9Seq(i, j);
- MemFuncsSeq7[i][j] = Gen_MemoryRoutine7Seq(i, j);
- }
- }
+ SetCodePtr(0);
for (int i = 0; i < 3; i++)
{
@@ -123,26 +229,26 @@ Compiler::Compiler()
}
/*
- W0 - mode
+ W5 - mode
W1 - reg num
W3 - in/out value of reg
*/
{
ReadBanked = GetRXPtr();
- ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2));
- CMP(W0, 0x11);
+ ADD(X2, RCPU, X1, ArithOption(X2, ST_LSL, 2));
+ CMP(W5, 0x11);
FixupBranch fiq = B(CC_EQ);
SUBS(W1, W1, 13 - 8);
- ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2));
+ ADD(X2, RCPU, X1, ArithOption(X2, ST_LSL, 2));
FixupBranch notEverything = B(CC_LT);
- CMP(W0, 0x12);
+ CMP(W5, 0x12);
FixupBranch irq = B(CC_EQ);
- CMP(W0, 0x13);
+ CMP(W5, 0x13);
FixupBranch svc = B(CC_EQ);
- CMP(W0, 0x17);
+ CMP(W5, 0x17);
FixupBranch abt = B(CC_EQ);
- CMP(W0, 0x1B);
+ CMP(W5, 0x1B);
FixupBranch und = B(CC_EQ);
SetJumpTarget(notEverything);
RET();
@@ -166,19 +272,19 @@ Compiler::Compiler()
{
WriteBanked = GetRXPtr();
- ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2));
- CMP(W0, 0x11);
+ ADD(X2, RCPU, X1, ArithOption(X2, ST_LSL, 2));
+ CMP(W5, 0x11);
FixupBranch fiq = B(CC_EQ);
SUBS(W1, W1, 13 - 8);
- ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2));
+ ADD(X2, RCPU, X1, ArithOption(X2, ST_LSL, 2));
FixupBranch notEverything = B(CC_LT);
- CMP(W0, 0x12);
+ CMP(W5, 0x12);
FixupBranch irq = B(CC_EQ);
- CMP(W0, 0x13);
+ CMP(W5, 0x13);
FixupBranch svc = B(CC_EQ);
- CMP(W0, 0x17);
+ CMP(W5, 0x17);
FixupBranch abt = B(CC_EQ);
- CMP(W0, 0x1B);
+ CMP(W5, 0x1B);
FixupBranch und = B(CC_EQ);
SetJumpTarget(notEverything);
MOVI2R(W4, 0);
@@ -206,9 +312,71 @@ Compiler::Compiler()
RET();
}
- //FlushIcache();
+ for (int num = 0; num < 2; num++)
+ {
+ for (int size = 0; size < 3; size++)
+ {
+ for (int reg = 0; reg < 8; reg++)
+ {
+ ARM64Reg rdMapped = (ARM64Reg)(W19 + reg);
+ PatchedStoreFuncs[num][size][reg] = GetRXPtr();
+ if (num == 0)
+ {
+ MOV(X1, RCPU);
+ MOV(W2, rdMapped);
+ }
+ else
+ {
+ MOV(W1, rdMapped);
+ }
+ ABI_PushRegisters({30});
+ switch ((8 << size) | num)
+ {
+ case 32: QuickCallFunction(X3, SlowWrite9<u32>); break;
+ case 33: QuickCallFunction(X3, SlowWrite7<u32>); break;
+ case 16: QuickCallFunction(X3, SlowWrite9<u16>); break;
+ case 17: QuickCallFunction(X3, SlowWrite7<u16>); break;
+ case 8: QuickCallFunction(X3, SlowWrite9<u8>); break;
+ case 9: QuickCallFunction(X3, SlowWrite7<u8>); break;
+ }
+ ABI_PopRegisters({30});
+ RET();
+
+ for (int signextend = 0; signextend < 2; signextend++)
+ {
+ PatchedLoadFuncs[num][size][signextend][reg] = GetRXPtr();
+ if (num == 0)
+ MOV(X1, RCPU);
+ ABI_PushRegisters({30});
+ switch ((8 << size) | num)
+ {
+ case 32: QuickCallFunction(X3, SlowRead9<u32>); break;
+ case 33: QuickCallFunction(X3, SlowRead7<u32>); break;
+ case 16: QuickCallFunction(X3, SlowRead9<u16>); break;
+ case 17: QuickCallFunction(X3, SlowRead7<u16>); break;
+ case 8: QuickCallFunction(X3, SlowRead9<u8>); break;
+ case 9: QuickCallFunction(X3, SlowRead7<u8>); break;
+ }
+ ABI_PopRegisters({30});
+ if (size == 32)
+ MOV(rdMapped, W0);
+ else if (signextend)
+ SBFX(rdMapped, W0, 0, 8 << size);
+ else
+ UBFX(rdMapped, W0, 0, 8 << size);
+ RET();
+ }
+ }
+ }
+ }
+
+ FlushIcache();
+
+ JitMemSecondarySize = 1024*1024*4;
+
+ JitMemMainSize -= GetCodeOffset();
+ JitMemMainSize -= JitMemSecondarySize;
- JitMemUseableSize -= GetCodeOffset();
SetCodeBase((u8*)GetRWPtr(), (u8*)GetRXPtr());
}
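These per-register thunks exist so that a patched fastmem access can be redirected with a single BL: the destination register (always one of W19-W26) and the access width are baked into the thunk itself. The tables are indexed the way Comp_MemAccess does it later in this diff:

    // size is 8/16/32, so __builtin_ctz(size) - 3 maps it to index 0/1/2
    void* thunk = store
        ? PatchedStoreFuncs[Num][__builtin_ctz(size) - 3][rdMapped - W19]
        : PatchedLoadFuncs[Num][__builtin_ctz(size) - 3][signExtend][rdMapped - W19];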
@@ -227,6 +395,16 @@ Compiler::~Compiler()
#endif
}
+void Compiler::LoadCycles()
+{
+ LDR(INDEX_UNSIGNED, RCycles, RCPU, offsetof(ARM, Cycles));
+}
+
+void Compiler::SaveCycles()
+{
+ STR(INDEX_UNSIGNED, RCycles, RCPU, offsetof(ARM, Cycles));
+}
+
void Compiler::LoadReg(int reg, ARM64Reg nativeReg)
{
if (reg == 15)
@@ -325,7 +503,7 @@ const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] =
// CMN
F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp),
// Mul
- F(Mul), F(Mul), F(Mul_Long), F(Mul_Long), F(Mul_Long), F(Mul_Long), NULL, NULL, NULL, NULL, NULL,
+ F(Mul), F(Mul), F(Mul_Long), F(Mul_Long), F(Mul_Long), F(Mul_Long), F(Mul_Short), F(Mul_Short), F(Mul_Short), F(Mul_Short), F(Mul_Short),
// ARMv5 exclusives
F(Clz), NULL, NULL, NULL, NULL,
@@ -356,7 +534,7 @@ const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] =
// Branch
F(BranchImm), F(BranchImm), F(BranchImm), F(BranchXchangeReg), F(BranchXchangeReg),
// Special
- NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, F(MSR), F(MSR), F(MRS), NULL, NULL, NULL,
&Compiler::Nop
};
#undef F
@@ -404,29 +582,34 @@ bool Compiler::CanCompile(bool thumb, u16 kind)
return (thumb ? T_Comp[kind] : A_Comp[kind]) != NULL;
}
-void Compiler::Comp_BranchSpecialBehaviour()
+void Compiler::Comp_BranchSpecialBehaviour(bool taken)
{
- if (CurInstr.BranchFlags & branch_IdleBranch)
+ if (taken && CurInstr.BranchFlags & branch_IdleBranch)
{
MOVI2R(W0, 1);
STRB(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, IdleLoop));
}
- if (CurInstr.BranchFlags & branch_FollowCondNotTaken)
+ if ((CurInstr.BranchFlags & branch_FollowCondNotTaken && taken)
+ || (CurInstr.BranchFlags & branch_FollowCondTaken && !taken))
{
- SaveCPSR(false);
RegCache.PrepareExit();
- ADD(W0, RCycles, ConstantCycles);
- ABI_PopRegisters(SavedRegs);
- RET();
+
+ SUB(RCycles, RCycles, ConstantCycles);
+ QuickTailCall(X0, ARM_Ret);
}
}
JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount)
{
- if (JitMemUseableSize - GetCodeOffset() < 1024 * 16)
+ if (JitMemMainSize - GetCodeOffset() < 1024 * 16)
+ {
+ printf("JIT near memory full, resetting...\n");
+ ResetBlockCache();
+ }
+ if ((JitMemMainSize + JitMemSecondarySize) - OtherCodeRegion < 1024 * 8)
{
- printf("JIT memory full, resetting...\n");
+ printf("JIT far memory full, resetting...\n");
ResetBlockCache();
}
@@ -437,21 +620,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[]
CurCPU = cpu;
ConstantCycles = 0;
RegCache = RegisterCache<Compiler, ARM64Reg>(this, instrs, instrsCount, true);
-
- //printf("compiling block at %x\n", R15 - (Thumb ? 2 : 4));
- const u32 ALL_CALLEE_SAVED = 0x7FF80000;
-
- SavedRegs = BitSet32((RegCache.GetPushRegs() | BitSet32(0x78000000)) & BitSet32(ALL_CALLEE_SAVED));
-
- //if (Num == 1)
- {
- ABI_PushRegisters(SavedRegs);
-
- MOVP2R(RCPU, CurCPU);
- MOVI2R(RCycles, 0);
-
- LoadCPSR();
- }
+ CPSRDirty = false;
for (int i = 0; i < instrsCount; i++)
{
@@ -486,6 +655,7 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[]
if (comp == NULL)
{
+ SaveCycles();
SaveCPSR();
RegCache.Flush();
}
@@ -535,25 +705,18 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[]
(this->*comp)();
}
- Comp_BranchSpecialBehaviour();
+ Comp_BranchSpecialBehaviour(true);
if (cond < 0xE)
{
- if (IrregularCycles)
+ if (IrregularCycles || (CurInstr.BranchFlags & branch_FollowCondTaken))
{
FixupBranch skipNop = B();
SetJumpTarget(skipExecute);
Comp_AddCycles_C();
- if (CurInstr.BranchFlags & branch_FollowCondTaken)
- {
- SaveCPSR(false);
- RegCache.PrepareExit();
- ADD(W0, RCycles, ConstantCycles);
- ABI_PopRegisters(SavedRegs);
- RET();
- }
+ Comp_BranchSpecialBehaviour(false);
SetJumpTarget(skipNop);
}
@@ -565,76 +728,74 @@ JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[]
}
if (comp == NULL)
+ {
+ LoadCycles();
LoadCPSR();
+ }
}
RegCache.Flush();
- //if (Num == 1)
- {
- SaveCPSR();
-
- ADD(W0, RCycles, ConstantCycles);
-
- ABI_PopRegisters(SavedRegs);
- }
- //else
- // ADD(RCycles, RCycles, ConstantCycles);
-
- RET();
+ SUB(RCycles, RCycles, ConstantCycles);
+ QuickTailCall(X0, ARM_Ret);
FlushIcache();
- //printf("finished\n");
-
return res;
}
void Compiler::Reset()
{
+ LoadStorePatches.clear();
+
SetCodePtr(0);
+ OtherCodeRegion = JitMemMainSize;
const u32 brk_0 = 0xD4200000;
- for (int i = 0; i < JitMemUseableSize / 4; i++)
+ for (int i = 0; i < (JitMemMainSize + JitMemSecondarySize) / 4; i++)
*(((u32*)GetRWPtr()) + i) = brk_0;
}
-void Compiler::Comp_AddCycles_C(bool nonConst)
+void Compiler::Comp_AddCycles_C(bool forceNonConstant)
{
s32 cycles = Num ?
NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 1 : 3]
: ((R15 & 0x2) ? 0 : CurInstr.CodeCycles);
- if (!nonConst && !CurInstr.Info.Branches())
+ if (forceNonConstant)
ConstantCycles += cycles;
else
- ADD(RCycles, RCycles, cycles);
+ SUB(RCycles, RCycles, cycles);
}
void Compiler::Comp_AddCycles_CI(u32 numI)
{
+ IrregularCycles = true;
+
s32 cycles = (Num ?
NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]
: ((R15 & 0x2) ? 0 : CurInstr.CodeCycles)) + numI;
- if (Thumb || CurInstr.Cond() >= 0xE)
+ if (Thumb || CurInstr.Cond() == 0xE)
ConstantCycles += cycles;
else
- ADD(RCycles, RCycles, cycles);
+ SUB(RCycles, RCycles, cycles);
}
void Compiler::Comp_AddCycles_CI(u32 c, ARM64Reg numI, ArithOption shift)
{
+ IrregularCycles = true;
+
s32 cycles = (Num ?
NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]
: ((R15 & 0x2) ? 0 : CurInstr.CodeCycles)) + c;
- ADD(RCycles, RCycles, numI, shift);
+ SUB(RCycles, RCycles, numI, shift);
if (Thumb || CurInstr.Cond() >= 0xE)
- ConstantCycles += c;
+ ConstantCycles += cycles;
else
- ADD(RCycles, RCycles, cycles);
+ SUB(RCycles, RCycles, cycles);
}
void Compiler::Comp_AddCycles_CDI()
@@ -671,7 +832,7 @@ void Compiler::Comp_AddCycles_CDI()
}
if (!Thumb && CurInstr.Cond() < 0xE)
- ADD(RCycles, RCycles, cycles);
+ SUB(RCycles, RCycles, cycles);
else
ConstantCycles += cycles;
}
@@ -715,7 +876,7 @@ void Compiler::Comp_AddCycles_CD()
}
if ((!Thumb && CurInstr.Cond() < 0xE) && IrregularCycles)
- ADD(RCycles, RCycles, cycles);
+ SUB(RCycles, RCycles, cycles);
else
ConstantCycles += cycles;
}
diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.h b/src/ARMJIT_A64/ARMJIT_Compiler.h
index 5c9ef41..e4ffc63 100644
--- a/src/ARMJIT_A64/ARMJIT_Compiler.h
+++ b/src/ARMJIT_A64/ARMJIT_Compiler.h
@@ -9,6 +9,8 @@
#include "../ARMJIT_Internal.h"
#include "../ARMJIT_RegisterCache.h"
+#include <unordered_map>
+
namespace ARMJIT
{
@@ -64,7 +66,14 @@ struct Op2
};
};
-class Compiler : Arm64Gen::ARM64XEmitter
+struct LoadStorePatch
+{
+ void* PatchFunc;
+ s32 PatchOffset;
+ u32 PatchSize;
+};
+
+class Compiler : public Arm64Gen::ARM64XEmitter
{
public:
typedef void (Compiler::*CompileFunc)();
@@ -72,6 +81,9 @@ public:
Compiler();
~Compiler();
+ void PushRegs(bool saveHiRegs);
+ void PopRegs(bool saveHiRegs);
+
Arm64Gen::ARM64Reg MapReg(int reg)
{
assert(RegCache.Mapping[reg] != Arm64Gen::INVALID_REG);
@@ -89,7 +101,7 @@ public:
void Reset();
- void Comp_AddCycles_C(bool forceNonConst = false);
+ void Comp_AddCycles_C(bool forceNonConstant = false);
void Comp_AddCycles_CI(u32 numI);
void Comp_AddCycles_CI(u32 c, Arm64Gen::ARM64Reg numI, Arm64Gen::ArithOption shift);
void Comp_AddCycles_CD();
@@ -103,6 +115,9 @@ public:
void LoadCPSR();
void SaveCPSR(bool markClean = true);
+ void LoadCycles();
+ void SaveCycles();
+
void Nop() {}
void A_Comp_ALUTriOp();
@@ -111,6 +126,7 @@ public:
void A_Comp_Mul();
void A_Comp_Mul_Long();
+ void A_Comp_Mul_Short();
void A_Comp_Clz();
@@ -122,6 +138,8 @@ public:
void A_Comp_BranchImm();
void A_Comp_BranchXchangeReg();
+ void A_Comp_MRS();
+ void A_Comp_MSR();
void T_Comp_ShiftImm();
void T_Comp_AddSub_();
@@ -168,7 +186,7 @@ public:
void Comp_RegShiftImm(int op, int amount, bool S, Op2& op2, Arm64Gen::ARM64Reg tmp = Arm64Gen::W0);
void Comp_RegShiftReg(int op, bool S, Op2& op2, Arm64Gen::ARM64Reg rs);
- void Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr);
+ bool Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr);
enum
{
memop_Writeback = 1 << 0,
@@ -179,16 +197,33 @@ public:
};
void Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags);
- void* Gen_MemoryRoutine9(int size, bool store);
-
- void* Gen_MemoryRoutine9Seq(bool store, bool preinc);
- void* Gen_MemoryRoutine7Seq(bool store, bool preinc);
-
// 0 = switch mode, 1 = stay arm, 2 = stay thumb
void* Gen_JumpTo9(int kind);
void* Gen_JumpTo7(int kind);
- void Comp_BranchSpecialBehaviour();
+ void Comp_BranchSpecialBehaviour(bool taken);
+
+ JitBlockEntry AddEntryOffset(u32 offset)
+ {
+ return (JitBlockEntry)(GetRXBase() + offset);
+ }
+
+ u32 SubEntryOffset(JitBlockEntry entry)
+ {
+ return (u8*)entry - GetRXBase();
+ }
+
+ bool IsJITFault(u64 pc);
+ s64 RewriteMemAccess(u64 pc);
+
+ void SwapCodeRegion()
+ {
+ ptrdiff_t offset = GetCodeOffset();
+ SetCodePtrUnsafe(OtherCodeRegion);
+ OtherCodeRegion = offset;
+ }
+
+ ptrdiff_t OtherCodeRegion;
bool Exit;
@@ -202,22 +237,20 @@ public:
BitSet32 SavedRegs;
- u32 JitMemUseableSize;
+ u32 JitMemSecondarySize;
+ u32 JitMemMainSize;
void* ReadBanked, *WriteBanked;
- // [size][store]
- void* MemFunc9[3][2];
- void* MemFunc7[3][2];
-
- // [store][pre increment]
- void* MemFuncsSeq9[2][2];
- // "[code in main ram]
- void* MemFuncsSeq7[2][2];
-
void* JumpToFuncs9[3];
void* JumpToFuncs7[3];
+ std::unordered_map<ptrdiff_t, LoadStorePatch> LoadStorePatches;
+
+ // [Num][Size][Sign Extend][Output register]
+ void* PatchedLoadFuncs[2][3][2][8];
+ void* PatchedStoreFuncs[2][3][8];
+
RegisterCache<Compiler, Arm64Gen::ARM64Reg> RegCache;
bool CPSRDirty = false;
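The two-region split is what SwapCodeRegion above is for: blocks are emitted into the main region, out-of-line slow paths into the secondary one. A sketch of the intended usage (Comp_MemAccessBlock in this diff follows exactly this pattern):

    SwapCodeRegion();              // redirect emission to the far region
    patch.PatchFunc = GetRXPtr();  // the slow-path thunk starts here
    // ... emit the slow path, ending in RET() ...
    SwapCodeRegion();              // resume the block where it left off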
diff --git a/src/ARMJIT_A64/ARMJIT_Linkage.s b/src/ARMJIT_A64/ARMJIT_Linkage.s
new file mode 100644
index 0000000..536a478
--- /dev/null
+++ b/src/ARMJIT_A64/ARMJIT_Linkage.s
@@ -0,0 +1,68 @@
+#include "../ARMJIT_x64/ARMJIT_Offsets.h"
+
+.text
+
+#define RCPSR W27
+#define RCycles W28
+#define RCPU X29
+
+.p2align 4,,15
+
+.global ARM_Dispatch
+ARM_Dispatch:
+ stp x19, x20, [sp, #-96]!
+ stp x21, x22, [sp, #16]
+ stp x23, x24, [sp, #32]
+ stp x25, x26, [sp, #48]
+ stp x27, x28, [sp, #64]
+ stp x29, x30, [sp, #80]
+
+ mov RCPU, x0
+ ldr RCycles, [RCPU, ARM_Cycles_offset]
+ ldr RCPSR, [RCPU, ARM_CPSR_offset]
+
+ br x1
+
+.p2align 4,,15
+
+.global ARM_Ret
+ARM_Ret:
+ str RCycles, [RCPU, ARM_Cycles_offset]
+ str RCPSR, [RCPU, ARM_CPSR_offset]
+
+ ldp x29, x30, [sp, #80]
+ ldp x27, x28, [sp, #64]
+ ldp x25, x26, [sp, #48]
+ ldp x23, x24, [sp, #32]
+ ldp x21, x22, [sp, #16]
+ ldp x19, x20, [sp], #96
+
+ ret
+
+.p2align 4,,15
+
+.global ARM_RestoreContext
+ARM_RestoreContext:
+ mov sp, x0
+
+ ldp x0, x1, [sp]
+ ldp x2, x3, [sp, #16]
+ ldp x4, x5, [sp, #32]
+ ldp x6, x7, [sp, #48]
+ ldp x8, x9, [sp, #64]
+ ldp x10, x11, [sp, #80]
+ ldp x12, x13, [sp, #96]
+ ldp x14, x15, [sp, #112]
+ ldp x16, x17, [sp, #128]
+ ldp x18, x19, [sp, #144]
+ ldp x20, x21, [sp, #160]
+ ldp x22, x23, [sp, #176]
+ ldp x24, x25, [sp, #192]
+ ldp x26, x27, [sp, #208]
+ ldp x28, x29, [sp, #224]
+ ldr x30, [sp, #240]
+
+ ldp x17, x18, [sp, #248]
+ mov sp, x17
+
+ br x18
\ No newline at end of file
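A hedged sketch of the dispatcher contract this file establishes (the C prototype is an assumption; only the register assignments are spelled out above):

    // x0 = cpu, x1 = block entry point, per the AAPCS argument registers
    extern "C" void ARM_Dispatch(ARM* cpu, JitBlockEntry block);

    // On entry to a block: RCPU (x29) = cpu, RCycles (w28) = cpu->Cycles,
    // RCPSR (w27) = cpu->CPSR. ARM_Ret stores RCycles/RCPSR back and pops
    // the host frame, which is why compiled blocks now end with
    // QuickTailCall(X0, ARM_Ret) instead of a plain RET -- and why every
    // cycle charge in this diff flipped from ADD to SUB on RCycles.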
diff --git a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp
index 6cf710b..b307d0e 100644
--- a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp
+++ b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp
@@ -2,286 +2,62 @@
#include "../Config.h"
+#include "../ARMJIT_Memory.h"
+
using namespace Arm64Gen;
namespace ARMJIT
{
-// W0 - address
-// (if store) W1 - value to store
-// W2 - code cycles
-void* Compiler::Gen_MemoryRoutine9(int size, bool store)
+bool Compiler::IsJITFault(u64 pc)
{
- AlignCode16();
- void* res = GetRXPtr();
-
- u32 addressMask;
- switch (size)
- {
- case 32: addressMask = ~3; break;
- case 16: addressMask = ~1; break;
- case 8: addressMask = ~0; break;
- }
-
- LDR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARMv5, DTCMBase));
- LDR(INDEX_UNSIGNED, W4, RCPU, offsetof(ARMv5, DTCMSize));
- SUB(W3, W0, W3);
- CMP(W3, W4);
- FixupBranch insideDTCM = B(CC_LO);
-
- UBFX(W4, W0, 24, 8);
- CMP(W4, 0x02);
- FixupBranch outsideMainRAM = B(CC_NEQ);
- ANDI2R(W3, W0, addressMask & (MAIN_RAM_SIZE - 1));
- MOVP2R(X4, NDS::MainRAM);
- if (!store && size == 32)
- {
- LDR(W3, X3, X4);
- ANDI2R(W0, W0, 3);
- LSL(W0, W0, 3);
- RORV(W0, W3, W0);
- }
- else if (store)
- STRGeneric(size, W1, X3, X4);
- else
- LDRGeneric(size, false, W0, X3, X4);
- RET();
-
- SetJumpTarget(outsideMainRAM);
-
- LDR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARMv5, ITCMSize));
- CMP(W0, W3);
- FixupBranch insideITCM = B(CC_LO);
-
- if (store)
- {
- if (size > 8)
- ANDI2R(W0, W0, addressMask);
-
- switch (size)
- {
- case 32: QuickTailCall(X4, NDS::ARM9Write32); break;
- case 16: QuickTailCall(X4, NDS::ARM9Write16); break;
- case 8: QuickTailCall(X4, NDS::ARM9Write8); break;
- }
- }
- else
- {
- if (size == 32)
- ABI_PushRegisters({0, 30});
- if (size > 8)
- ANDI2R(W0, W0, addressMask);
-
- switch (size)
- {
- case 32: QuickCallFunction(X4, NDS::ARM9Read32); break;
- case 16: QuickTailCall (X4, NDS::ARM9Read16); break;
- case 8: QuickTailCall (X4, NDS::ARM9Read8 ); break;
- }
- if (size == 32)
- {
- ABI_PopRegisters({1, 30});
- ANDI2R(W1, W1, 3);
- LSL(W1, W1, 3);
- RORV(W0, W0, W1);
- RET();
- }
- }
-
- SetJumpTarget(insideDTCM);
- ANDI2R(W3, W3, 0x3FFF & addressMask);
- ADDI2R(W3, W3, offsetof(ARMv5, DTCM), W4);
- if (!store && size == 32)
- {
- ANDI2R(W4, W0, 3);
- LDR(W0, RCPU, W3);
- LSL(W4, W4, 3);
- RORV(W0, W0, W4);
- }
- else if (store)
- STRGeneric(size, W1, RCPU, W3);
- else
- LDRGeneric(size, false, W0, RCPU, W3);
-
- RET();
-
- SetJumpTarget(insideITCM);
- ANDI2R(W3, W0, 0x7FFF & addressMask);
- if (store)
- {
- ADDI2R(W0, W3, ExeMemRegionOffsets[exeMem_ITCM], W4);
- LSR(W5, W0, 9);
- MOVP2R(X4, CodeRanges);
- ADD(X4, X4, X5, ArithOption(X5, ST_LSL, 4));
- static_assert(sizeof(AddressRange) == 16);
- LDRH(INDEX_UNSIGNED, W4, X4, offsetof(AddressRange, Blocks.Length));
- FixupBranch null = CBZ(W4);
- ABI_PushRegisters({1, 3, 30});
- QuickCallFunction(X4, InvalidateByAddr);
- ABI_PopRegisters({1, 3, 30});
- SetJumpTarget(null);
- }
- ADDI2R(W3, W3, offsetof(ARMv5, ITCM), W4);
- if (!store && size == 32)
- {
- ANDI2R(W4, W0, 3);
- LDR(W0, RCPU, W3);
- LSL(W4, W4, 3);
- RORV(W0, W0, W4);
- }
- else if (store)
- STRGeneric(size, W1, RCPU, W3);
- else
- LDRGeneric(size, false, W0, RCPU, W3);
- RET();
-
- return res;
+ return pc >= (u64)GetRXBase() && pc - (u64)GetRXBase() < (JitMemMainSize + JitMemSecondarySize);
}
-/*
- W0 - base address
- X1 - stack space
- W2 - values count
-*/
-void* Compiler::Gen_MemoryRoutine9Seq(bool store, bool preinc)
+s64 Compiler::RewriteMemAccess(u64 pc)
{
- AlignCode16();
- void* res = GetRXPtr();
-
- void* loopStart = GetRXPtr();
- SUB(W2, W2, 1);
-
- if (preinc)
- ADD(W0, W0, 4);
+ ptrdiff_t pcOffset = pc - (u64)GetRXBase();
- LDR(INDEX_UNSIGNED, W4, RCPU, offsetof(ARMv5, DTCMBase));
- LDR(INDEX_UNSIGNED, W5, RCPU, offsetof(ARMv5, DTCMSize));
- SUB(W4, W0, W4);
- CMP(W4, W5);
- FixupBranch insideDTCM = B(CC_LO);
+ auto it = LoadStorePatches.find(pcOffset);
- LDR(INDEX_UNSIGNED, W4, RCPU, offsetof(ARMv5, ITCMSize));
- CMP(W0, W4);
- FixupBranch insideITCM = B(CC_LO);
-
- ABI_PushRegisters({0, 1, 2, 30}); // TODO: move SP only once
- if (store)
+ if (it != LoadStorePatches.end())
{
- LDR(X1, X1, ArithOption(X2, true));
- QuickCallFunction(X4, NDS::ARM9Write32);
+ LoadStorePatch patch = it->second;
- ABI_PopRegisters({0, 1, 2, 30});
- }
- else
- {
- QuickCallFunction(X4, NDS::ARM9Read32);
- MOV(W4, W0);
+ ptrdiff_t curCodeOffset = GetCodeOffset();
- ABI_PopRegisters({0, 1, 2, 30});
+ SetCodePtrUnsafe(pcOffset + patch.PatchOffset);
- STR(X4, X1, ArithOption(X2, true));
- }
+ BL(patch.PatchFunc);
- if (!preinc)
- ADD(W0, W0, 4);
- CBNZ(W2, loopStart);
- RET();
+ for (int i = 0; i < patch.PatchSize / 4 - 1; i++)
+ HINT(HINT_NOP);
- SetJumpTarget(insideDTCM);
+ FlushIcacheSection((u8*)pc + patch.PatchOffset, (u8*)GetRXPtr());
- ANDI2R(W4, W4, ~3 & 0x3FFF);
- ADDI2R(X4, X4, offsetof(ARMv5, DTCM));
- if (store)
- {
- LDR(X5, X1, ArithOption(X2, true));
- STR(W5, RCPU, X4);
- }
- else
- {
- LDR(W5, RCPU, X4);
- STR(X5, X1, ArithOption(X2, true));
- }
+ SetCodePtrUnsafe(curCodeOffset);
- if (!preinc)
- ADD(W0, W0, 4);
- CBNZ(W2, loopStart);
- RET();
-
- SetJumpTarget(insideITCM);
-
- ANDI2R(W4, W0, ~3 & 0x7FFF);
-
- ADDI2R(W6, W4, offsetof(ARMv5, ITCM), W5);
- if (store)
- {
- LDR(X5, X1, ArithOption(X2, true));
- STR(W5, RCPU, X6);
- }
- else
- {
- LDR(W5, RCPU, X6);
- STR(X5, X1, ArithOption(X2, true));
- }
+ LoadStorePatches.erase(it);
- if (store)
- {
- ADDI2R(W4, W4, ExeMemRegionOffsets[exeMem_ITCM], W5);
- LSR(W6, W4, 9);
- MOVP2R(X5, CodeRanges);
- ADD(X5, X5, X6, ArithOption(X6, ST_LSL, 4));
- static_assert(sizeof(AddressRange) == 16);
- LDRH(INDEX_UNSIGNED, W5, X5, offsetof(AddressRange, Blocks.Length));
- FixupBranch null = CBZ(W5);
- ABI_PushRegisters({0, 1, 2, 4, 30});
- MOV(W0, W4);
- QuickCallFunction(X5, InvalidateByAddr);
- ABI_PopRegisters({0, 1, 2, 4, 30});
- SetJumpTarget(null);
+ return patch.PatchOffset;
}
-
- if (!preinc)
- ADD(W0, W0, 4);
- CBNZ(W2, loopStart);
- RET();
- return res;
+ printf("this is a JIT bug! %08x\n", __builtin_bswap32(*(u32*)pc));
+ assert(false);
}
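IsJITFault and RewriteMemAccess are presumably driven from the host fault handler (the wiring lives in ARMJIT_Memory, outside this diff); HandleFault, resumePC and the compiler pointer below are hypothetical names sketching the shape of that integration:

    #include <cstdint>

    bool HandleFault(uint64_t faultPC, uint64_t& resumePC)
    {
        if (!compiler->IsJITFault(faultPC))
            return false;   // not our code: let the crash propagate
        // back up to the freshly written BL so the access retries through
        // the slow-path thunk (RewriteMemAccess returns patch.PatchOffset,
        // a non-positive offset from the faulting instruction)
        resumePC = faultPC + compiler->RewriteMemAccess(faultPC);
        return true;
    }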
-void* Compiler::Gen_MemoryRoutine7Seq(bool store, bool preinc)
+bool Compiler::Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr)
{
- AlignCode16();
- void* res = GetRXPtr();
+ u32 localAddr = LocaliseCodeAddress(Num, addr);
- void* loopStart = GetRXPtr();
- SUB(W2, W2, 1);
-
- if (preinc)
- ADD(W0, W0, 4);
-
- ABI_PushRegisters({0, 1, 2, 30});
- if (store)
+ int invalidLiteralIdx = InvalidLiterals.Find(localAddr);
+ if (invalidLiteralIdx != -1)
{
- LDR(X1, X1, ArithOption(X2, true));
- QuickCallFunction(X4, NDS::ARM7Write32);
- ABI_PopRegisters({0, 1, 2, 30});
+ InvalidLiterals.Remove(invalidLiteralIdx);
+ return false;
}
- else
- {
- QuickCallFunction(X4, NDS::ARM7Read32);
- MOV(W4, W0);
- ABI_PopRegisters({0, 1, 2, 30});
- STR(X4, X1, ArithOption(X2, true));
- }
-
- if (!preinc)
- ADD(W0, W0, 4);
- CBNZ(W2, loopStart);
- RET();
- return res;
-}
+ Comp_AddCycles_CDI();
-void Compiler::Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr)
-{
u32 val;
// make sure arm7 bios is accessible
u32 tmpR15 = CurCPU->R[15];
@@ -309,6 +85,8 @@ void Compiler::Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr)
if (Thumb || CurInstr.Cond() == 0xE)
RegCache.PutLiteral(rd, val);
+
+ return true;
}
void Compiler::Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags)
@@ -318,163 +96,209 @@ void Compiler::Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags)
addressMask = ~3;
if (size == 16)
addressMask = ~1;
+
+ if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && offset.IsImm && !(flags & (memop_Post|memop_Store|memop_Writeback)))
+ {
+ u32 addr = R15 + offset.Imm * ((flags & memop_SubtractOffset) ? -1 : 1);
+
+ if (Comp_MemLoadLiteral(size, flags & memop_SignExtend, rd, addr))
+ return;
+ }
if (flags & memop_Store)
Comp_AddCycles_CD();
else
Comp_AddCycles_CDI();
- if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && offset.IsImm && !(flags & (memop_Post|memop_Store|memop_Writeback)))
- {
- u32 addr = R15 + offset.Imm * ((flags & memop_SubtractOffset) ? -1 : 1);
- u32 translatedAddr = Num == 0 ? TranslateAddr<0>(addr) : TranslateAddr<1>(addr);
+ ARM64Reg rdMapped = MapReg(rd);
+ ARM64Reg rnMapped = MapReg(rn);
- if (!(CodeRanges[translatedAddr / 512].InvalidLiterals & (1 << ((translatedAddr & 0x1FF) / 16))))
- {
- Comp_MemLoadLiteral(size, flags & memop_SignExtend, rd, addr);
- return;
- }
+ if (Thumb && rn == 15)
+ {
+ ANDI2R(W3, rnMapped, ~2);
+ rnMapped = W3;
}
+ ARM64Reg finalAddr = W0;
+ if (flags & memop_Post)
{
- ARM64Reg rdMapped = MapReg(rd);
- ARM64Reg rnMapped = MapReg(rn);
-
- bool inlinePreparation = Num == 1;
- u32 constLocalROR32 = 4;
+ finalAddr = rnMapped;
+ MOV(W0, rnMapped);
+ }
- void* memFunc = Num == 0
- ? MemFunc9[size >> 4][!!(flags & memop_Store)]
- : MemFunc7[size >> 4][!!((flags & memop_Store))];
+ bool addrIsStatic = Config::JIT_LiteralOptimisations
+ && RegCache.IsLiteral(rn) && offset.IsImm && !(flags & (memop_Writeback|memop_Post));
+ u32 staticAddress;
+ if (addrIsStatic)
+ staticAddress = RegCache.LiteralValues[rn] + offset.Imm * ((flags & memop_SubtractOffset) ? -1 : 1);
- if (Config::JIT_LiteralOptimisations && (rd != 15 || (flags & memop_Store)) && offset.IsImm && RegCache.IsLiteral(rn))
+ if (!offset.IsImm)
+ Comp_RegShiftImm(offset.Reg.ShiftType, offset.Reg.ShiftAmount, false, offset, W2);
+ // offset might have become an immediate
+ if (offset.IsImm)
+ {
+ if (offset.Imm)
+ {
+ if (flags & memop_SubtractOffset)
+ SUB(finalAddr, rnMapped, offset.Imm);
+ else
+ ADD(finalAddr, rnMapped, offset.Imm);
+ }
+ else if (finalAddr != rnMapped)
+ MOV(finalAddr, rnMapped);
+ }
+ else
+ {
+ if (offset.Reg.ShiftType == ST_ROR)
{
- u32 addr = RegCache.LiteralValues[rn] + offset.Imm * ((flags & memop_SubtractOffset) ? -1 : 1);
+ ROR_(W0, offset.Reg.Rm, offset.Reg.ShiftAmount);
+ offset = Op2(W0);
+ }
- NDS::MemRegion region;
- region.Mem = NULL;
- if (Num == 0)
- {
- ARMv5* cpu5 = (ARMv5*)CurCPU;
+ if (flags & memop_SubtractOffset)
+ SUB(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption());
+ else
+ ADD(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption());
+ }
- // stupid dtcm...
- if (addr >= cpu5->DTCMBase && addr < (cpu5->DTCMBase + cpu5->DTCMSize))
- {
- region.Mem = cpu5->DTCM;
- region.Mask = 0x3FFF;
- }
- else
- {
- NDS::ARM9GetMemRegion(addr, flags & memop_Store, &region);
- }
- }
- else
- NDS::ARM7GetMemRegion(addr, flags & memop_Store, &region);
+ if (!(flags & memop_Post) && (flags & memop_Writeback))
+ MOV(rnMapped, W0);
- if (region.Mem != NULL)
- {
- void* ptr = &region.Mem[addr & addressMask & region.Mask];
+ u32 expectedTarget = Num == 0
+ ? ARMJIT_Memory::ClassifyAddress9(addrIsStatic ? staticAddress : CurInstr.DataRegion)
+ : ARMJIT_Memory::ClassifyAddress7(addrIsStatic ? staticAddress : CurInstr.DataRegion);
- MOVP2R(X0, ptr);
- if (flags & memop_Store)
- STRGeneric(size, INDEX_UNSIGNED, rdMapped, X0, 0);
- else
- {
- LDRGeneric(size, flags & memop_SignExtend, INDEX_UNSIGNED, rdMapped, X0, 0);
- if (size == 32 && addr & ~0x3)
- ROR_(rdMapped, rdMapped, (addr & 0x3) << 3);
- }
- return;
- }
+ if (Config::JIT_FastMemory && ((!Thumb && CurInstr.Cond() != 0xE) || ARMJIT_Memory::IsMappable(expectedTarget)))
+ {
+ ptrdiff_t memopStart = GetCodeOffset();
+ LoadStorePatch patch;
- void* specialFunc = GetFuncForAddr(CurCPU, addr, flags & memop_Store, size);
- if (specialFunc)
- {
- memFunc = specialFunc;
- inlinePreparation = true;
- constLocalROR32 = addr & 0x3;
- }
- }
+ patch.PatchFunc = flags & memop_Store
+ ? PatchedStoreFuncs[Num][__builtin_ctz(size) - 3][rdMapped - W19]
+ : PatchedLoadFuncs[Num][__builtin_ctz(size) - 3][!!(flags & memop_SignExtend)][rdMapped - W19];
+ assert(rdMapped - W19 >= 0 && rdMapped - W19 < 8);
- ARM64Reg finalAddr = W0;
- if (flags & memop_Post)
- {
- finalAddr = rnMapped;
- MOV(W0, rnMapped);
- }
+ MOVP2R(X7, Num == 0 ? ARMJIT_Memory::FastMem9Start : ARMJIT_Memory::FastMem7Start);
+ // take a chance at fastmem
+ if (size > 8)
+ ANDI2R(W1, W0, addressMask);
+
+ ptrdiff_t loadStorePosition = GetCodeOffset();
if (flags & memop_Store)
- MOV(W1, rdMapped);
-
- if (!offset.IsImm)
- Comp_RegShiftImm(offset.Reg.ShiftType, offset.Reg.ShiftAmount, false, offset, W2);
- // offset might become an immediate
- if (offset.IsImm)
{
- if (flags & memop_SubtractOffset)
- SUB(finalAddr, rnMapped, offset.Imm);
- else
- ADD(finalAddr, rnMapped, offset.Imm);
+ STRGeneric(size, rdMapped, size > 8 ? X1 : X0, X7);
}
else
{
- if (offset.Reg.ShiftType == ST_ROR)
+ LDRGeneric(size, flags & memop_SignExtend, rdMapped, size > 8 ? X1 : X0, X7);
+ if (size == 32)
{
- ROR_(W0, offset.Reg.Rm, offset.Reg.ShiftAmount);
- offset = Op2(W0);
+ UBFIZ(W0, W0, 3, 2);
+ RORV(rdMapped, rdMapped, W0);
}
-
- if (flags & memop_SubtractOffset)
- SUB(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption());
- else
- ADD(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption());
}
- if (!(flags & memop_Post) && (flags & memop_Writeback))
- MOV(rnMapped, W0);
+ patch.PatchOffset = memopStart - loadStorePosition;
+ patch.PatchSize = GetCodeOffset() - memopStart;
+ LoadStorePatches[loadStorePosition] = patch;
+ }
+ else
+ {
+ void* func = NULL;
+ if (addrIsStatic)
+ func = ARMJIT_Memory::GetFuncForAddr(CurCPU, staticAddress, flags & memop_Store, size);
- if (inlinePreparation)
+ if (func)
{
- if (size == 32 && !(flags & memop_Store) && constLocalROR32 == 4)
- ANDI2R(rdMapped, W0, 3);
- if (size > 8)
- ANDI2R(W0, W0, addressMask);
+ if (flags & memop_Store)
+ MOV(W1, rdMapped);
+ QuickCallFunction(X2, (void (*)())func);
+
+ if (!(flags & memop_Store))
+ {
+ if (size == 32)
+ {
+ if (staticAddress & 0x3)
+ ROR_(rdMapped, W0, (staticAddress & 0x3) << 3);
+ else
+ MOV(rdMapped, W0);
+ }
+ else
+ {
+ if (flags & memop_SignExtend)
+ SBFX(rdMapped, W0, 0, size);
+ else
+ UBFX(rdMapped, W0, 0, size);
+ }
+ }
}
- QuickCallFunction(X2, memFunc);
- if (!(flags & memop_Store))
+ else
{
- if (inlinePreparation && !(flags & memop_Store) && size == 32)
+ if (Num == 0)
{
- if (constLocalROR32 == 4)
+ MOV(X1, RCPU);
+ if (flags & memop_Store)
{
- LSL(rdMapped, rdMapped, 3);
- RORV(rdMapped, W0, rdMapped);
+ MOV(W2, rdMapped);
+ switch (size)
+ {
+ case 32: QuickCallFunction(X3, SlowWrite9<u32>); break;
+ case 16: QuickCallFunction(X3, SlowWrite9<u16>); break;
+ case 8: QuickCallFunction(X3, SlowWrite9<u8>); break;
+ }
}
- else if (constLocalROR32 > 0)
- ROR_(rdMapped, W0, constLocalROR32 << 3);
else
- MOV(rdMapped, W0);
+ {
+ switch (size)
+ {
+ case 32: QuickCallFunction(X3, SlowRead9<u32>); break;
+ case 16: QuickCallFunction(X3, SlowRead9<u16>); break;
+ case 8: QuickCallFunction(X3, SlowRead9<u8>); break;
+ }
+ }
}
- else if (flags & memop_SignExtend)
+ else
{
- if (size == 16)
- SXTH(rdMapped, W0);
- else if (size == 8)
- SXTB(rdMapped, W0);
+ if (flags & memop_Store)
+ {
+ MOV(W1, rdMapped);
+ switch (size)
+ {
+ case 32: QuickCallFunction(X3, SlowWrite7<u32>); break;
+ case 16: QuickCallFunction(X3, SlowWrite7<u16>); break;
+ case 8: QuickCallFunction(X3, SlowWrite7<u8>); break;
+ }
+ }
else
- assert("What's wrong with you?");
+ {
+ switch (size)
+ {
+ case 32: QuickCallFunction(X3, SlowRead7<u32>); break;
+ case 16: QuickCallFunction(X3, SlowRead7<u16>); break;
+ case 8: QuickCallFunction(X3, SlowRead7<u8>); break;
+ }
+ }
}
- else
- MOV(rdMapped, W0);
-
- if (CurInstr.Info.Branches())
+
+ if (!(flags & memop_Store))
{
- if (size < 32)
- printf("LDR size < 32 branching?\n");
- Comp_JumpTo(rdMapped, Num == 0, false);
+ if (size == 32)
+ MOV(rdMapped, W0);
+ else if (flags & memop_SignExtend)
+ SBFX(rdMapped, W0, 0, size);
+ else
+ UBFX(rdMapped, W0, 0, size);
}
}
}
+
+ if (CurInstr.Info.Branches())
+ {
+ if (size < 32)
+ printf("LDR size < 32 branching?\n");
+ Comp_JumpTo(rdMapped, Num == 0, false);
+ }
}
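One detail worth unpacking: the UBFIZ/RORV pair in the fastmem path (and the ROR_ in the static-address path) implement ARM's rotated unaligned word reads. In C terms (illustrative only):

    #include <cstdint>
    #include <cstring>

    uint32_t ReadWordRotated(uint32_t addr, const uint8_t* mem)
    {
        uint32_t val;
        memcpy(&val, mem + (addr & ~3u), 4);  // the aligned fetch
        unsigned rot = (addr & 3u) * 8;       // UBFIZ: (addr & 3) << 3
        return (val >> rot) | (val << ((32 - rot) & 31));  // the RORV
    }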
void Compiler::A_Comp_MemWB()
@@ -589,19 +413,11 @@ void Compiler::T_Comp_MemImmHalf()
void Compiler::T_Comp_LoadPCRel()
{
- u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2);
+ u32 offset = ((CurInstr.Instr & 0xFF) << 2);
+ u32 addr = (R15 & ~0x2) + offset;
- if (Config::JIT_LiteralOptimisations)
- {
- Comp_MemLoadLiteral(32, false, CurInstr.T_Reg(8), addr);
- Comp_AddCycles_CDI();
- }
- else
- {
- bool negative = addr < R15;
- u32 abs = negative ? R15 - addr : addr - R15;
- Comp_MemAccess(CurInstr.T_Reg(8), 15, Op2(abs), 32, negative ? memop_SubtractOffset : 0);
- }
+ if (!Config::JIT_LiteralOptimisations || !Comp_MemLoadLiteral(32, false, CurInstr.T_Reg(8), addr))
+ Comp_MemAccess(CurInstr.T_Reg(8), 15, Op2(offset), 32, 0);
}
void Compiler::T_Comp_MemSPRel()
@@ -621,15 +437,138 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc
if (regsCount == 0)
return 0; // actually not the right behaviour TODO: fix me
- SUB(SP, SP, ((regsCount + 1) & ~1) * 8);
- if (store)
+ if (regsCount == 1 && !usermode && RegCache.LoadedRegs & (1 << *regs.begin()))
{
+ int flags = 0;
+ if (store)
+ flags |= memop_Store;
+ if (decrement)
+ flags |= memop_SubtractOffset;
+ Op2 offset = preinc ? Op2(4) : Op2(0);
+
+ Comp_MemAccess(*regs.begin(), rn, offset, 32, flags);
+
+ return decrement ? -4 : 4;
+ }
+
+ if (store)
Comp_AddCycles_CD();
+ else
+ Comp_AddCycles_CDI();
- if (usermode && (regs & BitSet16(0x7f00)))
- UBFX(W0, RCPSR, 0, 5);
+ int expectedTarget = Num == 0
+ ? ARMJIT_Memory::ClassifyAddress9(CurInstr.DataRegion)
+ : ARMJIT_Memory::ClassifyAddress7(CurInstr.DataRegion);
+
+ bool compileFastPath = Config::JIT_FastMemory
+ && store && !usermode && (CurInstr.Cond() < 0xE || ARMJIT_Memory::IsMappable(expectedTarget));
+
+ if (decrement)
+ {
+ SUB(W0, MapReg(rn), regsCount * 4);
+ ANDI2R(W0, W0, ~3);
+ preinc ^= true;
+ }
+ else
+ {
+ ANDI2R(W0, MapReg(rn), ~3);
+ }
+
+ LoadStorePatch patch;
+ if (compileFastPath)
+ {
+ ptrdiff_t fastPathStart = GetCodeOffset();
+ ptrdiff_t firstLoadStoreOffset;
+
+ bool firstLoadStore = true;
+
+ MOVP2R(X1, Num == 0 ? ARMJIT_Memory::FastMem9Start : ARMJIT_Memory::FastMem7Start);
+ ADD(X1, X1, X0);
+
+ u32 offset = preinc ? 4 : 0;
+ BitSet16::Iterator it = regs.begin();
+
+ if (regsCount & 1)
+ {
+ int reg = *it;
+ it++;
+
+ ARM64Reg first = W3;
+ if (RegCache.LoadedRegs & (1 << reg))
+ first = MapReg(reg);
+ else if (store)
+ LoadReg(reg, first);
+
+ if (firstLoadStore)
+ {
+ firstLoadStoreOffset = GetCodeOffset();
+ firstLoadStore = false;
+ }
+
+ if (store)
+ STR(INDEX_UNSIGNED, first, X1, offset);
+ else
+ LDR(INDEX_UNSIGNED, first, X1, offset);
+
+ if (!(RegCache.LoadedRegs & (1 << reg)) && !store)
+ SaveReg(reg, first);
+
+ offset += 4;
+ }
+
+ while (it != regs.end())
+ {
+ int reg = *it;
+ it++;
+ int nextReg = *it;
+ it++;
- int i = regsCount - 1;
+ ARM64Reg first = W3, second = W4;
+ if (RegCache.LoadedRegs & (1 << reg))
+ first = MapReg(reg);
+ else if (store)
+ LoadReg(reg, first);
+ if (RegCache.LoadedRegs & (1 << nextReg))
+ second = MapReg(nextReg);
+ else if (store)
+ LoadReg(nextReg, second);
+
+ if (firstLoadStore)
+ {
+ firstLoadStoreOffset = GetCodeOffset();
+ firstLoadStore = false;
+ }
+
+ if (store)
+ STP(INDEX_SIGNED, first, second, X1, offset);
+ else
+ LDP(INDEX_SIGNED, first, second, X1, offset);
+
+ if (!(RegCache.LoadedRegs & (1 << reg)) && !store)
+ SaveReg(reg, first);
+ if (!(RegCache.LoadedRegs & (1 << nextReg)) && !store)
+ SaveReg(nextReg, second);
+
+ offset += 8;
+ }
+
+ patch.PatchSize = GetCodeOffset() - fastPathStart;
+ patch.PatchOffset = fastPathStart - firstLoadStoreOffset;
+ SwapCodeRegion();
+ patch.PatchFunc = GetRXPtr();
+
+ LoadStorePatches[firstLoadStoreOffset] = patch;
+
+ ABI_PushRegisters({30});
+ }
+
+ int i = 0;
+
+ SUB(SP, SP, ((regsCount + 1) & ~1) * 8);
+ if (store)
+ {
+ if (usermode && (regs & BitSet16(0x7f00)))
+ UBFX(W5, RCPSR, 0, 5);
BitSet16::Iterator it = regs.begin();
while (it != regs.end())
@@ -641,7 +580,7 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc
if (usermode && reg >= 8 && reg < 15)
{
- if (RegCache.Mapping[reg] != INVALID_REG)
+ if (RegCache.LoadedRegs & (1 << reg))
MOV(W3, MapReg(reg));
else
LoadReg(reg, W3);
@@ -651,55 +590,67 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc
}
else if (!usermode && nextReg != regs.end())
{
- ARM64Reg first = W3;
- ARM64Reg second = W4;
+ ARM64Reg first = W3, second = W4;
- if (RegCache.Mapping[reg] != INVALID_REG)
+ if (RegCache.LoadedRegs & (1 << reg))
first = MapReg(reg);
else
LoadReg(reg, W3);
- if (RegCache.Mapping[*nextReg] != INVALID_REG)
+ if (RegCache.LoadedRegs & (1 << *nextReg))
second = MapReg(*nextReg);
else
LoadReg(*nextReg, W4);
- STP(INDEX_SIGNED, EncodeRegTo64(second), EncodeRegTo64(first), SP, i * 8 - 8);
+ STP(INDEX_SIGNED, EncodeRegTo64(first), EncodeRegTo64(second), SP, i * 8);
- i--;
+ i++;
it++;
}
- else if (RegCache.Mapping[reg] != INVALID_REG)
+ else if (RegCache.LoadedRegs & (1 << reg))
+ {
STR(INDEX_UNSIGNED, MapReg(reg), SP, i * 8);
+ }
else
{
LoadReg(reg, W3);
STR(INDEX_UNSIGNED, W3, SP, i * 8);
}
- i--;
+ i++;
it++;
}
}
- if (decrement)
- {
- SUB(W0, MapReg(rn), regsCount * 4);
- preinc ^= true;
- }
- else
- MOV(W0, MapReg(rn));
+
ADD(X1, SP, 0);
MOVI2R(W2, regsCount);
- BL(Num ? MemFuncsSeq7[store][preinc] : MemFuncsSeq9[store][preinc]);
+ if (Num == 0)
+ {
+ MOV(X3, RCPU);
+ switch (preinc * 2 | store)
+ {
+ case 0: QuickCallFunction(X4, SlowBlockTransfer9<false, false>); break;
+ case 1: QuickCallFunction(X4, SlowBlockTransfer9<false, true>); break;
+ case 2: QuickCallFunction(X4, SlowBlockTransfer9<true, false>); break;
+ case 3: QuickCallFunction(X4, SlowBlockTransfer9<true, true>); break;
+ }
+ }
+ else
+ {
+ switch (preinc * 2 | store)
+ {
+ case 0: QuickCallFunction(X4, SlowBlockTransfer7<false, false>); break;
+ case 1: QuickCallFunction(X4, SlowBlockTransfer7<false, true>); break;
+ case 2: QuickCallFunction(X4, SlowBlockTransfer7<true, false>); break;
+ case 3: QuickCallFunction(X4, SlowBlockTransfer7<true, true>); break;
+ }
+ }
if (!store)
{
- Comp_AddCycles_CDI();
-
if (usermode && !regs[15] && (regs & BitSet16(0x7f00)))
- UBFX(W0, RCPSR, 0, 5);
+ UBFX(W5, RCPSR, 0, 5);
- int i = regsCount - 1;
BitSet16::Iterator it = regs.begin();
while (it != regs.end())
{
@@ -714,11 +665,8 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc
MOVI2R(W1, reg - 8);
BL(WriteBanked);
FixupBranch alreadyWritten = CBNZ(W4);
- if (RegCache.Mapping[reg] != INVALID_REG)
- {
+ if (RegCache.LoadedRegs & (1 << reg))
MOV(MapReg(reg), W3);
- RegCache.DirtyRegs |= 1 << reg;
- }
else
SaveReg(reg, W3);
SetJumpTarget(alreadyWritten);
@@ -727,20 +675,12 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc
{
ARM64Reg first = W3, second = W4;
- if (RegCache.Mapping[reg] != INVALID_REG)
- {
+ if (RegCache.LoadedRegs & (1 << reg))
first = MapReg(reg);
- if (reg != 15)
- RegCache.DirtyRegs |= 1 << reg;
- }
- if (RegCache.Mapping[*nextReg] != INVALID_REG)
- {
+ if (RegCache.LoadedRegs & (1 << *nextReg))
second = MapReg(*nextReg);
- if (*nextReg != 15)
- RegCache.DirtyRegs |= 1 << *nextReg;
- }
- LDP(INDEX_SIGNED, EncodeRegTo64(second), EncodeRegTo64(first), SP, i * 8 - 8);
+ LDP(INDEX_SIGNED, EncodeRegTo64(first), EncodeRegTo64(second), SP, i * 8);
if (first == W3)
SaveReg(reg, W3);
@@ -748,15 +688,12 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc
SaveReg(*nextReg, W4);
it++;
- i--;
+ i++;
}
- else if (RegCache.Mapping[reg] != INVALID_REG)
+ else if (RegCache.LoadedRegs & (1 << reg))
{
ARM64Reg mapped = MapReg(reg);
LDR(INDEX_UNSIGNED, mapped, SP, i * 8);
-
- if (reg != 15)
- RegCache.DirtyRegs |= 1 << reg;
}
else
{
@@ -765,11 +702,20 @@ s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc
}
it++;
- i--;
+ i++;
}
}
ADD(SP, SP, ((regsCount + 1) & ~1) * 8);
+ if (compileFastPath)
+ {
+ ABI_PopRegisters({30});
+ RET();
+
+ FlushIcacheSection((u8*)patch.PatchFunc, (u8*)GetRXPtr());
+ SwapCodeRegion();
+ }
+
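The fast path above moves registers two at a time, which is the point of the odd-count special case: handle one stray register with a single STR/LDR, then run the rest through STP/LDP pairs. Schematically, under the assumption of a flat host mirror (illustrative C++, not code from the commit):

    #include <cstdint>
    #include <cstring>

    void BlockTransfer(uint8_t* host, uint32_t* regs, int count, bool store)
    {
        int i = 0;
        if (count & 1)                     // odd count: one single transfer
        {
            if (store) memcpy(host, &regs[0], 4);
            else       memcpy(&regs[0], host, 4);
            i = 1;
        }
        for (; i < count; i += 2)          // then the STP/LDP pairs
        {
            if (store) memcpy(host + i * 4u, &regs[i], 8);
            else       memcpy(&regs[i], host + i * 4u, 8);
        }
    }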
if (!store && regs[15])
{
ARM64Reg mapped = MapReg(15);