aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/ARM.h9
-rw-r--r--src/ARMJIT.cpp4
-rw-r--r--src/ARMJIT_A64/ARMJIT_ALU.cpp837
-rw-r--r--src/ARMJIT_A64/ARMJIT_Branch.cpp452
-rw-r--r--src/ARMJIT_A64/ARMJIT_Compiler.cpp707
-rw-r--r--src/ARMJIT_A64/ARMJIT_Compiler.h234
-rw-r--r--src/ARMJIT_A64/ARMJIT_LoadStore.cpp848
-rw-r--r--src/ARM_InstrInfo.cpp7
-rw-r--r--src/CMakeLists.txt27
-rw-r--r--src/dolphin/Align.h24
-rw-r--r--src/dolphin/Arm64Emitter.cpp4466
-rw-r--r--src/dolphin/Arm64Emitter.h1152
-rw-r--r--src/dolphin/ArmCommon.h27
-rw-r--r--src/dolphin/BitUtils.h254
-rw-r--r--src/dolphin/Compat.h12
-rw-r--r--src/dolphin/MathUtil.cpp13
-rw-r--r--src/dolphin/MathUtil.h121
17 files changed, 9187 insertions, 7 deletions
diff --git a/src/ARM.h b/src/ARM.h
index e252d23..8282c01 100644
--- a/src/ARM.h
+++ b/src/ARM.h
@@ -246,10 +246,14 @@ public:
u32 DTCMSetting, ITCMSetting;
- u8 ITCM[0x8000];
+ // for aarch64 JIT they need to go up here
+ // to be addressable by a 12-bit immediate
u32 ITCMSize;
- u8 DTCM[0x4000];
u32 DTCMBase, DTCMSize;
+ s32 RegionCodeCycles;
+
+ u8 ITCM[0x8000];
+ u8 DTCM[0x4000];
u8 ICache[0x2000];
u32 ICacheTags[64*4];
@@ -274,7 +278,6 @@ public:
// code/16N/32N/32S
u8 MemTimings[0x100000][4];
- s32 RegionCodeCycles;
u8* CurICacheLine;
};
diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp
index 8fd7708..561fabb 100644
--- a/src/ARMJIT.cpp
+++ b/src/ARMJIT.cpp
@@ -6,7 +6,11 @@
#include "Config.h"
#include "ARMJIT_Internal.h"
+#if defined(__x86_64__)
#include "ARMJIT_x64/ARMJIT_Compiler.h"
+#else
+#include "ARMJIT_A64/ARMJIT_Compiler.h"
+#endif
#include "ARMInterpreter_ALU.h"
#include "ARMInterpreter_LoadStore.h"
diff --git a/src/ARMJIT_A64/ARMJIT_ALU.cpp b/src/ARMJIT_A64/ARMJIT_ALU.cpp
new file mode 100644
index 0000000..0fe6a97
--- /dev/null
+++ b/src/ARMJIT_A64/ARMJIT_ALU.cpp
@@ -0,0 +1,837 @@
+#include "ARMJIT_Compiler.h"
+
+using namespace Arm64Gen;
+
+namespace ARMJIT
+{
+
+void Compiler::Comp_RegShiftReg(int op, bool S, Op2& op2, ARM64Reg rs)
+{
+ if (!(CurInstr.SetFlags & 0x2))
+ S = false;
+
+ CPSRDirty |= S;
+
+ UBFX(W1, rs, 0, 8);
+
+ if (!S)
+ {
+ if (op == 3)
+ RORV(W0, op2.Reg.Rm, W1);
+ else
+ {
+ CMP(W1, 32);
+ if (op == 2)
+ {
+ MOVI2R(W2, 31);
+ CSEL(W1, W2, W1, CC_GE);
+ ASRV(W0, op2.Reg.Rm, W1);
+ }
+ else
+ {
+ if (op == 0)
+ LSLV(W0, op2.Reg.Rm, W1);
+ else if (op == 1)
+ LSRV(W0, op2.Reg.Rm, W1);
+ CSEL(W0, WZR, W0, CC_GE);
+ }
+ }
+ }
+ else
+ {
+ MOV(W0, op2.Reg.Rm);
+ FixupBranch zero = CBZ(W1);
+
+ SUB(W1, W1, 1);
+ if (op == 3)
+ {
+ RORV(W0, op2.Reg.Rm, W1);
+ BFI(RCPSR, W0, 29, 1);
+ }
+ else
+ {
+ CMP(W1, 31);
+ if (op == 2)
+ {
+ MOVI2R(W2, 31);
+ CSEL(W1, W2, W1, CC_GT);
+ ASRV(W0, op2.Reg.Rm, W1);
+ BFI(RCPSR, W0, 29, 1);
+ }
+ else
+ {
+ if (op == 0)
+ {
+ LSLV(W0, op2.Reg.Rm, W1);
+ UBFX(W1, W0, 31, 1);
+ }
+ else if (op == 1)
+ LSRV(W0, op2.Reg.Rm, W1);
+ CSEL(W1, WZR, op ? W0 : W1, CC_GT);
+ BFI(RCPSR, W1, 29, 1);
+ CSEL(W0, WZR, W0, CC_GE);
+ }
+ }
+
+ MOV(W0, W0, ArithOption(W0, (ShiftType)op, 1));
+ SetJumpTarget(zero);
+ }
+ op2 = Op2(W0, ST_LSL, 0);
+}
+
+void Compiler::Comp_RegShiftImm(int op, int amount, bool S, Op2& op2, ARM64Reg tmp)
+{
+ if (!(CurInstr.SetFlags & 0x2))
+ S = false;
+
+ CPSRDirty |= S;
+
+ switch (op)
+ {
+ case 0: // LSL
+ if (S && amount)
+ {
+ UBFX(tmp, op2.Reg.Rm, 32 - amount, 1);
+ BFI(RCPSR, tmp, 29, 1);
+ }
+ op2 = Op2(op2.Reg.Rm, ST_LSL, amount);
+ return;
+ case 1: // LSR
+ if (S)
+ {
+ UBFX(tmp, op2.Reg.Rm, (amount ? amount : 32) - 1, 1);
+ BFI(RCPSR, tmp, 29, 1);
+ }
+ if (amount == 0)
+ {
+ op2 = Op2(0);
+ return;
+ }
+ op2 = Op2(op2.Reg.Rm, ST_LSR, amount);
+ return;
+ case 2: // ASR
+ if (S)
+ {
+ UBFX(tmp, op2.Reg.Rm, (amount ? amount : 32) - 1, 1);
+ BFI(RCPSR, tmp, 29, 1);
+ }
+ op2 = Op2(op2.Reg.Rm, ST_ASR, amount ? amount : 31);
+ return;
+ case 3: // ROR
+ if (amount == 0)
+ {
+ UBFX(tmp, RCPSR, 29, 1);
+ LSL(tmp, tmp, 31);
+ if (S)
+ BFI(RCPSR, op2.Reg.Rm, 29, 1);
+ ORR(tmp, tmp, op2.Reg.Rm, ArithOption(tmp, ST_LSR, 1));
+
+ op2 = Op2(tmp, ST_LSL, 0);
+ }
+ else
+ {
+ if (S)
+ {
+ UBFX(tmp, op2.Reg.Rm, amount - 1, 1);
+ BFI(RCPSR, tmp, 29, 1);
+ }
+ op2 = Op2(op2.Reg.Rm, ST_ROR, amount);
+ }
+ return;
+ }
+}
+
+void Compiler::Comp_RetriveFlags(bool retriveCV)
+{
+ if (CurInstr.SetFlags)
+ CPSRDirty = true;
+
+ if (CurInstr.SetFlags & 0x4)
+ {
+ CSET(W0, CC_EQ);
+ BFI(RCPSR, W0, 30, 1);
+ }
+ if (CurInstr.SetFlags & 0x8)
+ {
+ CSET(W0, CC_MI);
+ BFI(RCPSR, W0, 31, 1);
+ }
+ if (retriveCV)
+ {
+ if (CurInstr.SetFlags & 0x2)
+ {
+ CSET(W0, CC_CS);
+ BFI(RCPSR, W0, 29, 1);
+ }
+ if (CurInstr.SetFlags & 0x1)
+ {
+ CSET(W0, CC_VS);
+ BFI(RCPSR, W0, 28, 1);
+ }
+ }
+}
+
+void Compiler::Comp_Logical(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2)
+{
+ if (S && !CurInstr.SetFlags)
+ S = false;
+
+ switch (op)
+ {
+ case 0x0: // AND
+ if (S)
+ {
+ if (op2.IsImm)
+ ANDSI2R(rd, rn, op2.Imm, W0);
+ else
+ ANDS(rd, rn, op2.Reg.Rm, op2.ToArithOption());
+ }
+ else
+ {
+ if (op2.IsImm)
+ ANDI2R(rd, rn, op2.Imm, W0);
+ else
+ AND(rd, rn, op2.Reg.Rm, op2.ToArithOption());
+ }
+ break;
+ case 0x1: // EOR
+ if (op2.IsImm)
+ EORI2R(rd, rn, op2.Imm, W0);
+ else
+ EOR(rd, rn, op2.Reg.Rm, op2.ToArithOption());
+ if (S && FlagsNZNeeded())
+ TST(rd, rd);
+ break;
+ case 0xC: // ORR
+ if (op2.IsImm)
+ ORRI2R(rd, rn, op2.Imm, W0);
+ else
+ ORR(rd, rn, op2.Reg.Rm, op2.ToArithOption());
+ if (S && FlagsNZNeeded())
+ TST(rd, rd);
+ break;
+ case 0xE: // BIC
+ if (S)
+ {
+ if (op2.IsImm)
+ ANDSI2R(rd, rn, ~op2.Imm, W0);
+ else
+ BICS(rd, rn, op2.Reg.Rm, op2.ToArithOption());
+ }
+ else
+ {
+ if (op2.IsImm)
+ ANDI2R(rd, rn, ~op2.Imm, W0);
+ else
+ BIC(rd, rn, op2.Reg.Rm, op2.ToArithOption());
+ }
+ break;
+ }
+
+ if (S)
+ Comp_RetriveFlags(false);
+}
+
+void Compiler::Comp_Arithmetic(int op, bool S, ARM64Reg rd, ARM64Reg rn, Op2 op2)
+{
+ if (!op2.IsImm && op2.Reg.ShiftType == ST_ROR)
+ {
+ MOV(W0, op2.Reg.Rm, op2.ToArithOption());
+ op2 = Op2(W0, ST_LSL, 0);
+ }
+
+ if (S && !CurInstr.SetFlags)
+ S = false;
+
+ bool CVInGP = false;
+ switch (op)
+ {
+ case 0x2: // SUB
+ if (S)
+ {
+ if (op2.IsImm)
+ SUBSI2R(rd, rn, op2.Imm, W0);
+ else
+ SUBS(rd, rn, op2.Reg.Rm, op2.ToArithOption());
+ }
+ else
+ {
+ if (op2.IsImm)
+ {
+ MOVI2R(W2, op2.Imm);
+ SUBI2R(rd, rn, op2.Imm, W0);
+ }
+ else
+ SUB(rd, rn, op2.Reg.Rm, op2.ToArithOption());
+ }
+ break;
+ case 0x3: // RSB
+ if (op2.IsZero())
+ {
+ op2 = Op2(WZR);
+ }
+ else if (op2.IsImm)
+ {
+ MOVI2R(W1, op2.Imm);
+ op2 = Op2(W1);
+ }
+ else if (op2.Reg.ShiftAmount != 0)
+ {
+ MOV(W1, op2.Reg.Rm, op2.ToArithOption());
+ op2 = Op2(W1);
+ }
+
+ if (S)
+ SUBS(rd, op2.Reg.Rm, rn);
+ else
+ SUB(rd, op2.Reg.Rm, rn);
+ break;
+ case 0x4: // ADD
+ if (S)
+ {
+ if (op2.IsImm)
+ ADDSI2R(rd, rn, op2.Imm, W0);
+ else
+ ADDS(rd, rn, op2.Reg.Rm, op2.ToArithOption());
+ }
+ else
+ {
+ if (op2.IsImm)
+ ADDI2R(rd, rn, op2.Imm, W0);
+ else
+ ADD(rd, rn, op2.Reg.Rm, op2.ToArithOption());
+ }
+ break;
+ case 0x5: // ADC
+ UBFX(W2, RCPSR, 29, 1);
+ if (S)
+ {
+ CVInGP = true;
+ ADDS(W1, rn, W2);
+ CSET(W2, CC_CS);
+ CSET(W3, CC_VS);
+ if (op2.IsImm)
+ ADDSI2R(rd, W1, op2.Imm, W0);
+ else
+ ADDS(rd, W1, op2.Reg.Rm, op2.ToArithOption());
+ CSINC(W2, W2, WZR, CC_CC);
+ CSINC(W3, W3, WZR, CC_VC);
+ }
+ else
+ {
+ ADD(W1, rn, W2);
+ if (op2.IsImm)
+ ADDI2R(rd, W1, op2.Imm, W0);
+ else
+ ADD(rd, W1, op2.Reg.Rm, op2.ToArithOption());
+ }
+ break;
+ case 0x6: // SBC
+ UBFX(W2, RCPSR, 29, 1);
+ // W1 = -op2 - 1
+ if (op2.IsImm)
+ MOVI2R(W1, ~op2.Imm);
+ else
+ ORN(W1, WZR, op2.Reg.Rm, op2.ToArithOption());
+ if (S)
+ {
+ CVInGP = true;
+ ADDS(W1, W2, W1);
+ CSET(W2, CC_CS);
+ CSET(W3, CC_VS);
+ ADDS(rd, rn, W1);
+ CSINC(W2, W2, WZR, CC_CC);
+ CSINC(W3, W3, WZR, CC_VC);
+ }
+ else
+ {
+ ADD(W1, W2, W1);
+ ADD(rd, rn, W1);
+ }
+ break;
+ case 0x7: // RSC
+ UBFX(W2, RCPSR, 29, 1);
+ // W1 = -rn - 1
+ MVN(W1, rn);
+ if (S)
+ {
+ CVInGP = true;
+ ADDS(W1, W2, W1);
+ CSET(W2, CC_CS);
+ CSET(W3, CC_VS);
+ if (op2.IsImm)
+ ADDSI2R(rd, W1, op2.Imm);
+ else
+ ADDS(rd, W1, op2.Reg.Rm, op2.ToArithOption());
+ CSINC(W2, W2, WZR, CC_CC);
+ CSINC(W3, W3, WZR, CC_VC);
+ }
+ else
+ {
+ ADD(W1, W2, W1);
+ if (op2.IsImm)
+ ADDI2R(rd, W1, op2.Imm);
+ else
+ ADD(rd, W1, op2.Reg.Rm, op2.ToArithOption());
+ }
+ break;
+ }
+
+ if (S)
+ {
+ if (CVInGP)
+ {
+ BFI(RCPSR, W2, 29, 1);
+ BFI(RCPSR, W3, 28, 1);
+ }
+ Comp_RetriveFlags(!CVInGP);
+ }
+}
+
+void Compiler::Comp_Compare(int op, ARM64Reg rn, Op2 op2)
+{
+ if (!op2.IsImm && op2.Reg.ShiftType == ST_ROR)
+ {
+ MOV(W0, op2.Reg.Rm, op2.ToArithOption());
+ op2 = Op2(W0, ST_LSL, 0);
+ }
+
+ switch (op)
+ {
+ case 0x8: // TST
+ if (op2.IsImm)
+ TSTI2R(rn, op2.Imm, W0);
+ else
+ ANDS(WZR, rn, op2.Reg.Rm, op2.ToArithOption());
+ break;
+ case 0x9: // TEQ
+ if (op2.IsImm)
+ EORI2R(W0, rn, op2.Imm, W0);
+ else
+ EOR(W0, rn, op2.Reg.Rm, op2.ToArithOption());
+ TST(W0, W0);
+ break;
+ case 0xA: // CMP
+ if (op2.IsImm)
+ CMPI2R(rn, op2.Imm, W0);
+ else
+ CMP(rn, op2.Reg.Rm, op2.ToArithOption());
+ break;
+ case 0xB: // CMN
+ if (op2.IsImm)
+ ADDSI2R(WZR, rn, op2.Imm, W0);
+ else
+ CMN(rn, op2.Reg.Rm, op2.ToArithOption());
+ break;
+ }
+
+ Comp_RetriveFlags(op >= 0xA);
+}
+
+// also counts cycles!
+void Compiler::A_Comp_GetOp2(bool S, Op2& op2)
+{
+ if (CurInstr.Instr & (1 << 25))
+ {
+ Comp_AddCycles_C();
+ op2 = Op2(ROR(CurInstr.Instr & 0xFF, (CurInstr.Instr >> 7) & 0x1E));
+ }
+ else
+ {
+ int op = (CurInstr.Instr >> 5) & 0x3;
+ op2.Reg.Rm = MapReg(CurInstr.A_Reg(0));
+ if (CurInstr.Instr & (1 << 4))
+ {
+ Comp_AddCycles_CI(1);
+
+ ARM64Reg rs = MapReg(CurInstr.A_Reg(8));
+ if (CurInstr.A_Reg(0) == 15)
+ {
+ ADD(W0, op2.Reg.Rm, 4);
+ op2.Reg.Rm = W0;
+ }
+ Comp_RegShiftReg(op, S, op2, rs);
+ }
+ else
+ {
+ Comp_AddCycles_C();
+
+ int amount = (CurInstr.Instr >> 7) & 0x1F;
+ Comp_RegShiftImm(op, amount, S, op2);
+ }
+ }
+}
+
+void Compiler::A_Comp_ALUCmpOp()
+{
+ u32 op = (CurInstr.Instr >> 21) & 0xF;
+ ARM64Reg rn = MapReg(CurInstr.A_Reg(16));
+ Op2 op2;
+ A_Comp_GetOp2(op <= 0x9, op2);
+
+ Comp_Compare(op, rn, op2);
+}
+
+void Compiler::A_Comp_ALUMovOp()
+{
+ bool S = CurInstr.Instr & (1 << 20);
+ u32 op = (CurInstr.Instr >> 21) & 0xF;
+
+ ARM64Reg rd = MapReg(CurInstr.A_Reg(12));
+ Op2 op2;
+ A_Comp_GetOp2(S, op2);
+
+ if (op == 0xF) // MVN
+ {
+ if (op2.IsImm)
+ {
+ if (CurInstr.Cond() == 0xE)
+ RegCache.PutLiteral(CurInstr.A_Reg(12), ~op2.Imm);
+ MOVI2R(rd, ~op2.Imm);
+ }
+ else
+ ORN(rd, WZR, op2.Reg.Rm, op2.ToArithOption());
+ }
+ else // MOV
+ {
+ if (op2.IsImm)
+ {
+ if (CurInstr.Cond() == 0xE)
+ RegCache.PutLiteral(CurInstr.A_Reg(12), op2.Imm);
+ MOVI2R(rd, op2.Imm);
+ }
+ else
+ MOV(rd, op2.Reg.Rm, op2.ToArithOption());
+ }
+
+ if (S)
+ {
+ if (FlagsNZNeeded())
+ TST(rd, rd);
+ Comp_RetriveFlags(false);
+ }
+
+ if (CurInstr.Info.Branches())
+ Comp_JumpTo(rd, true, S);
+}
+
+void Compiler::A_Comp_ALUTriOp()
+{
+ bool S = CurInstr.Instr & (1 << 20);
+ u32 op = (CurInstr.Instr >> 21) & 0xF;
+ bool logical = (1 << op) & 0xF303;
+
+ ARM64Reg rd = MapReg(CurInstr.A_Reg(12));
+ ARM64Reg rn = MapReg(CurInstr.A_Reg(16));
+ Op2 op2;
+ A_Comp_GetOp2(S && logical, op2);
+
+ if (op2.IsImm && op2.Imm == 0)
+ op2 = Op2(WZR, ST_LSL, 0);
+
+ if (logical)
+ Comp_Logical(op, S, rd, rn, op2);
+ else
+ Comp_Arithmetic(op, S, rd, rn, op2);
+
+ if (CurInstr.Info.Branches())
+ Comp_JumpTo(rd, true, S);
+}
+
+void Compiler::A_Comp_Clz()
+{
+ Comp_AddCycles_C();
+
+ ARM64Reg rd = MapReg(CurInstr.A_Reg(12));
+ ARM64Reg rm = MapReg(CurInstr.A_Reg(0));
+
+ CLZ(rd, rm);
+
+ assert(Num == 0);
+}
+
+void Compiler::Comp_Mul_Mla(bool S, bool mla, ARM64Reg rd, ARM64Reg rm, ARM64Reg rs, ARM64Reg rn)
+{
+ if (Num == 0)
+ {
+ Comp_AddCycles_CI(S ? 3 : 1);
+ }
+ else
+ {
+ CLZ(W0, rs);
+ CLS(W1, rs);
+ CMP(W0, W1);
+ CSEL(W0, W0, W1, CC_GT);
+ Comp_AddCycles_CI(mla ? 1 : 0, W0, ArithOption(W0, ST_LSR, 3));
+ }
+
+ if (mla)
+ MADD(rd, rm, rs, rn);
+ else
+ MUL(rd, rm, rs);
+
+ if (S && FlagsNZNeeded())
+ {
+ TST(rd, rd);
+ Comp_RetriveFlags(false);
+ }
+}
+
+void Compiler::A_Comp_Mul_Long()
+{
+ ARM64Reg rd = MapReg(CurInstr.A_Reg(16));
+ ARM64Reg rm = MapReg(CurInstr.A_Reg(0));
+ ARM64Reg rs = MapReg(CurInstr.A_Reg(8));
+ ARM64Reg rn = MapReg(CurInstr.A_Reg(12));
+
+ bool S = CurInstr.Instr & (1 << 20);
+ bool add = CurInstr.Instr & (1 << 21);
+ bool sign = CurInstr.Instr & (1 << 22);
+
+ if (Num == 0)
+ {
+ Comp_AddCycles_CI(S ? 3 : 1);
+ }
+ else
+ {
+ CLZ(W0, rs);
+ CLS(W1, rs);
+ CMP(W0, W1);
+ CSEL(W0, W0, W1, CC_GT);
+ Comp_AddCycles_CI(0, W0, ArithOption(W0, ST_LSR, 3));
+ }
+
+ if (add)
+ {
+ MOV(W0, rn);
+ BFI(X0, EncodeRegTo64(rd), 32, 32);
+ if (sign)
+ SMADDL(EncodeRegTo64(rn), rm, rs, X0);
+ else
+ UMADDL(EncodeRegTo64(rn), rm, rs, X0);
+ if (S && FlagsNZNeeded())
+ TST(EncodeRegTo64(rn), EncodeRegTo64(rn));
+ UBFX(EncodeRegTo64(rd), EncodeRegTo64(rn), 32, 32);
+ }
+ else
+ {
+ if (sign)
+ SMULL(EncodeRegTo64(rn), rm, rs);
+ else
+ UMULL(EncodeRegTo64(rn), rm, rs);
+ if (S && FlagsNZNeeded())
+ TST(EncodeRegTo64(rn), EncodeRegTo64(rn));
+ UBFX(EncodeRegTo64(rd), EncodeRegTo64(rn), 32, 32);
+ }
+
+ if (S)
+ Comp_RetriveFlags(false);
+}
+
+void Compiler::A_Comp_Mul()
+{
+ ARM64Reg rd = MapReg(CurInstr.A_Reg(16));
+ ARM64Reg rm = MapReg(CurInstr.A_Reg(0));
+ ARM64Reg rs = MapReg(CurInstr.A_Reg(8));
+
+ bool S = CurInstr.Instr & (1 << 20);
+ bool mla = CurInstr.Instr & (1 << 21);
+ ARM64Reg rn = INVALID_REG;
+ if (mla)
+ rn = MapReg(CurInstr.A_Reg(12));
+
+ Comp_Mul_Mla(S, mla, rd, rm, rs, rn);
+}
+
+void Compiler::T_Comp_ShiftImm()
+{
+ Comp_AddCycles_C();
+
+ u32 op = (CurInstr.Instr >> 11) & 0x3;
+ int amount = (CurInstr.Instr >> 6) & 0x1F;
+
+ ARM64Reg rd = MapReg(CurInstr.T_Reg(0));
+ Op2 op2;
+ op2.Reg.Rm = MapReg(CurInstr.T_Reg(3));
+ Comp_RegShiftImm(op, amount, true, op2);
+ if (op2.IsImm)
+ MOVI2R(rd, op2.Imm);
+ else
+ MOV(rd, op2.Reg.Rm, op2.ToArithOption());
+ if (FlagsNZNeeded())
+ TST(rd, rd);
+
+ Comp_RetriveFlags(false);
+}
+
+void Compiler::T_Comp_AddSub_()
+{
+ Comp_AddCycles_C();
+
+ Op2 op2;
+ if (CurInstr.Instr & (1 << 10))
+ op2 = Op2((CurInstr.Instr >> 6) & 0x7);
+ else
+ op2 = Op2(MapReg(CurInstr.T_Reg(6)));
+
+ Comp_Arithmetic(
+ CurInstr.Instr & (1 << 9) ? 0x2 : 0x4,
+ true,
+ MapReg(CurInstr.T_Reg(0)),
+ MapReg(CurInstr.T_Reg(3)),
+ op2);
+}
+
+void Compiler::T_Comp_ALUImm8()
+{
+ Comp_AddCycles_C();
+
+ u32 imm = CurInstr.Instr & 0xFF;
+ int op = (CurInstr.Instr >> 11) & 0x3;
+
+ ARM64Reg rd = MapReg(CurInstr.T_Reg(8));
+
+ switch (op)
+ {
+ case 0:
+ MOVI2R(rd, imm);
+ if (FlagsNZNeeded())
+ TST(rd, rd);
+ Comp_RetriveFlags(false);
+ break;
+ case 1:
+ Comp_Compare(0xA, rd, Op2(imm));
+ break;
+ case 2:
+ case 3:
+ Comp_Arithmetic(op == 2 ? 0x4 : 0x2, true, rd, rd, Op2(imm));
+ break;
+ }
+}
+
+void Compiler::T_Comp_ALU()
+{
+ int op = (CurInstr.Instr >> 6) & 0xF;
+ ARM64Reg rd = MapReg(CurInstr.T_Reg(0));
+ ARM64Reg rs = MapReg(CurInstr.T_Reg(3));
+
+ if ((op >= 0x2 && op <= 0x4) || op == 0x7)
+ Comp_AddCycles_CI(1);
+ else
+ Comp_AddCycles_C();
+
+ switch (op)
+ {
+ case 0x0:
+ Comp_Logical(0x0, true, rd, rd, Op2(rs));
+ break;
+ case 0x1:
+ Comp_Logical(0x1, true, rd, rd, Op2(rs));
+ break;
+ case 0x2:
+ case 0x3:
+ case 0x4:
+ case 0x7:
+ {
+ Op2 op2;
+ op2.Reg.Rm = rd;
+ Comp_RegShiftReg(op == 0x7 ? 3 : (op - 0x2), true, op2, rs);
+ MOV(rd, op2.Reg.Rm, op2.ToArithOption());
+ if (FlagsNZNeeded())
+ TST(rd, rd);
+ Comp_RetriveFlags(false);
+ }
+ break;
+ case 0x5:
+ Comp_Arithmetic(0x5, true, rd, rd, Op2(rs));
+ break;
+ case 0x6:
+ Comp_Arithmetic(0x6, true, rd, rd, Op2(rs));
+ break;
+ case 0x8:
+ Comp_Compare(0x8, rd, Op2(rs));
+ break;
+ case 0x9:
+ Comp_Arithmetic(0x3, true, rd, rs, Op2(0));
+ break;
+ case 0xA:
+ Comp_Compare(0xA, rd, Op2(rs));
+ break;
+ case 0xB:
+ Comp_Compare(0xB, rd, Op2(rs));
+ break;
+ case 0xC:
+ Comp_Logical(0xC, true, rd, rd, Op2(rs));
+ break;
+ case 0xD:
+ Comp_Mul_Mla(true, false, rd, rd, rs, INVALID_REG);
+ break;
+ case 0xE:
+ Comp_Logical(0xE, true, rd, rd, Op2(rs));
+ break;
+ case 0xF:
+ MVN(rd, rs);
+ if (FlagsNZNeeded())
+ TST(rd, rd);
+ Comp_RetriveFlags(false);
+ break;
+ }
+}
+
+void Compiler::T_Comp_ALU_HiReg()
+{
+ u32 rd = ((CurInstr.Instr & 0x7) | ((CurInstr.Instr >> 4) & 0x8));
+ ARM64Reg rdMapped = MapReg(rd);
+ ARM64Reg rs = MapReg((CurInstr.Instr >> 3) & 0xF);
+
+ u32 op = (CurInstr.Instr >> 8) & 0x3;
+
+ Comp_AddCycles_C();
+
+ switch (op)
+ {
+ case 0:
+ Comp_Arithmetic(0x4, false, rdMapped, rdMapped, Op2(rs));
+ break;
+ case 1:
+ Comp_Compare(0xA, rdMapped, rs);
+ return;
+ case 2:
+ MOV(rdMapped, rs);
+ break;
+ }
+
+ if (rd == 15)
+ {
+ Comp_JumpTo(rdMapped, false, false);
+ }
+}
+
+void Compiler::T_Comp_AddSP()
+{
+ Comp_AddCycles_C();
+
+ ARM64Reg sp = MapReg(13);
+ u32 offset = (CurInstr.Instr & 0x7F) << 2;
+ if (CurInstr.Instr & (1 << 7))
+ SUB(sp, sp, offset);
+ else
+ ADD(sp, sp, offset);
+}
+
+void Compiler::T_Comp_RelAddr()
+{
+ Comp_AddCycles_C();
+
+ ARM64Reg rd = MapReg(CurInstr.T_Reg(8));
+ u32 offset = (CurInstr.Instr & 0xFF) << 2;
+ if (CurInstr.Instr & (1 << 11))
+ {
+ ARM64Reg sp = MapReg(13);
+ ADD(rd, sp, offset);
+ }
+ else
+ MOVI2R(rd, (R15 & ~2) + offset);
+}
+
+} \ No newline at end of file
diff --git a/src/ARMJIT_A64/ARMJIT_Branch.cpp b/src/ARMJIT_A64/ARMJIT_Branch.cpp
new file mode 100644
index 0000000..542f0b7
--- /dev/null
+++ b/src/ARMJIT_A64/ARMJIT_Branch.cpp
@@ -0,0 +1,452 @@
+#include "ARMJIT_Compiler.h"
+
+using namespace Arm64Gen;
+
+// hack
+const int kCodeCacheTiming = 3;
+
+namespace ARMJIT
+{
+
+template <typename T>
+void jumpToTrampoline(T* cpu, u32 addr, bool changeCPSR)
+{
+ cpu->JumpTo(addr, changeCPSR);
+}
+
+void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles)
+{
+ // we can simplify constant branches by a lot
+ // it's not completely safe to make assumptions (e.g. about which instructions to preload),
+ // so we'll see how this works out
+
+ IrregularCycles = true;
+
+ u32 newPC;
+ u32 cycles = 0;
+ bool setupRegion = false;
+
+ if (addr & 0x1 && !Thumb)
+ {
+ CPSRDirty = true;
+ ORRI2R(RCPSR, RCPSR, 0x20);
+ }
+ else if (!(addr & 0x1) && Thumb)
+ {
+ CPSRDirty = true;
+ ANDI2R(RCPSR, RCPSR, ~0x20);
+ }
+
+ if (Num == 0)
+ {
+ ARMv5* cpu9 = (ARMv5*)CurCPU;
+
+ u32 oldregion = R15 >> 24;
+ u32 newregion = addr >> 24;
+
+ u32 regionCodeCycles = cpu9->MemTimings[addr >> 12][0];
+ u32 compileTimeCodeCycles = cpu9->RegionCodeCycles;
+ cpu9->RegionCodeCycles = regionCodeCycles;
+
+ MOVI2R(W0, regionCodeCycles);
+ STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARMv5, RegionCodeCycles));
+
+ setupRegion = newregion != oldregion;
+ if (setupRegion)
+ cpu9->SetupCodeMem(addr);
+
+ if (addr & 0x1)
+ {
+ addr &= ~0x1;
+ newPC = addr+2;
+
+ // two-opcodes-at-once fetch
+ // doesn't matter if we put garbage in the MSbs there
+ if (addr & 0x2)
+ {
+ cpu9->CodeRead32(addr-2, true) >> 16;
+ cycles += cpu9->CodeCycles;
+ cpu9->CodeRead32(addr+2, false);
+ cycles += CurCPU->CodeCycles;
+ }
+ else
+ {
+ cpu9->CodeRead32(addr, true);
+ cycles += cpu9->CodeCycles;
+ }
+ }
+ else
+ {
+ addr &= ~0x3;
+ newPC = addr+4;
+
+ cpu9->CodeRead32(addr, true);
+ cycles += cpu9->CodeCycles;
+ cpu9->CodeRead32(addr+4, false);
+ cycles += cpu9->CodeCycles;
+ }
+
+ cpu9->RegionCodeCycles = compileTimeCodeCycles;
+ if (setupRegion)
+ cpu9->SetupCodeMem(R15);
+ }
+ else
+ {
+ ARMv4* cpu7 = (ARMv4*)CurCPU;
+
+ u32 codeRegion = addr >> 24;
+ u32 codeCycles = addr >> 15; // cheato
+
+ cpu7->CodeRegion = codeRegion;
+ cpu7->CodeCycles = codeCycles;
+
+ MOVI2R(W0, codeRegion);
+ STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, CodeRegion));
+ MOVI2R(W0, codeCycles);
+ STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, CodeCycles));
+
+ if (addr & 0x1)
+ {
+ addr &= ~0x1;
+ newPC = addr+2;
+
+ // this is necessary because of the ARM7 BIOS protection
+ u32 compileTimePC = CurCPU->R[15];
+ CurCPU->R[15] = newPC;
+
+ cycles += NDS::ARM7MemTimings[codeCycles][0] + NDS::ARM7MemTimings[codeCycles][1];
+
+ CurCPU->R[15] = compileTimePC;
+ }
+ else
+ {
+ addr &= ~0x3;
+ newPC = addr+4;
+
+ u32 compileTimePC = CurCPU->R[15];
+ CurCPU->R[15] = newPC;
+
+ cycles += NDS::ARM7MemTimings[codeCycles][2] + NDS::ARM7MemTimings[codeCycles][3];
+
+ CurCPU->R[15] = compileTimePC;
+ }
+
+ cpu7->CodeRegion = R15 >> 24;
+ cpu7->CodeCycles = addr >> 15;
+ }
+
+ if (Exit)
+ {
+ MOVI2R(W0, newPC);
+ STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, R[15]));
+ }
+ if ((Thumb || CurInstr.Cond() >= 0xE) && !forceNonConstantCycles)
+ ConstantCycles += cycles;
+ else
+ ADD(RCycles, RCycles, cycles);
+}
+
+
+void* Compiler::Gen_JumpTo9(int kind)
+{
+ AlignCode16();
+ void* res = GetRXPtr();
+
+ MOVI2R(W2, kCodeCacheTiming);
+ // W1 - code cycles non branch
+ // W2 - branch code cycles
+ LSR(W1, W0, 12);
+ LSL(W1, W1, 2);
+ ADDI2R(W1, W1, offsetof(ARMv5, MemTimings), W2);
+ LDRB(W1, RCPU, W1);
+
+ LDR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARMv5, ITCMSize));
+
+ STR(INDEX_UNSIGNED, W1, RCPU, offsetof(ARMv5, RegionCodeCycles));
+
+ CMP(W0, W3);
+ FixupBranch outsideITCM = B(CC_LO);
+ MOVI2R(W1, 1);
+ MOVI2R(W2, 1);
+ SetJumpTarget(outsideITCM);
+
+ FixupBranch switchToThumb;
+ if (kind == 0)
+ switchToThumb = TBNZ(W0, 0);
+
+ if (kind == 0 || kind == 1)
+ {
+ ANDI2R(W0, W0, ~3);
+
+ if (kind == 0)
+ ANDI2R(RCPSR, RCPSR, ~0x20);
+
+ ADD(W3, W0, 4);
+ STR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARM, R[15]));
+
+ ADD(W1, W1, W2);
+ ADD(RCycles, RCycles, W1);
+
+ RET();
+ }
+ if (kind == 0 || kind == 2)
+ {
+ if (kind == 0)
+ {
+ SetJumpTarget(switchToThumb);
+
+ ORRI2R(RCPSR, RCPSR, 0x20);
+ }
+
+ ANDI2R(W0, W0, ~1);
+
+ ADD(W3, W0, 2);
+ STR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARM, R[15]));
+
+ FixupBranch halfwordLoc = TBZ(W0, 1);
+ ADD(W1, W1, W2);
+ ADD(RCycles, RCycles, W1);
+ RET();
+
+ SetJumpTarget(halfwordLoc);
+ ADD(RCycles, RCycles, W2);
+ RET();
+ }
+
+ return res;
+}
+
+void* Compiler::Gen_JumpTo7(int kind)
+{
+ void* res = GetRXPtr();
+
+ LSR(W1, W0, 24);
+ STR(INDEX_UNSIGNED, W1, RCPU, offsetof(ARM, CodeRegion));
+ LSR(W1, W0, 15);
+ STR(INDEX_UNSIGNED, W1, RCPU, offsetof(ARM, CodeCycles));
+
+ MOVP2R(X2, NDS::ARM7MemTimings);
+ LDR(W3, X2, ArithOption(W1, true));
+
+ FixupBranch switchToThumb;
+ if (kind == 0)
+ switchToThumb = TBNZ(W0, 0);
+
+ if (kind == 0 || kind == 1)
+ {
+ UBFX(W2, W3, 0, 8);
+ UBFX(W3, W3, 8, 8);
+ ADD(W2, W3, W2);
+ ADD(RCycles, RCycles, W2);
+
+ ANDI2R(W0, W0, ~3);
+
+ if (kind == 0)
+ ANDI2R(RCPSR, RCPSR, ~0x20);
+
+ ADD(W3, W0, 4);
+ STR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARM, R[15]));
+
+ RET();
+ }
+ if (kind == 0 || kind == 2)
+ {
+ if (kind == 0)
+ {
+ SetJumpTarget(switchToThumb);
+
+ ORRI2R(RCPSR, RCPSR, 0x20);
+ }
+
+ UBFX(W2, W3, 16, 8);
+ UBFX(W3, W3, 24, 8);
+ ADD(W2, W3, W2);
+ ADD(RCycles, RCycles, W2);
+
+ ANDI2R(W0, W0, ~1);
+
+ ADD(W3, W0, 2);
+ STR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARM, R[15]));
+
+ RET();
+ }
+
+ return res;
+}
+
+void Compiler::Comp_JumpTo(Arm64Gen::ARM64Reg addr, bool switchThumb, bool restoreCPSR)
+{
+ IrregularCycles = true;
+
+ if (!restoreCPSR)
+ {
+ if (switchThumb)
+ CPSRDirty = true;
+ MOV(W0, addr);
+ BL((Num ? JumpToFuncs7 : JumpToFuncs9)[switchThumb ? 0 : (Thumb + 1)]);
+ }
+ else
+ {
+ BitSet16 hiRegsLoaded(RegCache.DirtyRegs & 0xFF00);
+ bool previouslyDirty = CPSRDirty;
+ SaveCPSR();
+
+ if (restoreCPSR)
+ {
+ if (Thumb || CurInstr.Cond() >= 0xE)
+ RegCache.Flush();
+ else
+ {
+ // the ugly way...
+ // we only save them here, then load and save them again afterwards
+ for (int reg : hiRegsLoaded)
+ SaveReg(reg, RegCache.Mapping[reg]);
+ }
+ }
+
+ if (switchThumb)
+ MOV(W1, addr);
+ else
+ {
+ if (Thumb)
+ ORRI2R(W1, addr, 1);
+ else
+ ANDI2R(W1, addr, ~1);
+ }
+ MOV(X0, RCPU);
+ MOVI2R(W2, restoreCPSR);
+ if (Num == 0)
+ QuickCallFunction(X3, jumpToTrampoline<ARMv5>);
+ else
+ QuickCallFunction(X3, jumpToTrampoline<ARMv4>);
+
+ if (!Thumb && restoreCPSR && CurInstr.Cond() < 0xE)
+ {
+ for (int reg : hiRegsLoaded)
+ LoadReg(reg, RegCache.Mapping[reg]);
+ }
+
+ if (previouslyDirty)
+ LoadCPSR();
+ CPSRDirty = previouslyDirty;
+ }
+}
+
+void Compiler::A_Comp_BranchImm()
+{
+ int op = (CurInstr.Instr >> 24) & 1;
+ s32 offset = (s32)(CurInstr.Instr << 8) >> 6;
+ u32 target = R15 + offset;
+ bool link = op;
+
+ if (CurInstr.Cond() == 0xF) // BLX_imm
+ {
+ target += (op << 1) + 1;
+ link = true;
+ }
+
+ if (link)
+ MOVI2R(MapReg(14), R15 - 4);
+
+ Comp_JumpTo(target);
+}
+
+void Compiler::A_Comp_BranchXchangeReg()
+{
+ ARM64Reg rn = MapReg(CurInstr.A_Reg(0));
+ MOV(W0, rn);
+ if ((CurInstr.Instr & 0xF0) == 0x30) // BLX_reg
+ MOVI2R(MapReg(14), R15 - 4);
+ Comp_JumpTo(W0, true);
+}
+
+void Compiler::T_Comp_BCOND()
+{
+ u32 cond = (CurInstr.Instr >> 8) & 0xF;
+ FixupBranch skipExecute = CheckCondition(cond);
+
+ s32 offset = (s32)(CurInstr.Instr << 24) >> 23;
+ Comp_JumpTo(R15 + offset + 1, true);
+
+ Comp_BranchSpecialBehaviour();
+
+ FixupBranch skipFailed = B();
+ SetJumpTarget(skipExecute);
+ Comp_AddCycles_C(true);
+
+ if (CurInstr.BranchFlags & branch_FollowCondTaken)
+ {
+ SaveCPSR(false);
+ RegCache.PrepareExit();
+
+ ADD(W0, RCycles, ConstantCycles);
+ ABI_PopRegisters(SavedRegs);
+ RET();
+ }
+
+ SetJumpTarget(skipFailed);
+}
+
+void Compiler::T_Comp_B()
+{
+ s32 offset = (s32)((CurInstr.Instr & 0x7FF) << 21) >> 20;
+ Comp_JumpTo(R15 + offset + 1);
+}
+
+void Compiler::T_Comp_BranchXchangeReg()
+{
+ bool link = CurInstr.Instr & (1 << 7);
+
+ if (link)
+ {
+ if (Num == 1)
+ {
+ printf("BLX unsupported on ARM7!!!\n");
+ return;
+ }
+ MOV(W0, MapReg(CurInstr.A_Reg(3)));
+ MOVI2R(MapReg(14), R15 - 1);
+ Comp_JumpTo(W0, true);
+ }
+ else
+ {
+ ARM64Reg rn = MapReg(CurInstr.A_Reg(3));
+ Comp_JumpTo(rn, true);
+ }
+}
+
+void Compiler::T_Comp_BL_LONG_1()
+{
+ s32 offset = (s32)((CurInstr.Instr & 0x7FF) << 21) >> 9;
+ MOVI2R(MapReg(14), R15 + offset);
+ Comp_AddCycles_C();
+}
+
+void Compiler::T_Comp_BL_LONG_2()
+{
+ ARM64Reg lr = MapReg(14);
+ s32 offset = (CurInstr.Instr & 0x7FF) << 1;
+ ADD(W0, lr, offset);
+ MOVI2R(lr, (R15 - 2) | 1);
+ Comp_JumpTo(W0, Num == 0 && !(CurInstr.Instr & (1 << 12)));
+}
+
+void Compiler::T_Comp_BL_Merged()
+{
+ Comp_AddCycles_C();
+
+ R15 += 2;
+
+ u32 upperPart = CurInstr.Instr >> 16;
+ u32 target = (R15 - 2) + ((s32)((CurInstr.Instr & 0x7FF) << 21) >> 9);
+ target += (upperPart & 0x7FF) << 1;
+
+ if (Num == 1 || upperPart & (1 << 12))
+ target |= 1;
+
+ MOVI2R(MapReg(14), (R15 - 2) | 1);
+
+ Comp_JumpTo(target);
+}
+
+} \ No newline at end of file
diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.cpp b/src/ARMJIT_A64/ARMJIT_Compiler.cpp
new file mode 100644
index 0000000..89d0029
--- /dev/null
+++ b/src/ARMJIT_A64/ARMJIT_Compiler.cpp
@@ -0,0 +1,707 @@
+#include "ARMJIT_Compiler.h"
+
+#include "../ARMInterpreter.h"
+
+#include "../ARMJIT_Internal.h"
+
+#ifdef __SWITCH__
+#include "../switch/compat_switch.h"
+
+extern char __start__;
+#endif
+
+#include <malloc.h>
+
+using namespace Arm64Gen;
+
+
+namespace ARMJIT
+{
+
+/*
+
+ Recompiling classic ARM to ARMv8 code is at the same time
+ easier and trickier than compiling to a less related architecture
+ like x64. At one hand you can translate a lot of instructions directly.
+ But at the same time, there are a ton of exceptions, like for
+ example ADD and SUB can't have a RORed second operand on ARMv8.
+ */
+
// Registers handed out by the register cache: W19-W26, which are
// callee-saved under the AArch64 procedure call standard, so cached guest
// registers survive calls into the interpreter/memory helpers.
template <>
const ARM64Reg RegisterCache<Compiler, ARM64Reg>::NativeRegAllocOrder[] =
    {W19, W20, W21, W22, W23, W24, W25, W26};
template <>
const int RegisterCache<Compiler, ARM64Reg>::NativeRegsAvailable = 8;

// total size of the JIT code region
const int JitMemSize = 16 * 1024 * 1024;
+
+void Compiler::MovePC()
+{
+ ADD(MapReg(15), MapReg(15), Thumb ? 2 : 4);
+}
+
// Sets up the JIT: maps the RW/RX code region (Switch-specific), generates
// the shared memory/branch helper routines and the banked-register
// accessors, then rebases the emitter past them.
Compiler::Compiler()
{
#ifdef __SWITCH__
    JitRWBase = memalign(0x1000, JitMemSize);

    // search downwards from the executable for an unmapped window large
    // enough to host the executable alias of the JIT region
    JitRXStart = (u8*)&__start__ - JitMemSize - 0x1000;
    JitRWStart = virtmemReserve(JitMemSize);
    MemoryInfo info = {0};
    u32 pageInfo = {0};
    int i = 0;
    while (JitRXStart != NULL)
    {
        svcQueryMemory(&info, &pageInfo, (u64)JitRXStart);
        if (info.type != MemType_Unmapped)
            JitRXStart = (void*)((u8*)info.addr - JitMemSize - 0x1000);
        else
            break;
        if (i++ > 8)
        {
            printf("couldn't find unmapped place for jit memory\n");
            JitRXStart = NULL;
        }
    }

    assert(JitRXStart != NULL);

    // map the backing buffer as executable code memory, then create a
    // writable alias so we can emit into it (W^X requirement)
    bool succeded = R_SUCCEEDED(svcMapProcessCodeMemory(envGetOwnProcessHandle(), (u64)JitRXStart, (u64)JitRWBase, JitMemSize));
    assert(succeded);
    succeded = R_SUCCEEDED(svcSetProcessMemoryPermission(envGetOwnProcessHandle(), (u64)JitRXStart, JitMemSize, Perm_Rx));
    assert(succeded);
    succeded = R_SUCCEEDED(svcMapProcessMemory(JitRWStart, envGetOwnProcessHandle(), (u64)JitRXStart, JitMemSize));
    assert(succeded);

    SetCodeBase((u8*)JitRWStart, (u8*)JitRXStart);
    JitMemUseableSize = JitMemSize;
    Reset();
#endif

    // ARM9 memory accessors, one per access size (8/16/32) and direction
    for (int i = 0; i < 3; i++)
    {
        for (int j = 0; j < 2; j++)
        {
            MemFunc9[i][j] = Gen_MemoryRoutine9(8 << i, j);
        }
    }
    // the ARM7 has no TCM, so the generic bus functions are called directly
    MemFunc7[0][0] = (void*)NDS::ARM7Read8;
    MemFunc7[1][0] = (void*)NDS::ARM7Read16;
    MemFunc7[2][0] = (void*)NDS::ARM7Read32;
    MemFunc7[0][1] = (void*)NDS::ARM7Write8;
    MemFunc7[1][1] = (void*)NDS::ARM7Write16;
    MemFunc7[2][1] = (void*)NDS::ARM7Write32;

    // sequential (LDM/STM) accessors: [store][pre increment]
    for (int i = 0; i < 2; i++)
    {
        for (int j = 0; j < 2; j++)
        {
            MemFuncsSeq9[i][j] = Gen_MemoryRoutine9Seq(i, j);
            MemFuncsSeq7[i][j] = Gen_MemoryRoutine7Seq(i, j);
        }
    }

    // branch helpers: 0 = may switch mode, 1 = stay ARM, 2 = stay Thumb
    for (int i = 0; i < 3; i++)
    {
        JumpToFuncs9[i] = Gen_JumpTo9(i);
        JumpToFuncs7[i] = Gen_JumpTo7(i);
    }

    /*
        banked register accessors
        W0 - mode
        W1 - reg num
        W3 - in/out value of reg
    */
    {
        ReadBanked = GetRXPtr();

        ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2));
        CMP(W0, 0x11); // FIQ mode banks R8-R14
        FixupBranch fiq = B(CC_EQ);
        SUBS(W1, W1, 13 - 8); // other modes only bank R13/R14
        ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2));
        FixupBranch notEverything = B(CC_LT);
        CMP(W0, 0x12);
        FixupBranch irq = B(CC_EQ);
        CMP(W0, 0x13);
        FixupBranch svc = B(CC_EQ);
        CMP(W0, 0x17);
        FixupBranch abt = B(CC_EQ);
        CMP(W0, 0x1B);
        FixupBranch und = B(CC_EQ);
        SetJumpTarget(notEverything);
        RET();

        SetJumpTarget(fiq);
        LDR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_FIQ));
        RET();
        SetJumpTarget(irq);
        LDR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_IRQ));
        RET();
        SetJumpTarget(svc);
        LDR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_SVC));
        RET();
        SetJumpTarget(abt);
        LDR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_ABT));
        RET();
        SetJumpTarget(und);
        LDR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_UND));
        RET();
    }
    {
        // same dispatch as ReadBanked; additionally returns in W4 whether
        // the register was actually banked (1) or not (0)
        WriteBanked = GetRXPtr();

        ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2));
        CMP(W0, 0x11);
        FixupBranch fiq = B(CC_EQ);
        SUBS(W1, W1, 13 - 8);
        ADD(X2, RCPU, X1, ArithOption(X1, ST_LSL, 2));
        FixupBranch notEverything = B(CC_LT);
        CMP(W0, 0x12);
        FixupBranch irq = B(CC_EQ);
        CMP(W0, 0x13);
        FixupBranch svc = B(CC_EQ);
        CMP(W0, 0x17);
        FixupBranch abt = B(CC_EQ);
        CMP(W0, 0x1B);
        FixupBranch und = B(CC_EQ);
        SetJumpTarget(notEverything);
        MOVI2R(W4, 0);
        RET();

        SetJumpTarget(fiq);
        STR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_FIQ));
        MOVI2R(W4, 1);
        RET();
        SetJumpTarget(irq);
        STR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_IRQ));
        MOVI2R(W4, 1);
        RET();
        SetJumpTarget(svc);
        STR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_SVC));
        MOVI2R(W4, 1);
        RET();
        SetJumpTarget(abt);
        STR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_ABT));
        MOVI2R(W4, 1);
        RET();
        SetJumpTarget(und);
        STR(INDEX_UNSIGNED, W3, X2, offsetof(ARM, R_UND));
        MOVI2R(W4, 1);
        RET();
    }

    //FlushIcache();

    // everything emitted above is permanent; shrink the usable region and
    // rebase so compiled blocks start after the helper routines
    JitMemUseableSize -= GetCodeOffset();
    SetCodeBase((u8*)GetRWPtr(), (u8*)GetRXPtr());
}
+
// Tears down the JIT memory mappings created in the constructor
// (Switch only; other platforms have nothing to release here).
Compiler::~Compiler()
{
#ifdef __SWITCH__
    if (JitRWStart != NULL)
    {
        // unmap the writable alias first, then the executable mapping
        bool succeded = R_SUCCEEDED(svcUnmapProcessMemory(JitRWStart, envGetOwnProcessHandle(), (u64)JitRXStart, JitMemSize));
        assert(succeded);
        virtmemFree(JitRWStart, JitMemSize);
        succeded = R_SUCCEEDED(svcUnmapProcessCodeMemory(envGetOwnProcessHandle(), (u64)JitRXStart, (u64)JitRWBase, JitMemSize));
        assert(succeded);
        free(JitRWBase);
    }
#endif
}
+
+void Compiler::LoadReg(int reg, ARM64Reg nativeReg)
+{
+ if (reg == 15)
+ MOVI2R(nativeReg, R15);
+ else
+ LDR(INDEX_UNSIGNED, nativeReg, RCPU, offsetof(ARM, R[reg]));
+}
+
// Emits a store of `nativeReg` back into guest register `reg` in the
// CPU struct.
void Compiler::SaveReg(int reg, ARM64Reg nativeReg)
{
    STR(INDEX_UNSIGNED, nativeReg, RCPU, offsetof(ARM, R[reg]));
}
+
// Emits a load of the guest CPSR into its dedicated register. Must only
// be called while the cached copy is clean, otherwise updates made by
// compiled code would be lost.
void Compiler::LoadCPSR()
{
    assert(!CPSRDirty);
    LDR(INDEX_UNSIGNED, RCPSR, RCPU, offsetof(ARM, CPSR));
}
+
+void Compiler::SaveCPSR(bool markClean)
+{
+ if (CPSRDirty)
+ {
+ STR(INDEX_UNSIGNED, RCPSR, RCPU, offsetof(ARM, CPSR));
+ CPSRDirty = CPSRDirty && !markClean;
+ }
+}
+
// Emits a runtime test of ARM condition `cond` against the cached CPSR.
// Returns a branch that is taken when the condition FAILS (i.e. the
// instruction must be skipped).
FixupBranch Compiler::CheckCondition(u32 cond)
{
    if (cond >= 0x8)
    {
        // conditions 0x8..0xD depend on several flags: index into the
        // precomputed ConditionTable bitmask with the NZCV nibble
        LSR(W1, RCPSR, 28);
        MOVI2R(W2, 1);
        LSLV(W2, W2, W1);
        ANDI2R(W2, W2, ARM::ConditionTable[cond], W3);

        return CBZ(W2);
    }
    else
    {
        // conditions 0x0..0x7 test a single CPSR flag bit; the expression
        // maps the condition number to the flag's bit position (N=31, Z=30,
        // C=29, V=28), and the low condition bit selects set vs. clear
        u8 bit = (28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1)));

        if (cond & 1)
            return TBNZ(RCPSR, bit);
        else
            return TBZ(RCPSR, bit);
    }
}
+
// Dispatch table mapping every ARM instruction kind to its compile
// function; NULL entries fall back to the interpreter. The entries per
// group correspond to the different operand-2 variants of each opcode.
#define F(x) &Compiler::A_Comp_##x
const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] =
{
    // AND
    F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp),
    F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp),
    // EOR
    F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp),
    F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp),
    // SUB
    F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp),
    F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp),
    // RSB
    F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp),
    F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp),
    // ADD
    F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp),
    F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp),
    // ADC
    F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp),
    F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp),
    // SBC
    F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp),
    F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp),
    // RSC
    F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp),
    F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp),
    // ORR
    F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp),
    F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp),
    // MOV
    F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp),
    F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp),
    // BIC
    F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp),
    F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp), F(ALUTriOp),
    // MVN
    F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp),
    F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp), F(ALUMovOp),
    // TST
    F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp),
    // TEQ
    F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp),
    // CMP
    F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp),
    // CMN
    F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp), F(ALUCmpOp),
    // Mul
    F(Mul), F(Mul), F(Mul_Long), F(Mul_Long), F(Mul_Long), F(Mul_Long), NULL, NULL, NULL, NULL, NULL,
    // ARMv5 exclusives
    F(Clz), NULL, NULL, NULL, NULL,

    // STR
    F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB),
    // STRB
    F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB),
    // LDR
    F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB),
    // LDRB
    F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB), F(MemWB),
    // STRH
    F(MemHD), F(MemHD), F(MemHD), F(MemHD),
    // LDRD
    NULL, NULL, NULL, NULL,
    // STRD
    NULL, NULL, NULL, NULL,
    // LDRH
    F(MemHD), F(MemHD), F(MemHD), F(MemHD),
    // LDRSB
    F(MemHD), F(MemHD), F(MemHD), F(MemHD),
    // LDRSH
    F(MemHD), F(MemHD), F(MemHD), F(MemHD),
    // Swap
    NULL, NULL,
    // LDM, STM
    F(LDM_STM), F(LDM_STM),
    // Branch
    F(BranchImm), F(BranchImm), F(BranchImm), F(BranchXchangeReg), F(BranchXchangeReg),
    // Special
    NULL, NULL, NULL, NULL, NULL, NULL, NULL
};
#undef F
// Dispatch table mapping every Thumb instruction kind to its compile
// function; NULL entries fall back to the interpreter.
#define F(x) &Compiler::T_Comp_##x
const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] =
{
    // Shift imm
    F(ShiftImm), F(ShiftImm), F(ShiftImm),
    // Add/sub tri operand
    F(AddSub_), F(AddSub_), F(AddSub_), F(AddSub_),
    // 8 bit imm
    F(ALUImm8), F(ALUImm8), F(ALUImm8), F(ALUImm8),
    // ALU
    F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU),
    F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU), F(ALU),
    // ALU hi reg
    F(ALU_HiReg), F(ALU_HiReg), F(ALU_HiReg),
    // PC/SP relative ops
    F(RelAddr), F(RelAddr), F(AddSP),
    // LDR PC rel
    F(LoadPCRel),
    // LDR/STR reg offset
    F(MemReg), F(MemReg), F(MemReg), F(MemReg),
    // LDR/STR sign extended, half
    F(MemRegHalf), F(MemRegHalf), F(MemRegHalf), F(MemRegHalf),
    // LDR/STR imm offset
    F(MemImm), F(MemImm), F(MemImm), F(MemImm),
    // LDR/STR half imm offset
    F(MemImmHalf), F(MemImmHalf),
    // LDR/STR sp rel
    F(MemSPRel), F(MemSPRel),
    // PUSH/POP
    F(PUSH_POP), F(PUSH_POP),
    // LDMIA, STMIA
    F(LDMIA_STMIA), F(LDMIA_STMIA),
    // Branch
    F(BCOND), F(BranchXchangeReg), F(BranchXchangeReg), F(B), F(BL_LONG_1), F(BL_LONG_2),
    // Unk, SVC
    NULL, NULL,
    F(BL_Merged)
};
+
+bool Compiler::CanCompile(bool thumb, u16 kind)
+{
+ return (thumb ? T_Comp[kind] : A_Comp[kind]) != NULL;
+}
+
// Emits the extra code some branches need: flagging detected idle loops,
// and exiting the block when the following instructions were compiled only
// for the branch-not-taken path.
void Compiler::Comp_BranchSpecialBehaviour()
{
    if (CurInstr.BranchFlags & branch_IdleBranch)
    {
        MOVI2R(W0, 1);
        STRB(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, IdleLoop));
    }

    if (CurInstr.BranchFlags & branch_FollowCondNotTaken)
    {
        // standard block epilogue: store CPSR, flush registers,
        // return accumulated cycles in W0
        SaveCPSR(false);
        RegCache.PrepareExit();
        ADD(W0, RCycles, ConstantCycles);
        ABI_PopRegisters(SavedRegs);
        RET();
    }
}
+
// Compiles a fetched block of guest instructions into native code and
// returns the entry point. Instructions without a compile function are
// executed through the interpreter; conditional ARM instructions get a
// runtime condition check wrapped around them.
JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount)
{
    // keep some headroom; when the code region is exhausted, throw
    // everything away and start over
    if (JitMemUseableSize - GetCodeOffset() < 1024 * 16)
    {
        printf("JIT memory full, resetting...\n");
        ResetBlockCache();
    }

    JitBlockEntry res = (JitBlockEntry)GetRXPtr();

    Thumb = thumb;
    Num = cpu->Num;
    CurCPU = cpu;
    ConstantCycles = 0;
    RegCache = RegisterCache<Compiler, ARM64Reg>(this, instrs, instrsCount, true);

    //printf("compiling block at %x\n", R15 - (Thumb ? 2 : 4));
    const u32 ALL_CALLEE_SAVED = 0x7FF80000;

    // push only the callee-saved registers the register cache will use
    SavedRegs = BitSet32((RegCache.GetPushRegs() | BitSet32(0x78000000)) & BitSet32(ALL_CALLEE_SAVED));

    // block prologue: save host registers, load the CPU pointer and CPSR
    //if (Num == 1)
    {
        ABI_PushRegisters(SavedRegs);

        MOVP2R(RCPU, CurCPU);
        MOVI2R(RCycles, 0);

        LoadCPSR();
    }

    for (int i = 0; i < instrsCount; i++)
    {
        CurInstr = instrs[i];
        R15 = CurInstr.Addr + (Thumb ? 4 : 8); // pipelined PC value
        CodeRegion = R15 >> 24;

        CompileFunc comp = Thumb
            ? T_Comp[CurInstr.Info.Kind]
            : A_Comp[CurInstr.Info.Kind];

        Exit = i == (instrsCount - 1) || (CurInstr.BranchFlags & branch_FollowCondNotTaken);

        //printf("%x instr %x regs: r%x w%x n%x flags: %x %x %x\n", R15, CurInstr.Instr, CurInstr.Info.SrcRegs, CurInstr.Info.DstRegs, CurInstr.Info.ReadFlags, CurInstr.Info.NotStrictlyNeeded, CurInstr.Info.WriteFlags, CurInstr.SetFlags);

        bool isConditional = Thumb ? CurInstr.Info.Kind == ARMInstrInfo::tk_BCOND : CurInstr.Cond() < 0xE;
        // sync the in-memory CPU state when falling back to the
        // interpreter or when the block may be left after this instruction
        if (comp == NULL || (CurInstr.BranchFlags & branch_FollowCondTaken) || (i == instrsCount - 1 && (!CurInstr.Info.Branches() || isConditional)))
        {
            MOVI2R(W0, R15);
            STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, R[15]));
            if (comp == NULL)
            {
                MOVI2R(W0, CurInstr.Instr);
                STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, CurInstr));
            }
            if (Num == 0)
            {
                MOVI2R(W0, (s32)CurInstr.CodeCycles);
                STR(INDEX_UNSIGNED, W0, RCPU, offsetof(ARM, CodeCycles));
            }
        }

        if (comp == NULL)
        {
            // interpreter fallback needs the real CPSR and register values
            SaveCPSR();
            RegCache.Flush();
        }
        else
            RegCache.Prepare(Thumb, i);

        if (Thumb)
        {
            if (comp == NULL)
            {
                MOV(X0, RCPU);
                QuickCallFunction(X1, InterpretTHUMB[CurInstr.Info.Kind]);
            }
            else
                (this->*comp)();
        }
        else
        {
            u32 cond = CurInstr.Cond();
            if (CurInstr.Info.Kind == ARMInstrInfo::ak_BLX_IMM)
            {
                // BLX_IMM uses the 0xF condition encoding, so it must be
                // handled before the generic condition handling below
                if (comp)
                    (this->*comp)();
                else
                {
                    MOV(X0, RCPU);
                    QuickCallFunction(X1, ARMInterpreter::A_BLX_IMM);
                }
            }
            else if (cond == 0xF)
                Comp_AddCycles_C(); // never executed, only costs fetch cycles
            else
            {
                IrregularCycles = false;

                FixupBranch skipExecute;
                if (cond < 0xE)
                    skipExecute = CheckCondition(cond);

                if (comp == NULL)
                {
                    MOV(X0, RCPU);
                    QuickCallFunction(X1, InterpretARM[CurInstr.Info.Kind]);
                }
                else
                {
                    (this->*comp)();
                }

                Comp_BranchSpecialBehaviour();

                if (cond < 0xE)
                {
                    // when skipped instructions cost different cycles than
                    // executed ones, emit a separate cycle-charging path
                    if (IrregularCycles)
                    {
                        FixupBranch skipNop = B();
                        SetJumpTarget(skipExecute);

                        Comp_AddCycles_C();

                        if (CurInstr.BranchFlags & branch_FollowCondTaken)
                        {
                            SaveCPSR(false);
                            RegCache.PrepareExit();
                            ADD(W0, RCycles, ConstantCycles);
                            ABI_PopRegisters(SavedRegs);
                            RET();
                        }

                        SetJumpTarget(skipNop);
                    }
                    else
                        SetJumpTarget(skipExecute);
                }

            }
        }

        // the interpreter may have changed the CPSR (e.g. mode switches)
        if (comp == NULL)
            LoadCPSR();
    }

    RegCache.Flush();

    // block epilogue: return the accumulated cycle count in W0
    //if (Num == 1)
    {
        SaveCPSR();

        ADD(W0, RCycles, ConstantCycles);

        ABI_PopRegisters(SavedRegs);
    }
    //else
    //    ADD(RCycles, RCycles, ConstantCycles);

    RET();

    FlushIcache();

    //printf("finished\n");

    return res;
}
+
// Discards all emitted code and fills the usable region with brk #0
// instructions so any jump into stale code traps immediately.
void Compiler::Reset()
{
    SetCodePtr(0);

    const u32 brk_0 = 0xD4200000; // AArch64 "brk #0" encoding

    for (int i = 0; i < JitMemUseableSize / 4; i++)
        *(((u32*)GetRWPtr()) + i) = brk_0;
}
+
// Charges the code-fetch cycles of the current instruction. When possible
// the amount is folded into ConstantCycles (added once at block exit)
// instead of emitting an ADD.
void Compiler::Comp_AddCycles_C(bool nonConst)
{
    s32 cycles = Num ?
        NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 1 : 3]
        : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles);

    // branches may leave the block, so their cycles can't be deferred
    if (!nonConst && !CurInstr.Info.Branches())
        ConstantCycles += cycles;
    else
        ADD(RCycles, RCycles, cycles);
}
+
// Charges code-fetch plus `numI` internal cycles. Conditional ARM
// instructions must charge at runtime, since the skip path pays different
// cycles.
void Compiler::Comp_AddCycles_CI(u32 numI)
{
    s32 cycles = (Num ?
        NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]
        : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles)) + numI;

    if (Thumb || CurInstr.Cond() >= 0xE)
        ConstantCycles += cycles;
    else
        ADD(RCycles, RCycles, cycles);
}
+
// Charges code-fetch plus `c` constant internal cycles plus a
// register-held (shifted) internal cycle count that is only known at
// runtime.
void Compiler::Comp_AddCycles_CI(u32 c, ARM64Reg numI, ArithOption shift)
{
    s32 cycles = (Num ?
        NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]
        : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles)) + c;

    // the runtime part always goes into RCycles
    ADD(RCycles, RCycles, numI, shift);
    if (Thumb || CurInstr.Cond() >= 0xE)
        ConstantCycles += c;
    else
        ADD(RCycles, RCycles, cycles);
}
+
// Charges cycles for a load instruction (code + data + interlock). On the
// ARM7 the code and data accesses overlap differently depending on whether
// either of them targets main RAM.
void Compiler::Comp_AddCycles_CDI()
{
    if (Num == 0)
        Comp_AddCycles_CD();
    else
    {
        IrregularCycles = true;

        s32 cycles;

        s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2];
        s32 numD = CurInstr.DataCycles;

        if (CurInstr.DataRegion == 0x02) // mainRAM
        {
            if (CodeRegion == 0x02)
                cycles = numC + numD; // both on the same bus: fully serialised
            else
            {
                numC++;
                cycles = std::max(numC + numD - 3, std::max(numC, numD));
            }
        }
        else if (CodeRegion == 0x02)
        {
            numD++;
            cycles = std::max(numC + numD - 3, std::max(numC, numD));
        }
        else
        {
            cycles = numC + numD + 1;
        }

        // conditional ARM instructions charge at runtime so the skip path
        // can charge its own (different) amount
        if (!Thumb && CurInstr.Cond() < 0xE)
            ADD(RCycles, RCycles, cycles);
        else
            ConstantCycles += cycles;
    }
}
+
// Charges cycles for a store instruction (code + data, overlapping where
// the buses allow it).
void Compiler::Comp_AddCycles_CD()
{
    u32 cycles = 0;
    if (Num == 0)
    {
        s32 numC = (R15 & 0x2) ? 0 : CurInstr.CodeCycles;
        s32 numD = CurInstr.DataCycles;

        //if (DataRegion != CodeRegion)
            cycles = std::max(numC + numD - 6, std::max(numC, numD));

        // the skip path of a conditional instruction would only charge
        // numC, so the executed path is "irregular" if it differs
        IrregularCycles = cycles != numC;
    }
    else
    {
        s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2];
        s32 numD = CurInstr.DataCycles;

        if (CurInstr.DataRegion == 0x02)
        {
            if (CodeRegion == 0x02)
                cycles += numC + numD;
            else
                cycles += std::max(numC + numD - 3, std::max(numC, numD));
        }
        else if (CodeRegion == 0x02)
        {
            cycles += std::max(numC + numD - 3, std::max(numC, numD));
        }
        else
        {
            cycles += numC + numD;
        }

        IrregularCycles = true;
    }

    if ((!Thumb && CurInstr.Cond() < 0xE) && IrregularCycles)
        ADD(RCycles, RCycles, cycles);
    else
        ConstantCycles += cycles;
}
+
+} \ No newline at end of file
diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.h b/src/ARMJIT_A64/ARMJIT_Compiler.h
new file mode 100644
index 0000000..7e13507
--- /dev/null
+++ b/src/ARMJIT_A64/ARMJIT_Compiler.h
@@ -0,0 +1,234 @@
+#ifndef ARMJIT_COMPILER_H
+#define ARMJIT_COMPILER_H
+
+#include "../ARM.h"
+#include "../ARMJIT.h"
+
+#include "../dolphin/Arm64Emitter.h"
+
+#include "../ARMJIT_Internal.h"
+#include "../ARMJIT_RegisterCache.h"
+
+namespace ARMJIT
+{
+
// Host registers permanently reserved by the JIT: cached guest CPSR,
// accumulated cycle count, and the pointer to the emulated CPU struct.
const Arm64Gen::ARM64Reg RCPSR = Arm64Gen::W27;
const Arm64Gen::ARM64Reg RCycles = Arm64Gen::W28;
const Arm64Gen::ARM64Reg RCPU = Arm64Gen::X29;
+
// Flexible second operand of a translated data-processing instruction:
// either an immediate or a register with an optional constant shift.
struct Op2
{
    Op2()
    {}

    // plain register operand (LSL #0)
    Op2(Arm64Gen::ARM64Reg rm) : IsImm(false)
    {
        Reg.Rm = rm;
        Reg.ShiftType = Arm64Gen::ST_LSL;
        Reg.ShiftAmount = 0;
    }

    Op2(u32 imm) : IsImm(true), Imm(imm)
    {}

    // register operand with a constant shift applied
    Op2(Arm64Gen::ARM64Reg rm, Arm64Gen::ShiftType st, int amount) : IsImm(false)
    {
        Reg.Rm = rm;
        Reg.ShiftType = st;
        Reg.ShiftAmount = amount;
    }

    Arm64Gen::ArithOption ToArithOption()
    {
        assert(!IsImm);
        return Arm64Gen::ArithOption(Reg.Rm, Reg.ShiftType, Reg.ShiftAmount);
    }

    // unshifted register operand
    bool IsSimpleReg()
    { return !IsImm && !Reg.ShiftAmount && Reg.ShiftType == Arm64Gen::ST_LSL; }
    // immediate fits the unsigned 12-bit field of ARMv8 ADD/SUB immediates.
    // BUGFIX: `Imm & 0xFFF == Imm` parsed as `Imm & (0xFFF == Imm)` because
    // == binds tighter than &, which made this true only for Imm == 0xFFF.
    bool ImmFits12Bit()
    { return IsImm && ((Imm & 0xFFF) == Imm); }
    bool IsZero()
    { return IsImm && !Imm; }

    bool IsImm;
    union
    {
        struct
        {
            Arm64Gen::ARM64Reg Rm;
            Arm64Gen::ShiftType ShiftType;
            int ShiftAmount;
        } Reg;
        u32 Imm;
    };
};
+
// AArch64 JIT compiler: translates fetched guest (ARM/Thumb) instruction
// blocks into native code, using the inherited emitter to write into the
// JIT code region set up in the constructor.
class Compiler : Arm64Gen::ARM64XEmitter
{
public:
    typedef void (Compiler::*CompileFunc)();

    Compiler();
    ~Compiler();

    // returns the host register caching guest register `reg`;
    // the register must currently be mapped
    Arm64Gen::ARM64Reg MapReg(int reg)
    {
        assert(RegCache.Mapping[reg] != Arm64Gen::INVALID_REG);
        return RegCache.Mapping[reg];
    }

    JitBlockEntry CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount);

    bool CanCompile(bool thumb, u16 kind);

    // true when the current instruction's N or Z flag result is observed
    bool FlagsNZNeeded()
    {
        return CurInstr.SetFlags & 0xC;
    }

    void Reset();

    // cycle accounting helpers: C = code fetch, I = internal,
    // D = data access
    void Comp_AddCycles_C(bool forceNonConst = false);
    void Comp_AddCycles_CI(u32 numI);
    void Comp_AddCycles_CI(u32 c, Arm64Gen::ARM64Reg numI, Arm64Gen::ArithOption shift);
    void Comp_AddCycles_CD();
    void Comp_AddCycles_CDI();

    void MovePC();

    void LoadReg(int reg, Arm64Gen::ARM64Reg nativeReg);
    void SaveReg(int reg, Arm64Gen::ARM64Reg nativeReg);

    void LoadCPSR();
    void SaveCPSR(bool markClean = true);

    // ARM instruction compilers
    void A_Comp_ALUTriOp();
    void A_Comp_ALUMovOp();
    void A_Comp_ALUCmpOp();

    void A_Comp_Mul();
    void A_Comp_Mul_Long();

    void A_Comp_Clz();

    void A_Comp_MemWB();
    void A_Comp_MemHD();

    void A_Comp_LDM_STM();

    void A_Comp_BranchImm();
    void A_Comp_BranchXchangeReg();


    // Thumb instruction compilers
    void T_Comp_ShiftImm();
    void T_Comp_AddSub_();
    void T_Comp_ALUImm8();
    void T_Comp_ALU();
    void T_Comp_ALU_HiReg();
    void T_Comp_AddSP();
    void T_Comp_RelAddr();

    void T_Comp_MemReg();
    void T_Comp_MemImm();
    void T_Comp_MemRegHalf();
    void T_Comp_MemImmHalf();
    void T_Comp_LoadPCRel();
    void T_Comp_MemSPRel();

    void T_Comp_LDMIA_STMIA();
    void T_Comp_PUSH_POP();

    void T_Comp_BCOND();
    void T_Comp_B();
    void T_Comp_BranchXchangeReg();
    void T_Comp_BL_LONG_1();
    void T_Comp_BL_LONG_2();
    void T_Comp_BL_Merged();

    s32 Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode);

    void Comp_Mul_Mla(bool S, bool mla, Arm64Gen::ARM64Reg rd, Arm64Gen::ARM64Reg rm, Arm64Gen::ARM64Reg rs, Arm64Gen::ARM64Reg rn);

    void Comp_Compare(int op, Arm64Gen::ARM64Reg rn, Op2 op2);
    void Comp_Logical(int op, bool S, Arm64Gen::ARM64Reg rd, Arm64Gen::ARM64Reg rn, Op2 op2);
    void Comp_Arithmetic(int op, bool S, Arm64Gen::ARM64Reg rd, Arm64Gen::ARM64Reg rn, Op2 op2);

    void Comp_RetriveFlags(bool retriveCV);

    Arm64Gen::FixupBranch CheckCondition(u32 cond);

    void Comp_JumpTo(Arm64Gen::ARM64Reg addr, bool switchThumb, bool restoreCPSR = false);
    void Comp_JumpTo(u32 addr, bool forceNonConstantCycles = false);

    void A_Comp_GetOp2(bool S, Op2& op2);

    void Comp_RegShiftImm(int op, int amount, bool S, Op2& op2, Arm64Gen::ARM64Reg tmp = Arm64Gen::W0);
    void Comp_RegShiftReg(int op, bool S, Op2& op2, Arm64Gen::ARM64Reg rs);

    void Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr);
    enum
    {
        memop_Writeback = 1 << 0,
        memop_Post = 1 << 1,
        memop_SignExtend = 1 << 2,
        memop_Store = 1 << 3,
        memop_SubtractOffset = 1 << 4
    };
    void Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags);

    void* Gen_MemoryRoutine9(int size, bool store);

    void* Gen_MemoryRoutine9Seq(bool store, bool preinc);
    void* Gen_MemoryRoutine7Seq(bool store, bool preinc);

    // 0 = switch mode, 1 = stay arm, 2 = stay thumb
    void* Gen_JumpTo9(int kind);
    void* Gen_JumpTo7(int kind);

    void Comp_BranchSpecialBehaviour();

    bool Exit;

    FetchedInstr CurInstr;
    bool Thumb;
    u32 R15;
    u32 Num;
    ARM* CurCPU;
    u32 ConstantCycles;
    u32 CodeRegion;

    BitSet32 SavedRegs;

    u32 JitMemUseableSize;

    // banked-register accessor routines generated in the constructor
    void* ReadBanked, *WriteBanked;

    // [size][store]
    void* MemFunc9[3][2];
    void* MemFunc7[3][2];

    // [store][pre increment]
    void* MemFuncsSeq9[2][2];
    // [store][pre increment] (ARM7 counterpart)
    void* MemFuncsSeq7[2][2];

    void* JumpToFuncs9[3];
    void* JumpToFuncs7[3];

    RegisterCache<Compiler, Arm64Gen::ARM64Reg> RegCache;

    bool CPSRDirty = false;

    // set when the executed path of a conditional instruction costs
    // different cycles than the skipped path
    bool IrregularCycles = false;

#ifdef __SWITCH__
    void* JitRWBase;
    void* JitRWStart;
    void* JitRXStart;
#endif
};
+
+}
+
+#endif \ No newline at end of file
diff --git a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp
new file mode 100644
index 0000000..a5d0e3f
--- /dev/null
+++ b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp
@@ -0,0 +1,848 @@
+#include "ARMJIT_Compiler.h"
+
+#include "../Config.h"
+
+using namespace Arm64Gen;
+
+namespace ARMJIT
+{
+
+// W0 - address
+// (if store) W1 - value to store
+// W2 - code cycles
// W0 - address
// (if store) W1 - value to store
// W2 - code cycles
// Generates the ARM9 single-access memory routine for the given size and
// direction, fast-pathing DTCM, main RAM and ITCM before falling back to
// the generic bus functions.
void* Compiler::Gen_MemoryRoutine9(int size, bool store)
{
    AlignCode16();
    void* res = GetRXPtr();

    u32 addressMask;
    switch (size)
    {
    case 32: addressMask = ~3; break;
    case 16: addressMask = ~1; break;
    case 8: addressMask = ~0; break;
    }

    // DTCM check: (addr - DTCMBase) < DTCMSize
    LDR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARMv5, DTCMBase));
    LDR(INDEX_UNSIGNED, W4, RCPU, offsetof(ARMv5, DTCMSize));
    SUB(W3, W0, W3);
    CMP(W3, W4);
    FixupBranch insideDTCM = B(CC_LO);

    // main RAM fast path (region 0x02)
    UBFX(W4, W0, 24, 8);
    CMP(W4, 0x02);
    FixupBranch outsideMainRAM = B(CC_NEQ);
    ANDI2R(W3, W0, addressMask & (MAIN_RAM_SIZE - 1));
    MOVP2R(X4, NDS::MainRAM);
    if (!store && size == 32)
    {
        // unaligned 32-bit loads rotate the value by the misalignment
        LDR(W3, X3, X4);
        ANDI2R(W0, W0, 3);
        LSL(W0, W0, 3);
        RORV(W0, W3, W0);
    }
    else if (store)
        STRGeneric(size, W1, X3, X4);
    else
        LDRGeneric(size, false, W0, X3, X4);
    RET();

    SetJumpTarget(outsideMainRAM);

    // ITCM check: addr < ITCMSize
    LDR(INDEX_UNSIGNED, W3, RCPU, offsetof(ARMv5, ITCMSize));
    CMP(W0, W3);
    FixupBranch insideITCM = B(CC_LO);

    // generic bus access
    if (store)
    {
        if (size > 8)
            ANDI2R(W0, W0, addressMask);

        switch (size)
        {
        case 32: QuickTailCall(X4, NDS::ARM9Write32); break;
        case 16: QuickTailCall(X4, NDS::ARM9Write16); break;
        case 8: QuickTailCall(X4, NDS::ARM9Write8); break;
        }
    }
    else
    {
        // 32-bit loads can't tail call: the misalignment rotation has to
        // happen after the call, so the address is saved across it
        if (size == 32)
            ABI_PushRegisters({0, 30});
        if (size > 8)
            ANDI2R(W0, W0, addressMask);

        switch (size)
        {
        case 32: QuickCallFunction(X4, NDS::ARM9Read32); break;
        case 16: QuickTailCall  (X4, NDS::ARM9Read16); break;
        case 8:  QuickTailCall  (X4, NDS::ARM9Read8 ); break;
        }
        if (size == 32)
        {
            // pops the saved (unmasked) address into W1
            ABI_PopRegisters({1, 30});
            ANDI2R(W1, W1, 3);
            LSL(W1, W1, 3);
            RORV(W0, W0, W1);
            RET();
        }
    }

    SetJumpTarget(insideDTCM);
    ANDI2R(W3, W3, 0x3FFF & addressMask);
    ADDI2R(W3, W3, offsetof(ARMv5, DTCM), W4);
    if (!store && size == 32)
    {
        ANDI2R(W4, W0, 3);
        LDR(W0, RCPU, W3);
        LSL(W4, W4, 3);
        RORV(W0, W0, W4);
    }
    else if (store)
        STRGeneric(size, W1, RCPU, W3);
    else
        LDRGeneric(size, false, W0, RCPU, W3);

    RET();

    SetJumpTarget(insideITCM);
    ANDI2R(W3, W0, 0x7FFF & addressMask);
    if (store)
    {
        // ITCM is executable: invalidate any JIT blocks in the written range
        LSR(W0, W3, 8);
        ADDI2R(W0, W0, ExeMemRegionOffsets[exeMem_ITCM], W4);
        MOVP2R(X4, CodeRanges);
        ADD(X4, X4, X0, ArithOption(X0, ST_LSL, 4));
        static_assert(sizeof(AddressRange) == 16);
        LDR(INDEX_UNSIGNED, W4, X4, offsetof(AddressRange, Blocks.Length));
        FixupBranch null = CBZ(W4);
        ABI_PushRegisters({1, 3, 30});
        QuickCallFunction(X4, InvalidateByAddr);
        ABI_PopRegisters({1, 3, 30});
        SetJumpTarget(null);
    }
    ADDI2R(W3, W3, offsetof(ARMv5, ITCM), W4);
    if (!store && size == 32)
    {
        ANDI2R(W4, W0, 3);
        LDR(W0, RCPU, W3);
        LSL(W4, W4, 3);
        RORV(W0, W0, W4);
    }
    else if (store)
        STRGeneric(size, W1, RCPU, W3);
    else
        LDRGeneric(size, false, W0, RCPU, W3);
    RET();

    return res;
}
+
/*
    W0 - base address
    X1 - stack space
    W2 - values count
*/
// Generates the ARM9 sequential (LDM/STM) memory routine: loops over
// `count` words, exchanging each with a stack slot, with DTCM/ITCM fast
// paths per word.
void* Compiler::Gen_MemoryRoutine9Seq(bool store, bool preinc)
{
    AlignCode16();
    void* res = GetRXPtr();

    void* loopStart = GetRXPtr();
    SUB(W2, W2, 1);

    if (preinc)
        ADD(W0, W0, 4);

    // DTCM check: (addr - DTCMBase) < DTCMSize
    LDR(INDEX_UNSIGNED, W4, RCPU, offsetof(ARMv5, DTCMBase));
    LDR(INDEX_UNSIGNED, W5, RCPU, offsetof(ARMv5, DTCMSize));
    SUB(W4, W0, W4);
    CMP(W4, W5);
    FixupBranch insideDTCM = B(CC_LO);

    // ITCM check: addr < ITCMSize
    LDR(INDEX_UNSIGNED, W4, RCPU, offsetof(ARMv5, ITCMSize));
    CMP(W0, W4);
    FixupBranch insideITCM = B(CC_LO);

    // generic bus access, one call per word
    ABI_PushRegisters({0, 1, 2, 30}); // TODO: move SP only once
    if (store)
    {
        LDR(X1, X1, ArithOption(X2, true));
        QuickCallFunction(X4, NDS::ARM9Write32);

        ABI_PopRegisters({0, 1, 2, 30});
    }
    else
    {
        QuickCallFunction(X4, NDS::ARM9Read32);
        MOV(W4, W0);

        ABI_PopRegisters({0, 1, 2, 30});

        STR(X4, X1, ArithOption(X2, true));
    }

    if (!preinc)
        ADD(W0, W0, 4);
    CBNZ(W2, loopStart);
    RET();

    SetJumpTarget(insideDTCM);

    ANDI2R(W4, W4, ~3 & 0x3FFF);
    ADDI2R(X4, X4, offsetof(ARMv5, DTCM));
    if (store)
    {
        LDR(X5, X1, ArithOption(X2, true));
        STR(W5, RCPU, X4);
    }
    else
    {
        LDR(W5, RCPU, X4);
        STR(X5, X1, ArithOption(X2, true));
    }

    if (!preinc)
        ADD(W0, W0, 4);
    CBNZ(W2, loopStart);
    RET();

    SetJumpTarget(insideITCM);

    ANDI2R(W4, W0, ~3 & 0x7FFF);

    if (store)
    {
        // ITCM is executable: invalidate any JIT blocks at this address
        LSR(W6, W4, 8);
        ADDI2R(W6, W6, ExeMemRegionOffsets[exeMem_ITCM], W5);
        MOVP2R(X5, CodeRanges);
        ADD(X5, X5, X6, ArithOption(X6, ST_LSL, 4));
        static_assert(sizeof(AddressRange) == 16);
        LDR(INDEX_UNSIGNED, W5, X5, offsetof(AddressRange, Blocks.Length));
        FixupBranch null = CBZ(W5);
        ABI_PushRegisters({0, 1, 2, 4, 30});
        MOV(W0, W6);
        QuickCallFunction(X5, InvalidateByAddr);
        ABI_PopRegisters({0, 1, 2, 4, 30});
        SetJumpTarget(null);
    }

    ADDI2R(W4, W4, offsetof(ARMv5, ITCM), W5);
    if (store)
    {
        LDR(X5, X1, ArithOption(X2, true));
        STR(W5, RCPU, X4);
    }
    else
    {
        LDR(W5, RCPU, X4);
        STR(X5, X1, ArithOption(X2, true));
    }

    if (!preinc)
        ADD(W0, W0, 4);
    CBNZ(W2, loopStart);
    RET();
    return res;
}
+
// ARM7 counterpart of Gen_MemoryRoutine9Seq (same register interface);
// the ARM7 has no TCM, so every word goes through the generic bus
// functions.
void* Compiler::Gen_MemoryRoutine7Seq(bool store, bool preinc)
{
    AlignCode16();
    void* res = GetRXPtr();

    void* loopStart = GetRXPtr();
    SUB(W2, W2, 1);

    if (preinc)
        ADD(W0, W0, 4);

    ABI_PushRegisters({0, 1, 2, 30});
    if (store)
    {
        LDR(X1, X1, ArithOption(X2, true));
        QuickCallFunction(X4, NDS::ARM7Write32);
        ABI_PopRegisters({0, 1, 2, 30});
    }
    else
    {
        QuickCallFunction(X4, NDS::ARM7Read32);
        MOV(W4, W0);
        ABI_PopRegisters({0, 1, 2, 30});
        STR(X4, X1, ArithOption(X2, true));
    }

    if (!preinc)
        ADD(W0, W0, 4);
    CBNZ(W2, loopStart);
    RET();

    return res;
}
+
// Performs a PC-relative load at compile time (the address is constant)
// and bakes the result into the generated code as an immediate.
void Compiler::Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr)
{
    u32 val;
    // make sure arm7 bios is accessible
    u32 tmpR15 = CurCPU->R[15];
    CurCPU->R[15] = R15;
    if (size == 32)
    {
        CurCPU->DataRead32(addr & ~0x3, &val);
        // unaligned 32-bit loads rotate the value by the misalignment
        val = ROR(val, (addr & 0x3) << 3);
    }
    else if (size == 16)
    {
        CurCPU->DataRead16(addr & ~0x1, &val);
        if (signExtend)
            val = ((s32)val << 16) >> 16;
    }
    else
    {
        CurCPU->DataRead8(addr, &val);
        if (signExtend)
            val = ((s32)val << 24) >> 24;
    }
    CurCPU->R[15] = tmpR15;

    MOVI2R(MapReg(rd), val);

    // only register the literal when the instruction is unconditional,
    // otherwise the register may not actually hold this value
    if (Thumb || CurInstr.Cond() == 0xE)
        RegCache.PutLiteral(rd, val);
}
+
+void Compiler::Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags)
+{
+ u32 addressMask = ~0;
+ if (size == 32)
+ addressMask = ~3;
+ if (size == 16)
+ addressMask = ~1;
+
+ if (flags & memop_Store)
+ Comp_AddCycles_CD();
+ else
+ Comp_AddCycles_CDI();
+
+ if (Config::JIT_LiteralOptimisations && rn == 15 && rd != 15 && offset.IsImm && !(flags & (memop_Post|memop_Store|memop_Writeback)))
+ {
+ u32 addr = R15 + offset.Imm * ((flags & memop_SubtractOffset) ? -1 : 1);
+ u32 translatedAddr = Num == 0 ? TranslateAddr<0>(addr) : TranslateAddr<1>(addr);
+
+ if (!(CodeRanges[translatedAddr / 512].InvalidLiterals & (1 << ((translatedAddr & 0x1FF) / 16))))
+ {
+ Comp_MemLoadLiteral(size, flags & memop_SignExtend, rd, addr);
+ return;
+ }
+ }
+
+ {
+ ARM64Reg rdMapped = MapReg(rd);
+ ARM64Reg rnMapped = MapReg(rn);
+
+ bool inlinePreparation = Num == 1;
+ u32 constLocalROR32 = 4;
+
+ void* memFunc = Num == 0
+ ? MemFunc9[size >> 4][!!(flags & memop_Store)]
+ : MemFunc7[size >> 4][!!((flags & memop_Store))];
+
+ if (Config::JIT_LiteralOptimisations && (rd != 15 || (flags & memop_Store)) && offset.IsImm && RegCache.IsLiteral(rn))
+ {
+ u32 addr = RegCache.LiteralValues[rn] + offset.Imm * ((flags & memop_SubtractOffset) ? -1 : 1);
+
+ NDS::MemRegion region;
+ region.Mem = NULL;
+ if (Num == 0)
+ {
+ ARMv5* cpu5 = (ARMv5*)CurCPU;
+
+ // stupid dtcm...
+ if (addr >= cpu5->DTCMBase && addr < (cpu5->DTCMBase + cpu5->DTCMSize))
+ {
+ region.Mem = cpu5->DTCM;
+ region.Mask = 0x3FFF;
+ }
+ else
+ {
+ NDS::ARM9GetMemRegion(addr, flags & memop_Store, &region);
+ }
+ }
+ else
+ NDS::ARM7GetMemRegion(addr, flags & memop_Store, &region);
+
+ if (region.Mem != NULL)
+ {
+ void* ptr = &region.Mem[addr & addressMask & region.Mask];
+
+ MOVP2R(X0, ptr);
+ if (flags & memop_Store)
+ STRGeneric(size, INDEX_UNSIGNED, rdMapped, X0, 0);
+ else
+ {
+ LDRGeneric(size, flags & memop_SignExtend, INDEX_UNSIGNED, rdMapped, X0, 0);
+ if (size == 32 && addr & ~0x3)
+ ROR_(rdMapped, rdMapped, (addr & 0x3) << 3);
+ }
+ return;
+ }
+
+ void* specialFunc = GetFuncForAddr(CurCPU, addr, flags & memop_Store, size);
+ if (specialFunc)
+ {
+ memFunc = specialFunc;
+ inlinePreparation = true;
+ constLocalROR32 = addr & 0x3;
+ }
+ }
+
+ ARM64Reg finalAddr = W0;
+ if (flags & memop_Post)
+ {
+ finalAddr = rnMapped;
+ MOV(W0, rnMapped);
+ }
+
+ if (flags & memop_Store)
+ MOV(W1, rdMapped);
+
+ if (!offset.IsImm)
+ Comp_RegShiftImm(offset.Reg.ShiftType, offset.Reg.ShiftAmount, false, offset, W2);
+ // offset might become an immediate
+ if (offset.IsImm)
+ {
+ if (flags & memop_SubtractOffset)
+ SUB(finalAddr, rnMapped, offset.Imm);
+ else
+ ADD(finalAddr, rnMapped, offset.Imm);
+ }
+ else
+ {
+ if (offset.Reg.ShiftType == ST_ROR)
+ {
+ ROR_(W0, offset.Reg.Rm, offset.Reg.ShiftAmount);
+ offset = Op2(W0);
+ }
+
+ if (flags & memop_SubtractOffset)
+ SUB(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption());
+ else
+ ADD(finalAddr, rnMapped, offset.Reg.Rm, offset.ToArithOption());
+ }
+
+ if (!(flags & memop_Post) && (flags & memop_Writeback))
+ MOV(rnMapped, W0);
+
+ if (inlinePreparation)
+ {
+ if (size == 32 && !(flags & memop_Store) && constLocalROR32 == 4)
+ ANDI2R(rdMapped, W0, 3);
+ if (size > 8)
+ ANDI2R(W0, W0, addressMask);
+ }
+ QuickCallFunction(X2, memFunc);
+ if (!(flags & memop_Store))
+ {
+ if (inlinePreparation && !(flags & memop_Store) && size == 32)
+ {
+ if (constLocalROR32 == 4)
+ {
+ LSL(rdMapped, rdMapped, 3);
+ RORV(rdMapped, W0, rdMapped);
+ }
+ else if (constLocalROR32 > 0)
+ ROR_(rdMapped, W0, constLocalROR32 << 3);
+ else
+ MOV(rdMapped, W0);
+ }
+ else if (flags & memop_SignExtend)
+ {
+ if (size == 16)
+ SXTH(rdMapped, W0);
+ else if (size == 8)
+ SXTB(rdMapped, W0);
+ else
+ assert("What's wrong with you?");
+ }
+ else
+ MOV(rdMapped, W0);
+
+ if (CurInstr.Info.Branches())
+ {
+ if (size < 32)
+ printf("LDR size < 32 branching?\n");
+ Comp_JumpTo(rdMapped, Num == 0, false);
+ }
+ }
+ }
+}
+
+void Compiler::A_Comp_MemWB()
+{
+ Op2 offset;
+ if (CurInstr.Instr & (1 << 25))
+ offset = Op2(MapReg(CurInstr.A_Reg(0)), (ShiftType)((CurInstr.Instr >> 5) & 0x3), (CurInstr.Instr >> 7) & 0x1F);
+ else
+ offset = Op2(CurInstr.Instr & 0xFFF);
+
+ bool load = CurInstr.Instr & (1 << 20);
+ bool byte = CurInstr.Instr & (1 << 22);
+
+ int flags = 0;
+ if (!load)
+ flags |= memop_Store;
+ if (!(CurInstr.Instr & (1 << 24)))
+ flags |= memop_Post;
+ if (CurInstr.Instr & (1 << 21))
+ flags |= memop_Writeback;
+ if (!(CurInstr.Instr & (1 << 23)))
+ flags |= memop_SubtractOffset;
+
+ Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, byte ? 8 : 32, flags);
+}
+
+void Compiler::A_Comp_MemHD()
+{
+ bool load = CurInstr.Instr & (1 << 20);
+ bool signExtend;
+ int op = (CurInstr.Instr >> 5) & 0x3;
+ int size;
+
+ if (load)
+ {
+ signExtend = op >= 2;
+ size = op == 2 ? 8 : 16;
+ }
+ else
+ {
+ size = 16;
+ signExtend = false;
+ }
+
+ Op2 offset;
+ if (CurInstr.Instr & (1 << 22))
+ offset = Op2((CurInstr.Instr & 0xF) | ((CurInstr.Instr >> 4) & 0xF0));
+ else
+ offset = Op2(MapReg(CurInstr.A_Reg(0)));
+
+ int flags = 0;
+ if (signExtend)
+ flags |= memop_SignExtend;
+ if (!load)
+ flags |= memop_Store;
+ if (!(CurInstr.Instr & (1 << 24)))
+ flags |= memop_Post;
+ if (!(CurInstr.Instr & (1 << 23)))
+ flags |= memop_SubtractOffset;
+ if (CurInstr.Instr & (1 << 21))
+ flags |= memop_Writeback;
+
+ Comp_MemAccess(CurInstr.A_Reg(12), CurInstr.A_Reg(16), offset, size, flags);
+}
+
+void Compiler::T_Comp_MemReg()
+{
+ int op = (CurInstr.Instr >> 10) & 0x3;
+ bool load = op & 0x2;
+ bool byte = op & 0x1;
+
+ Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3),
+ Op2(MapReg(CurInstr.T_Reg(6))), byte ? 8 : 32, load ? 0 : memop_Store);
+}
+
+void Compiler::T_Comp_MemImm()
+{
+ int op = (CurInstr.Instr >> 11) & 0x3;
+ bool load = op & 0x1;
+ bool byte = op & 0x2;
+ u32 offset = ((CurInstr.Instr >> 6) & 0x1F) * (byte ? 1 : 4);
+
+ Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), Op2(offset),
+ byte ? 8 : 32, load ? 0 : memop_Store);
+}
+
+void Compiler::T_Comp_MemRegHalf()
+{
+ int op = (CurInstr.Instr >> 10) & 0x3;
+ bool load = op != 0;
+ int size = op != 1 ? 16 : 8;
+ bool signExtend = op & 1;
+
+ int flags = 0;
+ if (signExtend)
+ flags |= memop_SignExtend;
+ if (!load)
+ flags |= memop_Store;
+
+ Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), Op2(MapReg(CurInstr.T_Reg(6))),
+ size, flags);
+}
+
+void Compiler::T_Comp_MemImmHalf()
+{
+ u32 offset = (CurInstr.Instr >> 5) & 0x3E;
+ bool load = CurInstr.Instr & (1 << 11);
+
+ Comp_MemAccess(CurInstr.T_Reg(0), CurInstr.T_Reg(3), Op2(offset), 16,
+ load ? 0 : memop_Store);
+}
+
+void Compiler::T_Comp_LoadPCRel()
+{
+ u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2);
+
+ if (Config::JIT_LiteralOptimisations)
+ {
+ Comp_MemLoadLiteral(32, false, CurInstr.T_Reg(8), addr);
+ Comp_AddCycles_CDI();
+ }
+ else
+ {
+ bool negative = addr < R15;
+ u32 abs = negative ? R15 - addr : addr - R15;
+ Comp_MemAccess(CurInstr.T_Reg(8), 15, Op2(abs), 32, negative ? memop_SubtractOffset : 0);
+ }
+}
+
+void Compiler::T_Comp_MemSPRel()
+{
+ u32 offset = (CurInstr.Instr & 0xFF) * 4;
+ bool load = CurInstr.Instr & (1 << 11);
+
+ Comp_MemAccess(CurInstr.T_Reg(8), 13, Op2(offset), 32, load ? 0 : memop_Store);
+}
+
// Emits an LDM/STM-style block transfer of `regs` relative to register `rn`.
// Values are staged through a scratch area on the host stack, then moved
// in bulk by the MemFuncsSeq* helpers.
//   store     - store (STM) instead of load (LDM)
//   preinc    - pre-increment addressing
//   decrement - descending address order (offset becomes negative)
//   usermode  - access the user-mode register bank (S bit)
// Returns the total byte offset the base advances by (negative if
// decrementing); the caller applies writeback.
s32 Compiler::Comp_MemAccessBlock(int rn, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode)
{
    IrregularCycles = true;

    int regsCount = regs.Count();

    if (regsCount == 0)
        return 0; // actually not the right behaviour TODO: fix me

    // scratch area on the host stack, kept 16-byte aligned
    SUB(SP, SP, ((regsCount + 1) & ~1) * 8);
    if (store)
    {
        Comp_AddCycles_CD();

        // current mode bits, needed by ReadBanked for user-mode accesses
        if (usermode && (regs & BitSet16(0x7f00)))
            UBFX(W0, RCPSR, 0, 5);

        // fill the scratch area from the highest slot downwards
        int i = regsCount - 1;

        BitSet16::Iterator it = regs.begin();
        while (it != regs.end())
        {
            BitSet16::Iterator nextReg = it;
            nextReg++;

            int reg = *it;

            if (usermode && reg >= 8 && reg < 15)
            {
                // registers r8-r14 may be banked; fetch via helper
                if (RegCache.Mapping[reg] != INVALID_REG)
                    MOV(W3, MapReg(reg));
                else
                    LoadReg(reg, W3);
                MOVI2R(W1, reg - 8);
                BL(ReadBanked);
                STR(INDEX_UNSIGNED, W3, SP, i * 8);
            }
            else if (!usermode && nextReg != regs.end())
            {
                // store two registers at once with STP
                ARM64Reg first = W3;
                ARM64Reg second = W4;

                if (RegCache.Mapping[reg] != INVALID_REG)
                    first = MapReg(reg);
                else
                    LoadReg(reg, W3);

                if (RegCache.Mapping[*nextReg] != INVALID_REG)
                    second = MapReg(*nextReg);
                else
                    LoadReg(*nextReg, W4);

                STP(INDEX_SIGNED, EncodeRegTo64(second), EncodeRegTo64(first), SP, i * 8 - 8);

                i--;
                it++;
            }
            else if (RegCache.Mapping[reg] != INVALID_REG)
                STR(INDEX_UNSIGNED, MapReg(reg), SP, i * 8);
            else
            {
                LoadReg(reg, W3);
                STR(INDEX_UNSIGNED, W3, SP, i * 8);
            }
            i--;
            it++;
        }
    }
    // W0 = lowest address of the transfer
    if (decrement)
    {
        SUB(W0, MapReg(rn), regsCount * 4);
        preinc ^= true;
    }
    else
        MOV(W0, MapReg(rn));
    ADD(X1, SP, 0);       // X1 = scratch area
    MOVI2R(W2, regsCount);

    BL(Num ? MemFuncsSeq7[store][preinc] : MemFuncsSeq9[store][preinc]);

    if (!store)
    {
        Comp_AddCycles_CDI();

        // current mode bits, needed by WriteBanked for user-mode accesses
        if (usermode && (regs & BitSet16(0x7f00)))
            UBFX(W0, RCPSR, 0, 5);

        // drain the scratch area back into the guest registers
        int i = regsCount - 1;
        BitSet16::Iterator it = regs.begin();
        while (it != regs.end())
        {
            BitSet16::Iterator nextReg = it;
            nextReg++;

            int reg = *it;

            if (usermode && reg >= 8 && reg < 15)
            {
                LDR(INDEX_UNSIGNED, W3, SP, i * 8);
                MOVI2R(W1, reg - 8);
                BL(WriteBanked);
                // W4 flags whether the helper already wrote a banked copy
                FixupBranch alreadyWritten = CBNZ(W4);
                if (RegCache.Mapping[reg] != INVALID_REG)
                {
                    MOV(MapReg(reg), W3);
                    RegCache.DirtyRegs |= 1 << reg;
                }
                else
                    SaveReg(reg, W3);
                SetJumpTarget(alreadyWritten);
            }
            else if (!usermode && nextReg != regs.end())
            {
                // load two registers at once with LDP
                ARM64Reg first = W3, second = W4;

                if (RegCache.Mapping[reg] != INVALID_REG)
                {
                    first = MapReg(reg);
                    if (reg != 15)
                        RegCache.DirtyRegs |= 1 << reg;
                }
                if (RegCache.Mapping[*nextReg] != INVALID_REG)
                {
                    second = MapReg(*nextReg);
                    if (*nextReg != 15)
                        RegCache.DirtyRegs |= 1 << *nextReg;
                }

                LDP(INDEX_SIGNED, EncodeRegTo64(second), EncodeRegTo64(first), SP, i * 8 - 8);

                if (first == W3)
                    SaveReg(reg, W3);
                if (second == W4)
                    SaveReg(*nextReg, W4);

                it++;
                i--;
            }
            else if (RegCache.Mapping[reg] != INVALID_REG)
            {
                ARM64Reg mapped = MapReg(reg);
                LDR(INDEX_UNSIGNED, mapped, SP, i * 8);

                if (reg != 15)
                    RegCache.DirtyRegs |= 1 << reg;
            }
            else
            {
                LDR(INDEX_UNSIGNED, W3, SP, i * 8);
                SaveReg(reg, W3);
            }

            it++;
            i--;
        }
    }
    ADD(SP, SP, ((regsCount + 1) & ~1) * 8);

    // loading r15 means a jump
    if (!store && regs[15])
    {
        ARM64Reg mapped = MapReg(15);
        Comp_JumpTo(mapped, Num == 0, usermode);
    }

    return regsCount * 4 * (decrement ? -1 : 1);
}
+
// ARM block transfer (LDM/STM), including the user-mode (S bit) forms.
void Compiler::A_Comp_LDM_STM()
{
    BitSet16 regs(CurInstr.Instr & 0xFFFF);

    bool load = CurInstr.Instr & (1 << 20);
    bool pre = CurInstr.Instr & (1 << 24);
    bool add = CurInstr.Instr & (1 << 23);
    bool writeback = CurInstr.Instr & (1 << 21);
    bool usermode = CurInstr.Instr & (1 << 22);

    ARM64Reg rn = MapReg(CurInstr.A_Reg(16));

    s32 offset = Comp_MemAccessBlock(CurInstr.A_Reg(16), regs, !load, pre, !add, usermode);

    // LDM that also loads the base: on the ARM9, writeback only happens when
    // the base is the sole register or a higher register follows it in the
    // list; the ARM7 suppresses it entirely (matches the interpreter --
    // NOTE(review): verify against the ARMv4/v5 LDM writeback rules)
    if (load && writeback && regs[CurInstr.A_Reg(16)])
        writeback = Num == 0
            ? (!(regs & ~BitSet16(1 << CurInstr.A_Reg(16)))) || (regs & ~BitSet16((2 << CurInstr.A_Reg(16)) - 1))
            : false;
    if (writeback)
    {
        if (offset > 0)
            ADD(rn, rn, offset);
        else
            SUB(rn, rn, -offset);
    }
}
+
+void Compiler::T_Comp_PUSH_POP()
+{
+ bool load = CurInstr.Instr & (1 << 11);
+ BitSet16 regs(CurInstr.Instr & 0xFF);
+ if (CurInstr.Instr & (1 << 8))
+ {
+ if (load)
+ regs[15] = true;
+ else
+ regs[14] = true;
+ }
+
+ ARM64Reg sp = MapReg(13);
+ s32 offset = Comp_MemAccessBlock(13, regs, !load, !load, !load, false);
+
+ if (offset > 0)
+ ADD(sp, sp, offset);
+ else
+ SUB(sp, sp, -offset);
+}
+
+void Compiler::T_Comp_LDMIA_STMIA()
+{
+ BitSet16 regs(CurInstr.Instr & 0xFF);
+ ARM64Reg rb = MapReg(CurInstr.T_Reg(8));
+ bool load = CurInstr.Instr & (1 << 11);
+ u32 regsCount = regs.Count();
+
+ s32 offset = Comp_MemAccessBlock(CurInstr.T_Reg(8), regs, !load, false, false, false);
+
+ if (!load || !regs[CurInstr.T_Reg(8)])
+ {
+ if (offset > 0)
+ ADD(rb, rb, offset);
+ else
+ SUB(rb, rb, -offset);
+ }
+}
+
+} \ No newline at end of file
diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp
index 08e2f0a..b884773 100644
--- a/src/ARM_InstrInfo.cpp
+++ b/src/ARM_InstrInfo.cpp
@@ -2,6 +2,8 @@
#include <stdio.h>
+#include "Config.h"
+
namespace ARMInstrInfo
{
@@ -363,7 +365,11 @@ Info Decode(bool thumb, u32 num, u32 instr)
res.SpecialKind = special_WriteMem;
if (res.Kind == ARMInstrInfo::tk_LDR_PCREL)
+ {
+ if (!Config::JIT_LiteralOptimisations)
+ res.SrcRegs |= 1 << 15;
res.SpecialKind = special_LoadLiteral;
+ }
if (res.Kind == tk_LDMIA || res.Kind == tk_POP)
{
@@ -417,7 +423,6 @@ Info Decode(bool thumb, u32 num, u32 instr)
u32 cp = ((instr >> 8) & 0xF);
if ((num == 0 && cp != 15) || (num == 1 && cp != 14))
{
- printf("happens\n");
data = A_UNK;
res.Kind = ak_UNK;
}
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 10428aa..8b81ce3 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -41,10 +41,31 @@ if (ENABLE_JIT)
ARMJIT_x64/ARMJIT_Branch.cpp
dolphin/CommonFuncs.cpp
- dolphin/x64ABI.cpp
- dolphin/x64CPUDetect.cpp
- dolphin/x64Emitter.cpp
)
+
+ if (ARCHITECTURE STREQUAL x86_64)
+ target_sources(core PRIVATE
+ dolphin/x64ABI.cpp
+ dolphin/x64CPUDetect.cpp
+ dolphin/x64Emitter.cpp
+
+ ARMJIT_x64/ARMJIT_Compiler.cpp
+ ARMJIT_x64/ARMJIT_ALU.cpp
+ ARMJIT_x64/ARMJIT_LoadStore.cpp
+ ARMJIT_x64/ARMJIT_Branch.cpp
+ )
+ endif()
+ if (ARCHITECTURE STREQUAL ARM64)
+ target_sources(core PRIVATE
+ dolphin/Arm64Emitter.cpp
+ dolphin/MathUtil.cpp
+
+ ARMJIT_A64/ARMJIT_Compiler.cpp
+ ARMJIT_A64/ARMJIT_ALU.cpp
+ ARMJIT_A64/ARMJIT_LoadStore.cpp
+ ARMJIT_A64/ARMJIT_Branch.cpp
+ )
+ endif()
endif()
if (WIN32)
diff --git a/src/dolphin/Align.h b/src/dolphin/Align.h
new file mode 100644
index 0000000..40c4576
--- /dev/null
+++ b/src/dolphin/Align.h
@@ -0,0 +1,24 @@
+// This file is under the public domain.
+
+#pragma once
+
+#include <cstddef>
+#include <type_traits>
+
+namespace Common
+{
// Rounds `value` up to the next multiple of `size`.
// Values that are already aligned are returned unchanged.
template <typename T>
constexpr T AlignUp(T value, size_t size)
{
  static_assert(std::is_unsigned<T>(), "T must be an unsigned value.");
  const T remainder = static_cast<T>(value % size);
  return remainder == 0 ? value : static_cast<T>(value + (size - remainder));
}
+
// Rounds `value` down to the previous multiple of `size`.
template <typename T>
constexpr T AlignDown(T value, size_t size)
{
  static_assert(std::is_unsigned<T>(), "T must be an unsigned value.");
  return static_cast<T>((value / size) * size);
}
+
+} // namespace Common
diff --git a/src/dolphin/Arm64Emitter.cpp b/src/dolphin/Arm64Emitter.cpp
new file mode 100644
index 0000000..dbcf425
--- /dev/null
+++ b/src/dolphin/Arm64Emitter.cpp
@@ -0,0 +1,4466 @@
+// Copyright 2015 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <array>
+#include <cinttypes>
+#include <cstring>
+#include <vector>
+
+#include "Align.h"
+#include "Arm64Emitter.h"
+#include "Assert.h"
+#include "BitUtils.h"
+#include "../types.h"
+#include "MathUtil.h"
+
+namespace Arm64Gen
+{
+namespace
+{
+const int kWRegSizeInBits = 32;
+const int kXRegSizeInBits = 64;
+
+// The below few functions are taken from V8.
// Counts leading zero bits of `value` treated as a `width`-bit quantity.
// Returns `width` when the value is zero.
int CountLeadingZeros(uint64_t value, int width)
{
  // Walk from the MSB of the width-bit value down to the first set bit.
  int zeros = 0;
  for (uint64_t probe = 1ULL << (width - 1); probe != 0 && (value & probe) == 0; probe >>= 1)
    zeros++;
  return zeros;
}
+
// Isolates the lowest set bit of `value` (0 for an input of 0):
// the classic two's-complement trick x & -x, spelled out explicitly.
uint64_t LargestPowerOf2Divisor(uint64_t value)
{
  const uint64_t negated = ~value + 1;
  return value & negated;
}
+
+// For ADD/SUB
+bool IsImmArithmetic(uint64_t input, u32* val, bool* shift)
+{
+ if (input < 4096)
+ {
+ *val = input;
+ *shift = false;
+ return true;
+ }
+ else if ((input & 0xFFF000) == input)
+ {
+ *val = input >> 12;
+ *shift = true;
+ return true;
+ }
+ return false;
+}
+
+// For AND/TST/ORR/EOR etc
// Returns true if `value` is encodable as an AArch64 bitmask immediate for a
// register of `width` bits (32 or 64). On success the three encoding fields
// are written to *n, *imm_s and *imm_r; on failure the outputs are untouched.
bool IsImmLogical(uint64_t value, unsigned int width, unsigned int* n, unsigned int* imm_s,
                  unsigned int* imm_r)
{
  // DCHECK((n != NULL) && (imm_s != NULL) && (imm_r != NULL));
  // DCHECK((width == kWRegSizeInBits) || (width == kXRegSizeInBits));

  bool negate = false;

  // Logical immediates are encoded using parameters n, imm_s and imm_r using
  // the following table:
  //
  //    N   imms    immr    size        S             R
  //    1  ssssss  rrrrrr    64    UInt(ssssss)  UInt(rrrrrr)
  //    0  0sssss  xrrrrr    32    UInt(sssss)   UInt(rrrrr)
  //    0  10ssss  xxrrrr    16    UInt(ssss)    UInt(rrrr)
  //    0  110sss  xxxrrr     8    UInt(sss)     UInt(rrr)
  //    0  1110ss  xxxxrr     4    UInt(ss)      UInt(rr)
  //    0  11110s  xxxxxr     2    UInt(s)       UInt(r)
  // (s bits must not be all set)
  //
  // A pattern is constructed of size bits, where the least significant S+1 bits
  // are set. The pattern is rotated right by R, and repeated across a 32 or
  // 64-bit value, depending on destination register width.
  //
  // Put another way: the basic format of a logical immediate is a single
  // contiguous stretch of 1 bits, repeated across the whole word at intervals
  // given by a power of 2. To identify them quickly, we first locate the
  // lowest stretch of 1 bits, then the next 1 bit above that; that combination
  // is different for every logical immediate, so it gives us all the
  // information we need to identify the only logical immediate that our input
  // could be, and then we simply check if that's the value we actually have.
  //
  // (The rotation parameter does give the possibility of the stretch of 1 bits
  // going 'round the end' of the word. To deal with that, we observe that in
  // any situation where that happens the bitwise NOT of the value is also a
  // valid logical immediate. So we simply invert the input whenever its low bit
  // is set, and then we know that the rotated case can't arise.)

  if (value & 1)
  {
    // If the low bit is 1, negate the value, and set a flag to remember that we
    // did (so that we can adjust the return values appropriately).
    negate = true;
    value = ~value;
  }

  if (width == kWRegSizeInBits)
  {
    // To handle 32-bit logical immediates, the very easiest thing is to repeat
    // the input value twice to make a 64-bit word. The correct encoding of that
    // as a logical immediate will also be the correct encoding of the 32-bit
    // value.

    // The most-significant 32 bits may not be zero (ie. negate is true) so
    // shift the value left before duplicating it.
    value <<= kWRegSizeInBits;
    value |= value >> kWRegSizeInBits;
  }

  // The basic analysis idea: imagine our input word looks like this.
  //
  //    0011111000111110001111100011111000111110001111100011111000111110
  //                                                          c  b    a
  //                                                          |<--d-->|
  //
  // We find the lowest set bit (as an actual power-of-2 value, not its index)
  // and call it a. Then we add a to our original number, which wipes out the
  // bottommost stretch of set bits and replaces it with a 1 carried into the
  // next zero bit. Then we look for the new lowest set bit, which is in
  // position b, and subtract it, so now our number is just like the original
  // but with the lowest stretch of set bits completely gone. Now we find the
  // lowest set bit again, which is position c in the diagram above. Then we'll
  // measure the distance d between bit positions a and c (using CLZ), and that
  // tells us that the only valid logical immediate that could possibly be equal
  // to this number is the one in which a stretch of bits running from a to just
  // below b is replicated every d bits.
  uint64_t a = LargestPowerOf2Divisor(value);
  uint64_t value_plus_a = value + a;
  uint64_t b = LargestPowerOf2Divisor(value_plus_a);
  uint64_t value_plus_a_minus_b = value_plus_a - b;
  uint64_t c = LargestPowerOf2Divisor(value_plus_a_minus_b);

  int d, clz_a, out_n;
  uint64_t mask;

  if (c != 0)
  {
    // The general case, in which there is more than one stretch of set bits.
    // Compute the repeat distance d, and set up a bitmask covering the basic
    // unit of repetition (i.e. a word with the bottom d bits set). Also, in all
    // of these cases the N bit of the output will be zero.
    clz_a = CountLeadingZeros(a, kXRegSizeInBits);
    int clz_c = CountLeadingZeros(c, kXRegSizeInBits);
    d = clz_a - clz_c;
    mask = ((UINT64_C(1) << d) - 1);
    out_n = 0;
  }
  else
  {
    // Handle degenerate cases.
    //
    // If any of those 'find lowest set bit' operations didn't find a set bit at
    // all, then the word will have been zero thereafter, so in particular the
    // last lowest_set_bit operation will have returned zero. So we can test for
    // all the special case conditions in one go by seeing if c is zero.
    if (a == 0)
    {
      // The input was zero (or all 1 bits, which will come to here too after we
      // inverted it at the start of the function), for which we just return
      // false.
      return false;
    }
    else
    {
      // Otherwise, if c was zero but a was not, then there's just one stretch
      // of set bits in our word, meaning that we have the trivial case of
      // d == 64 and only one 'repetition'. Set up all the same variables as in
      // the general case above, and set the N bit in the output.
      clz_a = CountLeadingZeros(a, kXRegSizeInBits);
      d = 64;
      mask = ~UINT64_C(0);
      out_n = 1;
    }
  }

  // If the repeat period d is not a power of two, it can't be encoded.
  if (!MathUtil::IsPow2<u64>(d))
    return false;

  // If the bit stretch (b - a) does not fit within the mask derived from the
  // repeat period, then fail.
  if (((b - a) & ~mask) != 0)
    return false;

  // The only possible option is b - a repeated every d bits. Now we're going to
  // actually construct the valid logical immediate derived from that
  // specification, and see if it equals our original input.
  //
  // To repeat a value every d bits, we multiply it by a number of the form
  // (1 + 2^d + 2^(2d) + ...), i.e. 0x0001000100010001 or similar. These can
  // be derived using a table lookup on CLZ(d).
  static const std::array<uint64_t, 6> multipliers = {{
      0x0000000000000001UL,
      0x0000000100000001UL,
      0x0001000100010001UL,
      0x0101010101010101UL,
      0x1111111111111111UL,
      0x5555555555555555UL,
  }};

  int multiplier_idx = CountLeadingZeros(d, kXRegSizeInBits) - 57;

  // Ensure that the index to the multipliers array is within bounds.
  DEBUG_ASSERT((multiplier_idx >= 0) && (static_cast<size_t>(multiplier_idx) < multipliers.size()));

  uint64_t multiplier = multipliers[multiplier_idx];
  uint64_t candidate = (b - a) * multiplier;

  // The candidate pattern doesn't match our input value, so fail.
  if (value != candidate)
    return false;

  // We have a match! This is a valid logical immediate, so now we have to
  // construct the bits and pieces of the instruction encoding that generates
  // it.

  // Count the set bits in our basic stretch. The special case of clz(0) == -1
  // makes the answer come out right for stretches that reach the very top of
  // the word (e.g. numbers like 0xffffc00000000000).
  int clz_b = (b == 0) ? -1 : CountLeadingZeros(b, kXRegSizeInBits);
  int s = clz_a - clz_b;

  // Decide how many bits to rotate right by, to put the low bit of that basic
  // stretch in position a.
  int r;
  if (negate)
  {
    // If we inverted the input right at the start of this function, here's
    // where we compensate: the number of set bits becomes the number of clear
    // bits, and the rotation count is based on position b rather than position
    // a (since b is the location of the 'lowest' 1 bit after inversion).
    s = d - s;
    r = (clz_b + 1) & (d - 1);
  }
  else
  {
    r = (clz_a + 1) & (d - 1);
  }

  // Now we're done, except for having to encode the S output in such a way that
  // it gives both the number of set bits and the length of the repeated
  // segment. The s field is encoded like this:
  //
  //     imms    size        S
  //    ssssss    64    UInt(ssssss)
  //    0sssss    32    UInt(sssss)
  //    10ssss    16    UInt(ssss)
  //    110sss     8    UInt(sss)
  //    1110ss     4    UInt(ss)
  //    11110s     2    UInt(s)
  //
  // So we 'or' (-d << 1) with our computed s to form imms.
  *n = out_n;
  *imm_s = ((-d << 1) | (s - 1)) & 0x3f;
  *imm_r = r;

  return true;
}
+
+float FPImm8ToFloat(u8 bits)
+{
+ const u32 sign = bits >> 7;
+ const u32 bit6 = (bits >> 6) & 1;
+ const u32 exp = ((!bit6) << 7) | (0x7C * bit6) | ((bits >> 4) & 3);
+ const u32 mantissa = (bits & 0xF) << 19;
+ const u32 f = (sign << 31) | (exp << 23) | mantissa;
+
+ return Common::BitCast<float>(f);
+}
+
+bool FPImm8FromFloat(float value, u8* imm_out)
+{
+ const u32 f = Common::BitCast<u32>(value);
+ const u32 mantissa4 = (f & 0x7FFFFF) >> 19;
+ const u32 exponent = (f >> 23) & 0xFF;
+ const u32 sign = f >> 31;
+
+ if ((exponent >> 7) == ((exponent >> 6) & 1))
+ return false;
+
+ const u8 imm8 = (sign << 7) | ((!(exponent >> 7)) << 6) | ((exponent & 3) << 4) | mantissa4;
+ const float new_float = FPImm8ToFloat(imm8);
+ if (new_float == value)
+ *imm_out = imm8;
+ else
+ return false;
+
+ return true;
+}
+} // Anonymous namespace
+
// Moves the write cursor without touching the icache-flush watermark.
void ARM64XEmitter::SetCodePtrUnsafe(ptrdiff_t ptr)
{
    m_code = ptr;
}

// Moves the write cursor and resets the icache-flush watermark to it.
void ARM64XEmitter::SetCodePtr(ptrdiff_t ptr)
{
    SetCodePtrUnsafe(ptr);
    m_lastCacheFlushEnd = ptr;
}

// Sets the base addresses of the code buffer. The emitter supports split
// mappings: rwbase is where code is written, rxbase where it is executed;
// m_code is an offset common to both.
void ARM64XEmitter::SetCodeBase(u8* rwbase, u8* rxbase)
{
    m_code = 0;
    m_lastCacheFlushEnd = 0;
    m_rwbase = rwbase;
    m_rxbase = rxbase;
}

// Current write offset into the code buffer.
ptrdiff_t ARM64XEmitter::GetCodeOffset()
{
    return m_code;
}

// Current position in the writable mapping.
const u8* ARM64XEmitter::GetRWPtr()
{
    return m_rwbase + m_code;
}

// Current position in the writable mapping (mutable variant).
u8* ARM64XEmitter::GetWriteableRWPtr()
{
    return m_rwbase + m_code;
}

// Current position in the executable mapping.
void* ARM64XEmitter::GetRXPtr()
{
    return m_rxbase + m_code;
}

// Fills `bytes` of the buffer with BRK instructions (4 bytes each).
void ARM64XEmitter::ReserveCodeSpace(u32 bytes)
{
    for (u32 i = 0; i < bytes / 4; i++)
        BRK(0);
}

// Pads with BRKs to the next 16-byte boundary; returns the aligned offset.
ptrdiff_t ARM64XEmitter::AlignCode16()
{
    int c = int((u64)m_code & 15);
    if (c)
        ReserveCodeSpace(16 - c);
    return m_code;
}

// Pads with BRKs to the next 4KiB page boundary; returns the aligned offset.
ptrdiff_t ARM64XEmitter::AlignCodePage()
{
    int c = int((u64)m_code & 4095);
    if (c)
        ReserveCodeSpace(4096 - c);
    return m_code;
}

// Appends one 32-bit instruction word and advances the cursor.
void ARM64XEmitter::Write32(u32 value)
{
    std::memcpy(m_rwbase + m_code, &value, sizeof(u32));
    m_code += sizeof(u32);
}

// Flushes the instruction cache for everything emitted since the last flush.
void ARM64XEmitter::FlushIcache()
{
    FlushIcacheSection(m_rxbase + m_lastCacheFlushEnd, m_rxbase + m_code);
    m_lastCacheFlushEnd = m_code;
}
+
// Makes freshly written code in [start, end) visible to instruction fetch:
// cleans the data cache, invalidates the instruction cache, then
// synchronises with barriers.
void ARM64XEmitter::FlushIcacheSection(u8* start, u8* end)
{
    if (start == end)
        return;

#if defined(IOS)
    // Header file says this is equivalent to: sys_icache_invalidate(start, end - start);
    sys_cache_control(kCacheFunctionPrepareForExecution, start, end - start);
#else
    // Don't rely on GCC's __clear_cache implementation, as it caches
    // icache/dcache cache line sizes, that can vary between cores on
    // big.LITTLE architectures.
    u64 addr, ctr_el0;
    static size_t icache_line_size = 0xffff, dcache_line_size = 0xffff;
    size_t isize, dsize;

    // CTR_EL0 reports the cache line sizes of the current core
    __asm__ volatile("mrs %0, ctr_el0" : "=r"(ctr_el0));
    isize = 4 << ((ctr_el0 >> 0) & 0xf);
    dsize = 4 << ((ctr_el0 >> 16) & 0xf);

    // use the global minimum cache line size
    icache_line_size = isize = icache_line_size < isize ? icache_line_size : isize;
    dcache_line_size = dsize = dcache_line_size < dsize ? dcache_line_size : dsize;

    addr = (u64)start & ~(u64)(dsize - 1);
    for (; addr < (u64)end; addr += dsize)
        // use "civac" instead of "cvau", as this is the suggested workaround for
        // Cortex-A53 errata 819472, 826319, 827319 and 824069.
        __asm__ volatile("dc civac, %0" : : "r"(addr) : "memory");
    __asm__ volatile("dsb ish" : : : "memory");

    addr = (u64)start & ~(u64)(isize - 1);
    for (; addr < (u64)end; addr += isize)
        __asm__ volatile("ic ivau, %0" : : "r"(addr) : "memory");

    __asm__ volatile("dsb ish" : : : "memory");
    __asm__ volatile("isb" : : : "memory");
#endif
}
+
// Encoding field tables used by the Encode* helpers below.

// Exception generation
static const u32 ExcEnc[][3] = {
    {0, 0, 1},  // SVC
    {0, 0, 2},  // HVC
    {0, 0, 3},  // SMC
    {1, 0, 0},  // BRK
    {2, 0, 0},  // HLT
    {5, 0, 1},  // DCPS1
    {5, 0, 2},  // DCPS2
    {5, 0, 3},  // DCPS3
};

// Arithmetic generation
static const u32 ArithEnc[] = {
    0x058,  // ADD
    0x258,  // SUB
};

// Conditional Select
static const u32 CondSelectEnc[][2] = {
    {0, 0},  // CSEL
    {0, 1},  // CSINC
    {1, 0},  // CSINV
    {1, 1},  // CSNEG
};

// Data-Processing (1 source)
static const u32 Data1SrcEnc[][2] = {
    {0, 0},  // RBIT
    {0, 1},  // REV16
    {0, 2},  // REV32
    {0, 3},  // REV64
    {0, 4},  // CLZ
    {0, 5},  // CLS
};

// Data-Processing (2 source)
static const u32 Data2SrcEnc[] = {
    0x02,  // UDIV
    0x03,  // SDIV
    0x08,  // LSLV
    0x09,  // LSRV
    0x0A,  // ASRV
    0x0B,  // RORV
    0x10,  // CRC32B
    0x11,  // CRC32H
    0x12,  // CRC32W
    0x14,  // CRC32CB
    0x15,  // CRC32CH
    0x16,  // CRC32CW
    0x13,  // CRC32X (64bit Only)
    0x17,  // CRC32CX (64bit Only)
};

// Data-Processing (3 source)
static const u32 Data3SrcEnc[][2] = {
    {0, 0},  // MADD
    {0, 1},  // MSUB
    {1, 0},  // SMADDL (64Bit Only)
    {1, 1},  // SMSUBL (64Bit Only)
    {2, 0},  // SMULH (64Bit Only)
    {5, 0},  // UMADDL (64Bit Only)
    {5, 1},  // UMSUBL (64Bit Only)
    {6, 0},  // UMULH (64Bit Only)
};

// Logical (shifted register)
static const u32 LogicalEnc[][2] = {
    {0, 0},  // AND
    {0, 1},  // BIC
    {1, 0},  // ORR
    {1, 1},  // ORN
    {2, 0},  // EOR
    {2, 1},  // EON
    {3, 0},  // ANDS
    {3, 1},  // BICS
};

// Load/Store Exclusive
static const u32 LoadStoreExcEnc[][5] = {
    {0, 0, 0, 0, 0},  // STXRB
    {0, 0, 0, 0, 1},  // STLXRB
    {0, 0, 1, 0, 0},  // LDXRB
    {0, 0, 1, 0, 1},  // LDAXRB
    {0, 1, 0, 0, 1},  // STLRB
    {0, 1, 1, 0, 1},  // LDARB
    {1, 0, 0, 0, 0},  // STXRH
    {1, 0, 0, 0, 1},  // STLXRH
    {1, 0, 1, 0, 0},  // LDXRH
    {1, 0, 1, 0, 1},  // LDAXRH
    {1, 1, 0, 0, 1},  // STLRH
    {1, 1, 1, 0, 1},  // LDARH
    {2, 0, 0, 0, 0},  // STXR
    {3, 0, 0, 0, 0},  // (64bit) STXR
    {2, 0, 0, 0, 1},  // STLXR
    {3, 0, 0, 0, 1},  // (64bit) STLXR
    {2, 0, 0, 1, 0},  // STXP
    {3, 0, 0, 1, 0},  // (64bit) STXP
    {2, 0, 0, 1, 1},  // STLXP
    {3, 0, 0, 1, 1},  // (64bit) STLXP
    {2, 0, 1, 0, 0},  // LDXR
    {3, 0, 1, 0, 0},  // (64bit) LDXR
    {2, 0, 1, 0, 1},  // LDAXR
    {3, 0, 1, 0, 1},  // (64bit) LDAXR
    {2, 0, 1, 1, 0},  // LDXP
    {3, 0, 1, 1, 0},  // (64bit) LDXP
    {2, 0, 1, 1, 1},  // LDAXP
    {3, 0, 1, 1, 1},  // (64bit) LDAXP
    {2, 1, 0, 0, 1},  // STLR
    {3, 1, 0, 0, 1},  // (64bit) STLR
    {2, 1, 1, 0, 1},  // LDAR
    {3, 1, 1, 0, 1},  // (64bit) LDAR
};
+
// CBZ/CBNZ: compare a register against zero and branch. The target offset
// is a signed 19-bit instruction count, checked below.
void ARM64XEmitter::EncodeCompareBranchInst(u32 op, ARM64Reg Rt, const void* ptr)
{
  bool b64Bit = Is64Bit(Rt);
  // branch distance is measured from the executable mapping
  s64 distance = (s64)ptr - (s64)(m_rxbase + m_code);

  ASSERT_MSG(DYNA_REC, !(distance & 0x3), "%s: distance must be a multiple of 4: %" PRIx64,
             __func__, distance);

  distance >>= 2;

  ASSERT_MSG(DYNA_REC, distance >= -0x40000 && distance <= 0x3FFFF,
             "%s: Received too large distance: %" PRIx64, __func__, distance);

  Rt = DecodeReg(Rt);
  // imm19 occupies bits [23:5]
  Write32((b64Bit << 31) | (0x34 << 24) | (op << 24) | (((u32)distance << 5) & 0xFFFFE0) | Rt);
}
+
+void ARM64XEmitter::EncodeTestBranchInst(u32 op, ARM64Reg Rt, u8 bits, const void* ptr)
+{
+ bool b64Bit = Is64Bit(Rt);
+ s64 distance = (s64)ptr - (s64)(m_rxbase + m_code);
+
+ ASSERT_MSG(DYNA_REC, !(distance & 0x3), "%s: distance must be a multiple of 4: %" PRIx64,
+ __func__, distance);
+
+ distance >>= 2;
+
+ ASSERT_MSG(DYNA_REC, distance >= -0x3FFF && distance < 0x3FFF,
+ "%s: Received too large distance: %" PRIx64, __func__, distance);
+
+ Rt = DecodeReg(Rt);
+ Write32((b64Bit << 31) | (0x36 << 24) | (op << 24) | (bits << 19) |
+ (((u32)distance << 5) & 0x7FFE0) | Rt);
+}
+
+void ARM64XEmitter::EncodeUnconditionalBranchInst(u32 op, const void* ptr)
+{
+ s64 distance = (s64)ptr - s64(m_rxbase + m_code);
+
+ ASSERT_MSG(DYNA_REC, !(distance & 0x3), "%s: distance must be a multiple of 4: %" PRIx64,
+ __func__, distance);
+
+ distance >>= 2;
+
+ ASSERT_MSG(DYNA_REC, distance >= -0x2000000LL && distance <= 0x1FFFFFFLL,
+ "%s: Received too large distance: %" PRIx64, __func__, distance);
+
+ Write32((op << 31) | (0x5 << 26) | (distance & 0x3FFFFFF));
+}
+
+void ARM64XEmitter::EncodeUnconditionalBranchInst(u32 opc, u32 op2, u32 op3, u32 op4, ARM64Reg Rn)
+{
+ Rn = DecodeReg(Rn);
+ Write32((0x6B << 25) | (opc << 21) | (op2 << 16) | (op3 << 10) | (Rn << 5) | op4);
+}
+
+void ARM64XEmitter::EncodeExceptionInst(u32 instenc, u32 imm)
+{
+ ASSERT_MSG(DYNA_REC, !(imm & ~0xFFFF), "%s: Exception instruction too large immediate: %d",
+ __func__, imm);
+
+ Write32((0xD4 << 24) | (ExcEnc[instenc][0] << 21) | (imm << 5) | (ExcEnc[instenc][1] << 2) |
+ ExcEnc[instenc][2]);
+}
+
+void ARM64XEmitter::EncodeSystemInst(u32 op0, u32 op1, u32 CRn, u32 CRm, u32 op2, ARM64Reg Rt)
+{
+ Write32((0x354 << 22) | (op0 << 19) | (op1 << 16) | (CRn << 12) | (CRm << 8) | (op2 << 5) | Rt);
+}
+
+// Add/sub (shifted or extended register): instenc indexes ArithEnc; bit 21
+// is forced on for the extended-register form per the A64 encoding.
+void ARM64XEmitter::EncodeArithmeticInst(u32 instenc, bool flags, ARM64Reg Rd, ARM64Reg Rn,
+ ARM64Reg Rm, ArithOption Option)
+{
+ bool b64Bit = Is64Bit(Rd);
+
+ Rd = DecodeReg(Rd);
+ Rn = DecodeReg(Rn);
+ Rm = DecodeReg(Rm);
+ Write32((b64Bit << 31) | (flags << 29) | (ArithEnc[instenc] << 21) |
+ (Option.GetType() == ArithOption::TYPE_EXTENDEDREG ? (1 << 21) : 0) | (Rm << 16) |
+ Option.GetData() | (Rn << 5) | Rd);
+}
+
+// ADC/SBC (with carry); op selects add vs subtract, flags selects the S form.
+void ARM64XEmitter::EncodeArithmeticCarryInst(u32 op, bool flags, ARM64Reg Rd, ARM64Reg Rn,
+ ARM64Reg Rm)
+{
+ bool b64Bit = Is64Bit(Rd);
+
+ Rd = DecodeReg(Rd);
+ Rm = DecodeReg(Rm);
+ Rn = DecodeReg(Rn);
+ Write32((b64Bit << 31) | (op << 30) | (flags << 29) | (0xD0 << 21) | (Rm << 16) | (Rn << 5) | Rd);
+}
+
+// CCMN/CCMP with a 5-bit immediate operand; nzcv is the flag set used when
+// the condition fails.
+void ARM64XEmitter::EncodeCondCompareImmInst(u32 op, ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond)
+{
+ bool b64Bit = Is64Bit(Rn);
+
+ ASSERT_MSG(DYNA_REC, !(imm & ~0x1F), "%s: too large immediate: %d", __func__, imm);
+ ASSERT_MSG(DYNA_REC, !(nzcv & ~0xF), "%s: Flags out of range: %d", __func__, nzcv);
+
+ Rn = DecodeReg(Rn);
+ Write32((b64Bit << 31) | (op << 30) | (1 << 29) | (0xD2 << 21) | (imm << 16) | (cond << 12) |
+ (1 << 11) | (Rn << 5) | nzcv);
+}
+
+// CCMN/CCMP, register-operand form.
+void ARM64XEmitter::EncodeCondCompareRegInst(u32 op, ARM64Reg Rn, ARM64Reg Rm, u32 nzcv,
+ CCFlags cond)
+{
+ bool b64Bit = Is64Bit(Rm);
+
+ ASSERT_MSG(DYNA_REC, !(nzcv & ~0xF), "%s: Flags out of range: %d", __func__, nzcv);
+
+ Rm = DecodeReg(Rm);
+ Rn = DecodeReg(Rn);
+ Write32((b64Bit << 31) | (op << 30) | (1 << 29) | (0xD2 << 21) | (Rm << 16) | (cond << 12) |
+ (Rn << 5) | nzcv);
+}
+
+// CSEL/CSINC/CSINV/CSNEG via the CondSelectEnc table.
+void ARM64XEmitter::EncodeCondSelectInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm,
+ CCFlags cond)
+{
+ bool b64Bit = Is64Bit(Rd);
+
+ Rd = DecodeReg(Rd);
+ Rm = DecodeReg(Rm);
+ Rn = DecodeReg(Rn);
+ Write32((b64Bit << 31) | (CondSelectEnc[instenc][0] << 30) | (0xD4 << 21) | (Rm << 16) |
+ (cond << 12) | (CondSelectEnc[instenc][1] << 10) | (Rn << 5) | Rd);
+}
+
+// Data-processing, 1 source (RBIT/REV16/REV32/REV64/CLZ/CLS) via Data1SrcEnc.
+void ARM64XEmitter::EncodeData1SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn)
+{
+ bool b64Bit = Is64Bit(Rd);
+
+ Rd = DecodeReg(Rd);
+ Rn = DecodeReg(Rn);
+ Write32((b64Bit << 31) | (0x2D6 << 21) | (Data1SrcEnc[instenc][0] << 16) |
+ (Data1SrcEnc[instenc][1] << 10) | (Rn << 5) | Rd);
+}
+
+// Data-processing, 2 sources (UDIV/SDIV/shifts/CRC32*) via Data2SrcEnc.
+void ARM64XEmitter::EncodeData2SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ bool b64Bit = Is64Bit(Rd);
+
+ Rd = DecodeReg(Rd);
+ Rm = DecodeReg(Rm);
+ Rn = DecodeReg(Rn);
+ Write32((b64Bit << 31) | (0x0D6 << 21) | (Rm << 16) | (Data2SrcEnc[instenc] << 10) | (Rn << 5) |
+ Rd);
+}
+
+// Data-processing, 3 sources (MADD/MSUB/long multiplies) via Data3SrcEnc;
+// the table's first column is OR-ed over the 0xD8 base in the same field.
+void ARM64XEmitter::EncodeData3SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm,
+ ARM64Reg Ra)
+{
+ bool b64Bit = Is64Bit(Rd);
+
+ Rd = DecodeReg(Rd);
+ Rm = DecodeReg(Rm);
+ Rn = DecodeReg(Rn);
+ Ra = DecodeReg(Ra);
+ Write32((b64Bit << 31) | (0xD8 << 21) | (Data3SrcEnc[instenc][0] << 21) | (Rm << 16) |
+ (Data3SrcEnc[instenc][1] << 15) | (Ra << 10) | (Rn << 5) | Rd);
+}
+
+// Logical, shifted register (AND/BIC/ORR/ORN/EOR/EON/ANDS/BICS) via
+// LogicalEnc; Shift supplies the shift type/amount bits.
+void ARM64XEmitter::EncodeLogicalInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm,
+ ArithOption Shift)
+{
+ bool b64Bit = Is64Bit(Rd);
+
+ Rd = DecodeReg(Rd);
+ Rm = DecodeReg(Rm);
+ Rn = DecodeReg(Rn);
+ Write32((b64Bit << 31) | (LogicalEnc[instenc][0] << 29) | (0x5 << 25) |
+ (LogicalEnc[instenc][1] << 21) | Shift.GetData() | (Rm << 16) | (Rn << 5) | Rd);
+}
+
+// Load register (literal): PC-relative load with imm19 word offset.
+// NOTE(review): the assert mask `!(imm & 0xFFFFF)` rejects ANY non-zero
+// low-20-bit offset, which cannot be the intent for a range check —
+// presumably it should reject bits *above* imm19 instead. Matches the
+// upstream code this was imported from; confirm before relying on it.
+void ARM64XEmitter::EncodeLoadRegisterInst(u32 bitop, ARM64Reg Rt, u32 imm)
+{
+ bool b64Bit = Is64Bit(Rt);
+ bool bVec = IsVector(Rt);
+
+ ASSERT_MSG(DYNA_REC, !(imm & 0xFFFFF), "%s: offset too large %d", __func__, imm);
+
+ Rt = DecodeReg(Rt);
+ if (b64Bit && bitop != 0x2) // LDRSW(0x2) uses 64bit reg, doesn't have 64bit bit set
+ bitop |= 0x1;
+ Write32((bitop << 30) | (bVec << 26) | (0x18 << 24) | (imm << 5) | Rt);
+}
+
+// Load/store exclusive (LDXR/STXR families) via LoadStoreExcEnc.
+// NOTE(review): the wrapper call sites below pass (..., Rt, Rn) into the
+// (..., Rn, Rt) parameter slots, so the caller's Rt lands in bits 9:5 and
+// Rn in bits 4:0 — verify against the A64 spec; this mirrors the imported
+// upstream code unchanged.
+void ARM64XEmitter::EncodeLoadStoreExcInst(u32 instenc, ARM64Reg Rs, ARM64Reg Rt2, ARM64Reg Rn,
+ ARM64Reg Rt)
+{
+ Rs = DecodeReg(Rs);
+ Rt2 = DecodeReg(Rt2);
+ Rn = DecodeReg(Rn);
+ Rt = DecodeReg(Rt);
+ Write32((LoadStoreExcEnc[instenc][0] << 30) | (0x8 << 24) | (LoadStoreExcEnc[instenc][1] << 23) |
+ (LoadStoreExcEnc[instenc][2] << 22) | (LoadStoreExcEnc[instenc][3] << 21) | (Rs << 16) |
+ (LoadStoreExcEnc[instenc][4] << 15) | (Rt2 << 10) | (Rn << 5) | Rt);
+}
+
+// Load/store no-allocate pair: imm is a byte offset, scaled down to the
+// element size (16/8/4 bytes) before the 7-bit range check.
+void ARM64XEmitter::EncodeLoadStorePairedInst(u32 op, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn,
+ u32 imm)
+{
+ bool b64Bit = Is64Bit(Rt);
+ bool b128Bit = IsQuad(Rt);
+ bool bVec = IsVector(Rt);
+
+ if (b128Bit)
+ imm >>= 4;
+ else if (b64Bit)
+ imm >>= 3;
+ else
+ imm >>= 2;
+
+ ASSERT_MSG(DYNA_REC, !(imm & ~0xF), "%s: offset too large %d", __func__, imm);
+
+ u32 opc = 0;
+ if (b128Bit)
+ opc = 2;
+ else if (b64Bit && bVec)
+ opc = 1;
+ else if (b64Bit && !bVec)
+ opc = 2;
+
+ Rt = DecodeReg(Rt);
+ Rt2 = DecodeReg(Rt2);
+ Rn = DecodeReg(Rn);
+ Write32((opc << 30) | (bVec << 26) | (op << 22) | (imm << 15) | (Rt2 << 10) | (Rn << 5) | Rt);
+}
+
+// Load/store register, pre/post-indexed: signed 9-bit byte offset.
+void ARM64XEmitter::EncodeLoadStoreIndexedInst(u32 op, u32 op2, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+ bool b64Bit = Is64Bit(Rt);
+ bool bVec = IsVector(Rt);
+
+ u32 offset = imm & 0x1FF;
+
+ ASSERT_MSG(DYNA_REC, !(imm < -256 || imm > 255), "%s: offset too large %d", __func__, imm);
+
+ Rt = DecodeReg(Rt);
+ Rn = DecodeReg(Rn);
+ Write32((b64Bit << 30) | (op << 22) | (bVec << 26) | (offset << 12) | (op2 << 10) | (Rn << 5) |
+ Rt);
+}
+
+// Load/store register, unsigned-offset form: imm is a byte offset scaled
+// by the access size to an unsigned 12-bit field.
+void ARM64XEmitter::EncodeLoadStoreIndexedInst(u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm, u8 size)
+{
+ bool b64Bit = Is64Bit(Rt);
+ bool bVec = IsVector(Rt);
+
+ if (size == 64)
+ imm >>= 3;
+ else if (size == 32)
+ imm >>= 2;
+ else if (size == 16)
+ imm >>= 1;
+
+ ASSERT_MSG(DYNA_REC, imm >= 0, "%s(INDEX_UNSIGNED): offset must be positive %d", __func__, imm);
+ ASSERT_MSG(DYNA_REC, !(imm & ~0xFFF), "%s(INDEX_UNSIGNED): offset too large %d", __func__, imm);
+
+ Rt = DecodeReg(Rt);
+ Rn = DecodeReg(Rn);
+ Write32((b64Bit << 30) | (op << 22) | (bVec << 26) | (imm << 10) | (Rn << 5) | Rt);
+}
+
+// MOVZ/MOVN/MOVK: 16-bit immediate placed at 16*pos.
+void ARM64XEmitter::EncodeMOVWideInst(u32 op, ARM64Reg Rd, u32 imm, ShiftAmount pos)
+{
+ bool b64Bit = Is64Bit(Rd);
+
+ ASSERT_MSG(DYNA_REC, !(imm & ~0xFFFF), "%s: immediate out of range: %d", __func__, imm);
+
+ Rd = DecodeReg(Rd);
+ Write32((b64Bit << 31) | (op << 29) | (0x25 << 23) | (pos << 21) | (imm << 5) | Rd);
+}
+
+// SBFM/BFM/UBFM: bitfield move with immr (rotate) and imms (width) fields;
+// the N bit mirrors the 64-bit flag as required by the encoding.
+void ARM64XEmitter::EncodeBitfieldMOVInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms)
+{
+ bool b64Bit = Is64Bit(Rd);
+
+ Rd = DecodeReg(Rd);
+ Rn = DecodeReg(Rn);
+ Write32((b64Bit << 31) | (op << 29) | (0x26 << 23) | (b64Bit << 22) | (immr << 16) |
+ (imms << 10) | (Rn << 5) | Rd);
+}
+
+// Load/store register, register-offset form; Rm carries the extend/shift
+// option bits, and bit 11 (S=1) enables the scaled index.
+void ARM64XEmitter::EncodeLoadStoreRegisterOffset(u32 size, u32 opc, ARM64Reg Rt, ARM64Reg Rn,
+ ArithOption Rm)
+{
+ Rt = DecodeReg(Rt);
+ Rn = DecodeReg(Rn);
+ ARM64Reg decoded_Rm = DecodeReg(Rm.GetReg());
+
+ Write32((size << 30) | (opc << 22) | (0x1C1 << 21) | (decoded_Rm << 16) | Rm.GetData() |
+ (1 << 11) | (Rn << 5) | Rt);
+}
+
+// ADD/SUB (immediate): unsigned 12-bit immediate, optionally shifted <<12.
+void ARM64XEmitter::EncodeAddSubImmInst(u32 op, bool flags, u32 shift, u32 imm, ARM64Reg Rn,
+ ARM64Reg Rd)
+{
+ bool b64Bit = Is64Bit(Rd);
+
+ ASSERT_MSG(DYNA_REC, !(imm & ~0xFFF), "%s: immediate too large: %x", __func__, imm);
+
+ Rd = DecodeReg(Rd);
+ Rn = DecodeReg(Rn);
+ Write32((b64Bit << 31) | (op << 30) | (flags << 29) | (0x11 << 24) | (shift << 22) | (imm << 10) |
+ (Rn << 5) | Rd);
+}
+
+// Logical (immediate): immr/imms/n are the pre-encoded bitmask-immediate
+// fields (see IsImmLogical-style encodings).
+void ARM64XEmitter::EncodeLogicalImmInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms,
+ int n)
+{
+ // Sometimes Rd is fixed to SP, but can still be 32bit or 64bit.
+ // Use Rn to determine bitness here.
+ bool b64Bit = Is64Bit(Rn);
+
+ Rd = DecodeReg(Rd);
+ Rn = DecodeReg(Rn);
+
+ Write32((b64Bit << 31) | (op << 29) | (0x24 << 23) | (n << 22) | (immr << 16) | (imms << 10) |
+ (Rn << 5) | Rd);
+}
+
+// LDP/STP: signed-offset, pre- or post-indexed pair access. The byte
+// offset is scaled by the element size to the signed 7-bit imm field.
+void ARM64XEmitter::EncodeLoadStorePair(u32 op, u32 load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2,
+ ARM64Reg Rn, s32 imm)
+{
+ bool b64Bit = Is64Bit(Rt);
+ u32 type_encode = 0;
+
+ switch (type)
+ {
+ case INDEX_SIGNED:
+ type_encode = 0b010;
+ break;
+ case INDEX_POST:
+ type_encode = 0b001;
+ break;
+ case INDEX_PRE:
+ type_encode = 0b011;
+ break;
+ case INDEX_UNSIGNED:
+ ASSERT_MSG(DYNA_REC, false, "%s doesn't support INDEX_UNSIGNED!", __func__);
+ break;
+ }
+
+ if (b64Bit)
+ {
+ op |= 0b10;
+ imm >>= 3;
+ }
+ else
+ {
+ imm >>= 2;
+ }
+
+ Rt = DecodeReg(Rt);
+ Rt2 = DecodeReg(Rt2);
+ Rn = DecodeReg(Rn);
+
+ Write32((op << 30) | (0b101 << 27) | (type_encode << 23) | (load << 22) | ((imm & 0x7F) << 15) |
+ (Rt2 << 10) | (Rn << 5) | Rt);
+}
+// ADR/ADRP: the low 2 bits of imm go in immlo (bits 30:29), the rest in
+// immhi (bits 23:5).
+void ARM64XEmitter::EncodeAddressInst(u32 op, ARM64Reg Rd, s32 imm)
+{
+ Rd = DecodeReg(Rd);
+
+ Write32((op << 31) | ((imm & 0x3) << 29) | (0x10 << 24) | ((imm & 0x1FFFFC) << 3) | Rd);
+}
+
+// LDUR/STUR family: unscaled signed 9-bit byte offset.
+void ARM64XEmitter::EncodeLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+ ASSERT_MSG(DYNA_REC, !(imm < -256 || imm > 255), "%s received too large offset: %d", __func__,
+ imm);
+ Rt = DecodeReg(Rt);
+ Rn = DecodeReg(Rn);
+
+ Write32((size << 30) | (0b111 << 27) | (op << 22) | ((imm & 0x1FF) << 12) | (Rn << 5) | Rt);
+}
+
+// Range checks and masks for the signed word-distance fields used by the
+// branch encodings: imm19 (B.cond/CBZ), imm14 (TBZ), imm26 (B/BL).
+static constexpr bool IsInRangeImm19(s64 distance)
+{
+ return (distance >= -0x40000 && distance <= 0x3FFFF);
+}
+
+static constexpr bool IsInRangeImm14(s64 distance)
+{
+ return (distance >= -0x2000 && distance <= 0x1FFF);
+}
+
+static constexpr bool IsInRangeImm26(s64 distance)
+{
+ return (distance >= -0x2000000 && distance <= 0x1FFFFFF);
+}
+
+// Truncate a (range-checked) distance to the field width.
+static constexpr u32 MaskImm19(s64 distance)
+{
+ return distance & 0x7FFFF;
+}
+
+static constexpr u32 MaskImm14(s64 distance)
+{
+ return distance & 0x3FFF;
+}
+
+static constexpr u32 MaskImm26(s64 distance)
+{
+ return distance & 0x3FFFFFF;
+}
+
+// FixupBranch branching
+// Resolve a previously emitted placeholder (a NOP written by the factory
+// below) into the real branch instruction, now that the target (the
+// current m_code position) is known. The patch is written through the
+// RW mapping (m_rwbase) at the recorded code offset.
+void ARM64XEmitter::SetJumpTarget(FixupBranch const& branch)
+{
+ bool Not = false;
+ u32 inst = 0;
+ s64 distance = (s64)(m_code - branch.ptr);
+ distance >>= 2;
+
+ switch (branch.type)
+ {
+ case 1: // CBNZ
+ Not = true;
+ // intentional fallthrough: CBNZ is CBZ with the invert bit set
+ case 0: // CBZ
+ {
+ ASSERT_MSG(DYNA_REC, IsInRangeImm19(distance), "%s(%d): Received too large distance: %" PRIx64,
+ __func__, branch.type, distance);
+ bool b64Bit = Is64Bit(branch.reg);
+ ARM64Reg reg = DecodeReg(branch.reg);
+ inst = (b64Bit << 31) | (0x1A << 25) | (Not << 24) | (MaskImm19(distance) << 5) | reg;
+ }
+ break;
+ case 2: // B (conditional)
+ ASSERT_MSG(DYNA_REC, IsInRangeImm19(distance), "%s(%d): Received too large distance: %" PRIx64,
+ __func__, branch.type, distance);
+ inst = (0x2A << 25) | (MaskImm19(distance) << 5) | branch.cond;
+ break;
+ case 4: // TBNZ
+ Not = true;
+ // intentional fallthrough: TBNZ is TBZ with the invert bit set
+ case 3: // TBZ
+ {
+ ASSERT_MSG(DYNA_REC, IsInRangeImm14(distance), "%s(%d): Received too large distance: %" PRIx64,
+ __func__, branch.type, distance);
+ ARM64Reg reg = DecodeReg(branch.reg);
+ inst = ((branch.bit & 0x20) << 26) | (0x1B << 25) | (Not << 24) | ((branch.bit & 0x1F) << 19) |
+ (MaskImm14(distance) << 5) | reg;
+ }
+ break;
+ case 5: // B (uncoditional)
+ ASSERT_MSG(DYNA_REC, IsInRangeImm26(distance), "%s(%d): Received too large distance: %" PRIx64,
+ __func__, branch.type, distance);
+ inst = (0x5 << 26) | MaskImm26(distance);
+ break;
+ case 6: // BL (unconditional)
+ ASSERT_MSG(DYNA_REC, IsInRangeImm26(distance), "%s(%d): Received too large distance: %" PRIx64,
+ __func__, branch.type, distance);
+ inst = (0x25 << 26) | MaskImm26(distance);
+ break;
+ }
+
+ std::memcpy(m_rwbase + branch.ptr, &inst, sizeof(inst));
+}
+
+// Forward-branch factories: each records the current code offset plus the
+// branch kind/operands in a FixupBranch and emits a NOP placeholder that
+// SetJumpTarget() later overwrites with the real instruction.
+FixupBranch ARM64XEmitter::CBZ(ARM64Reg Rt)
+{
+ FixupBranch branch;
+ branch.ptr = m_code;
+ branch.type = 0;
+ branch.reg = Rt;
+ HINT(HINT_NOP);
+ return branch;
+}
+FixupBranch ARM64XEmitter::CBNZ(ARM64Reg Rt)
+{
+ FixupBranch branch;
+ branch.ptr = m_code;
+ branch.type = 1;
+ branch.reg = Rt;
+ HINT(HINT_NOP);
+ return branch;
+}
+FixupBranch ARM64XEmitter::B(CCFlags cond)
+{
+ FixupBranch branch;
+ branch.ptr = m_code;
+ branch.type = 2;
+ branch.cond = cond;
+ HINT(HINT_NOP);
+ return branch;
+}
+FixupBranch ARM64XEmitter::TBZ(ARM64Reg Rt, u8 bit)
+{
+ FixupBranch branch;
+ branch.ptr = m_code;
+ branch.type = 3;
+ branch.reg = Rt;
+ branch.bit = bit;
+ HINT(HINT_NOP);
+ return branch;
+}
+FixupBranch ARM64XEmitter::TBNZ(ARM64Reg Rt, u8 bit)
+{
+ FixupBranch branch;
+ branch.ptr = m_code;
+ branch.type = 4;
+ branch.reg = Rt;
+ branch.bit = bit;
+ HINT(HINT_NOP);
+ return branch;
+}
+FixupBranch ARM64XEmitter::B()
+{
+ FixupBranch branch;
+ branch.ptr = m_code;
+ branch.type = 5;
+ HINT(HINT_NOP);
+ return branch;
+}
+FixupBranch ARM64XEmitter::BL()
+{
+ FixupBranch branch;
+ branch.ptr = m_code;
+ branch.type = 6;
+ HINT(HINT_NOP);
+ return branch;
+}
+
+// Compare and Branch
+// Direct-to-pointer branch forms: the target address is known now, so the
+// instruction is emitted immediately (no fixup).
+void ARM64XEmitter::CBZ(ARM64Reg Rt, const void* ptr)
+{
+ EncodeCompareBranchInst(0, Rt, ptr);
+}
+void ARM64XEmitter::CBNZ(ARM64Reg Rt, const void* ptr)
+{
+ EncodeCompareBranchInst(1, Rt, ptr);
+}
+
+// Conditional Branch
+void ARM64XEmitter::B(CCFlags cond, const void* ptr)
+{
+ s64 distance = (s64)ptr - (s64)(m_rxbase + m_code);
+
+ distance >>= 2;
+
+ ASSERT_MSG(DYNA_REC, IsInRangeImm19(distance),
+ "%s: Received too large distance: %p->%p %" PRIi64 " %" PRIx64, __func__, m_execcode, ptr,
+ distance, distance);
+ Write32((0x54 << 24) | (MaskImm19(distance) << 5) | cond);
+}
+
+// Test and Branch
+void ARM64XEmitter::TBZ(ARM64Reg Rt, u8 bits, const void* ptr)
+{
+ EncodeTestBranchInst(0, Rt, bits, ptr);
+}
+void ARM64XEmitter::TBNZ(ARM64Reg Rt, u8 bits, const void* ptr)
+{
+ EncodeTestBranchInst(1, Rt, bits, ptr);
+}
+
+// Unconditional Branch
+void ARM64XEmitter::B(const void* ptr)
+{
+ EncodeUnconditionalBranchInst(0, ptr);
+}
+void ARM64XEmitter::BL(const void* ptr)
+{
+ EncodeUnconditionalBranchInst(1, ptr);
+}
+
+// Call helper: uses a single BL when the target is within imm26 range,
+// otherwise materializes the address in scratchreg and BLRs through it.
+void ARM64XEmitter::QuickCallFunction(ARM64Reg scratchreg, const void* func)
+{
+ s64 distance = (s64)func - (s64)(m_rxbase + m_code);
+ distance >>= 2; // Can only branch to opcode-aligned (4) addresses
+ if (!IsInRangeImm26(distance))
+ {
+ // WARN_LOG(DYNA_REC, "Distance too far in function call (%p to %p)! Using scratch.", m_code,
+ // func);
+ MOVI2R(scratchreg, (uintptr_t)func);
+ BLR(scratchreg);
+ }
+ else
+ {
+ BL(func);
+ }
+}
+
+// Tail-call variant of the above: B/BR instead of BL/BLR (no link).
+void ARM64XEmitter::QuickTailCall(ARM64Reg scratchreg, const void* func)
+{
+ s64 distance = (s64)func - (s64)(m_rxbase + m_code);
+ distance >>= 2; // Can only branch to opcode-aligned (4) addresses
+ if (!IsInRangeImm26(distance))
+ {
+ // WARN_LOG(DYNA_REC, "Distance too far in function call (%p to %p)! Using scratch.", m_code,
+ // func);
+ MOVI2R(scratchreg, (uintptr_t)func);
+ BR(scratchreg);
+ }
+ else
+ {
+ B(func);
+ }
+}
+
+// Unconditional Branch (register)
+// Thin wrappers over EncodeUnconditionalBranchInst; opc selects the form.
+void ARM64XEmitter::BR(ARM64Reg Rn)
+{
+ EncodeUnconditionalBranchInst(0, 0x1F, 0, 0, Rn);
+}
+void ARM64XEmitter::BLR(ARM64Reg Rn)
+{
+ EncodeUnconditionalBranchInst(1, 0x1F, 0, 0, Rn);
+}
+void ARM64XEmitter::RET(ARM64Reg Rn)
+{
+ EncodeUnconditionalBranchInst(2, 0x1F, 0, 0, Rn);
+}
+void ARM64XEmitter::ERET()
+{
+ EncodeUnconditionalBranchInst(4, 0x1F, 0, 0, SP);
+}
+void ARM64XEmitter::DRPS()
+{
+ EncodeUnconditionalBranchInst(5, 0x1F, 0, 0, SP);
+}
+
+// Exception generation
+// Wrappers over EncodeExceptionInst; the index selects the ExcEnc row.
+void ARM64XEmitter::SVC(u32 imm)
+{
+ EncodeExceptionInst(0, imm);
+}
+
+void ARM64XEmitter::HVC(u32 imm)
+{
+ EncodeExceptionInst(1, imm);
+}
+
+void ARM64XEmitter::SMC(u32 imm)
+{
+ EncodeExceptionInst(2, imm);
+}
+
+void ARM64XEmitter::BRK(u32 imm)
+{
+ EncodeExceptionInst(3, imm);
+}
+
+void ARM64XEmitter::HLT(u32 imm)
+{
+ EncodeExceptionInst(4, imm);
+}
+
+void ARM64XEmitter::DCPS1(u32 imm)
+{
+ EncodeExceptionInst(5, imm);
+}
+
+void ARM64XEmitter::DCPS2(u32 imm)
+{
+ EncodeExceptionInst(6, imm);
+}
+
+void ARM64XEmitter::DCPS3(u32 imm)
+{
+ EncodeExceptionInst(7, imm);
+}
+
+// System
+// MSR (immediate form): only SPSel/DAIFSet/DAIFClr accept an immediate.
+void ARM64XEmitter::_MSR(PStateField field, u8 imm)
+{
+ u32 op1 = 0, op2 = 0;
+ switch (field)
+ {
+ case FIELD_SPSel:
+ op1 = 0;
+ op2 = 5;
+ break;
+ case FIELD_DAIFSet:
+ op1 = 3;
+ op2 = 6;
+ break;
+ case FIELD_DAIFClr:
+ op1 = 3;
+ op2 = 7;
+ break;
+ default:
+ ASSERT_MSG(DYNA_REC, false, "Invalid PStateField to do a imm move to");
+ break;
+ }
+ EncodeSystemInst(0, op1, 4, imm, op2, WSP);
+}
+
+// Map a PStateField to its o0/op1/CRn/CRm/op2 system-register selector.
+static void GetSystemReg(PStateField field, int& o0, int& op1, int& CRn, int& CRm, int& op2)
+{
+ switch (field)
+ {
+ case FIELD_NZCV:
+ o0 = 3;
+ op1 = 3;
+ CRn = 4;
+ CRm = 2;
+ op2 = 0;
+ break;
+ case FIELD_FPCR:
+ o0 = 3;
+ op1 = 3;
+ CRn = 4;
+ CRm = 4;
+ op2 = 0;
+ break;
+ case FIELD_FPSR:
+ o0 = 3;
+ op1 = 3;
+ CRn = 4;
+ CRm = 4;
+ op2 = 1;
+ break;
+ case FIELD_PMCR_EL0:
+ o0 = 3;
+ op1 = 3;
+ CRn = 9;
+ CRm = 6;
+ op2 = 0;
+ break;
+ case FIELD_PMCCNTR_EL0:
+ o0 = 3;
+ op1 = 3;
+ CRn = 9;
+ CRm = 7;
+ op2 = 0;
+ break;
+ default:
+ ASSERT_MSG(DYNA_REC, false, "Invalid PStateField to do a register move from/to");
+ break;
+ }
+}
+
+// MSR (register form): write Xt to the named system register.
+void ARM64XEmitter::_MSR(PStateField field, ARM64Reg Rt)
+{
+ int o0 = 0, op1 = 0, CRn = 0, CRm = 0, op2 = 0;
+ ASSERT_MSG(DYNA_REC, Is64Bit(Rt), "MSR: Rt must be 64-bit");
+ GetSystemReg(field, o0, op1, CRn, CRm, op2);
+ EncodeSystemInst(o0, op1, CRn, CRm, op2, DecodeReg(Rt));
+}
+
+// MRS: read the named system register into Xt (o0|4 flips the read bit).
+void ARM64XEmitter::MRS(ARM64Reg Rt, PStateField field)
+{
+ int o0 = 0, op1 = 0, CRn = 0, CRm = 0, op2 = 0;
+ ASSERT_MSG(DYNA_REC, Is64Bit(Rt), "MRS: Rt must be 64-bit");
+ GetSystemReg(field, o0, op1, CRn, CRm, op2);
+ EncodeSystemInst(o0 | 4, op1, CRn, CRm, op2, DecodeReg(Rt));
+}
+
+void ARM64XEmitter::CNTVCT(Arm64Gen::ARM64Reg Rt)
+{
+ ASSERT_MSG(DYNA_REC, Is64Bit(Rt), "CNTVCT: Rt must be 64-bit");
+
+ // MRS <Xt>, CNTVCT_EL0 ; Read CNTVCT_EL0 into Xt
+ EncodeSystemInst(3 | 4, 3, 0xe, 0, 2, DecodeReg(Rt));
+}
+
+// Hints and barriers (NOP/SEV/... and CLREX/DSB/DMB/ISB).
+void ARM64XEmitter::HINT(SystemHint op)
+{
+ EncodeSystemInst(0, 3, 2, 0, op, WSP);
+}
+void ARM64XEmitter::CLREX()
+{
+ EncodeSystemInst(0, 3, 3, 0, 2, WSP);
+}
+void ARM64XEmitter::DSB(BarrierType type)
+{
+ EncodeSystemInst(0, 3, 3, type, 4, WSP);
+}
+void ARM64XEmitter::DMB(BarrierType type)
+{
+ EncodeSystemInst(0, 3, 3, type, 5, WSP);
+}
+void ARM64XEmitter::ISB(BarrierType type)
+{
+ EncodeSystemInst(0, 3, 3, type, 6, WSP);
+}
+
+// Add/Subtract (extended register)
+// Public ADD/SUB/CMP/CMN wrappers; the plain-register overloads default to
+// an LSL #0 shift. CMN/CMP discard the result by targeting the zero reg.
+void ARM64XEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ ADD(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0));
+}
+
+void ARM64XEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option)
+{
+ EncodeArithmeticInst(0, false, Rd, Rn, Rm, Option);
+}
+
+void ARM64XEmitter::ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeArithmeticInst(0, true, Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0));
+}
+
+void ARM64XEmitter::ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option)
+{
+ EncodeArithmeticInst(0, true, Rd, Rn, Rm, Option);
+}
+
+void ARM64XEmitter::SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ SUB(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0));
+}
+
+void ARM64XEmitter::SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option)
+{
+ EncodeArithmeticInst(1, false, Rd, Rn, Rm, Option);
+}
+
+void ARM64XEmitter::SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeArithmeticInst(1, true, Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0));
+}
+
+void ARM64XEmitter::SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option)
+{
+ EncodeArithmeticInst(1, true, Rd, Rn, Rm, Option);
+}
+
+void ARM64XEmitter::CMN(ARM64Reg Rn, ARM64Reg Rm)
+{
+ CMN(Rn, Rm, ArithOption(Rn, ST_LSL, 0));
+}
+
+void ARM64XEmitter::CMN(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option)
+{
+ EncodeArithmeticInst(0, true, Is64Bit(Rn) ? ZR : WZR, Rn, Rm, Option);
+}
+
+void ARM64XEmitter::CMP(ARM64Reg Rn, ARM64Reg Rm)
+{
+ CMP(Rn, Rm, ArithOption(Rn, ST_LSL, 0));
+}
+
+void ARM64XEmitter::CMP(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option)
+{
+ EncodeArithmeticInst(1, true, Is64Bit(Rn) ? ZR : WZR, Rn, Rm, Option);
+}
+
+// Add/Subtract (with carry)
+void ARM64XEmitter::ADC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeArithmeticCarryInst(0, false, Rd, Rn, Rm);
+}
+void ARM64XEmitter::ADCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeArithmeticCarryInst(0, true, Rd, Rn, Rm);
+}
+void ARM64XEmitter::SBC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeArithmeticCarryInst(1, false, Rd, Rn, Rm);
+}
+void ARM64XEmitter::SBCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeArithmeticCarryInst(1, true, Rd, Rn, Rm);
+}
+
+// Conditional Compare (immediate)
+void ARM64XEmitter::CCMN(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond)
+{
+ EncodeCondCompareImmInst(0, Rn, imm, nzcv, cond);
+}
+void ARM64XEmitter::CCMP(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond)
+{
+ EncodeCondCompareImmInst(1, Rn, imm, nzcv, cond);
+}
+
+// Conditiona Compare (register)
+void ARM64XEmitter::CCMN(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond)
+{
+ EncodeCondCompareRegInst(0, Rn, Rm, nzcv, cond);
+}
+void ARM64XEmitter::CCMP(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond)
+{
+ EncodeCondCompareRegInst(1, Rn, Rm, nzcv, cond);
+}
+
+// Conditional Select
+void ARM64XEmitter::CSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond)
+{
+ EncodeCondSelectInst(0, Rd, Rn, Rm, cond);
+}
+void ARM64XEmitter::CSINC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond)
+{
+ EncodeCondSelectInst(1, Rd, Rn, Rm, cond);
+}
+void ARM64XEmitter::CSINV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond)
+{
+ EncodeCondSelectInst(2, Rd, Rn, Rm, cond);
+}
+void ARM64XEmitter::CSNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond)
+{
+ EncodeCondSelectInst(3, Rd, Rn, Rm, cond);
+}
+
+// Data-Processing 1 source
+void ARM64XEmitter::RBIT(ARM64Reg Rd, ARM64Reg Rn)
+{
+ EncodeData1SrcInst(0, Rd, Rn);
+}
+void ARM64XEmitter::REV16(ARM64Reg Rd, ARM64Reg Rn)
+{
+ EncodeData1SrcInst(1, Rd, Rn);
+}
+void ARM64XEmitter::REV32(ARM64Reg Rd, ARM64Reg Rn)
+{
+ EncodeData1SrcInst(2, Rd, Rn);
+}
+void ARM64XEmitter::REV64(ARM64Reg Rd, ARM64Reg Rn)
+{
+ EncodeData1SrcInst(3, Rd, Rn);
+}
+void ARM64XEmitter::CLZ(ARM64Reg Rd, ARM64Reg Rn)
+{
+ EncodeData1SrcInst(4, Rd, Rn);
+}
+void ARM64XEmitter::CLS(ARM64Reg Rd, ARM64Reg Rn)
+{
+ EncodeData1SrcInst(5, Rd, Rn);
+}
+
+// Data-Processing 2 source
+void ARM64XEmitter::UDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData2SrcInst(0, Rd, Rn, Rm);
+}
+void ARM64XEmitter::SDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData2SrcInst(1, Rd, Rn, Rm);
+}
+void ARM64XEmitter::LSLV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData2SrcInst(2, Rd, Rn, Rm);
+}
+void ARM64XEmitter::LSRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData2SrcInst(3, Rd, Rn, Rm);
+}
+void ARM64XEmitter::ASRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData2SrcInst(4, Rd, Rn, Rm);
+}
+void ARM64XEmitter::RORV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData2SrcInst(5, Rd, Rn, Rm);
+}
+void ARM64XEmitter::CRC32B(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData2SrcInst(6, Rd, Rn, Rm);
+}
+void ARM64XEmitter::CRC32H(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData2SrcInst(7, Rd, Rn, Rm);
+}
+void ARM64XEmitter::CRC32W(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData2SrcInst(8, Rd, Rn, Rm);
+}
+void ARM64XEmitter::CRC32CB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData2SrcInst(9, Rd, Rn, Rm);
+}
+void ARM64XEmitter::CRC32CH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData2SrcInst(10, Rd, Rn, Rm);
+}
+void ARM64XEmitter::CRC32CW(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData2SrcInst(11, Rd, Rn, Rm);
+}
+void ARM64XEmitter::CRC32X(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData2SrcInst(12, Rd, Rn, Rm);
+}
+void ARM64XEmitter::CRC32CX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData2SrcInst(13, Rd, Rn, Rm);
+}
+
+// Data-Processing 3 source
+// Note: passing SP (register index 31) as Ra encodes ZR in the Ra field,
+// which is how the MUL/MNEG/SMULL/UMULL aliases are formed below.
+void ARM64XEmitter::MADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra)
+{
+ EncodeData3SrcInst(0, Rd, Rn, Rm, Ra);
+}
+void ARM64XEmitter::MSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra)
+{
+ EncodeData3SrcInst(1, Rd, Rn, Rm, Ra);
+}
+void ARM64XEmitter::SMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra)
+{
+ EncodeData3SrcInst(2, Rd, Rn, Rm, Ra);
+}
+void ARM64XEmitter::SMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ SMADDL(Rd, Rn, Rm, SP);
+}
+void ARM64XEmitter::SMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra)
+{
+ EncodeData3SrcInst(3, Rd, Rn, Rm, Ra);
+}
+void ARM64XEmitter::SMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData3SrcInst(4, Rd, Rn, Rm, SP);
+}
+void ARM64XEmitter::UMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra)
+{
+ EncodeData3SrcInst(5, Rd, Rn, Rm, Ra);
+}
+void ARM64XEmitter::UMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ UMADDL(Rd, Rn, Rm, SP);
+}
+void ARM64XEmitter::UMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra)
+{
+ EncodeData3SrcInst(6, Rd, Rn, Rm, Ra);
+}
+void ARM64XEmitter::UMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData3SrcInst(7, Rd, Rn, Rm, SP);
+}
+void ARM64XEmitter::MUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData3SrcInst(0, Rd, Rn, Rm, SP);
+}
+void ARM64XEmitter::MNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData3SrcInst(1, Rd, Rn, Rm, SP);
+}
+
+// Logical (shifted register)
+void ARM64XEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift)
+{
+ EncodeLogicalInst(0, Rd, Rn, Rm, Shift);
+}
+void ARM64XEmitter::BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift)
+{
+ EncodeLogicalInst(1, Rd, Rn, Rm, Shift);
+}
+void ARM64XEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift)
+{
+ EncodeLogicalInst(2, Rd, Rn, Rm, Shift);
+}
+void ARM64XEmitter::ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift)
+{
+ EncodeLogicalInst(3, Rd, Rn, Rm, Shift);
+}
+void ARM64XEmitter::EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift)
+{
+ EncodeLogicalInst(4, Rd, Rn, Rm, Shift);
+}
+void ARM64XEmitter::EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift)
+{
+ EncodeLogicalInst(5, Rd, Rn, Rm, Shift);
+}
+void ARM64XEmitter::ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift)
+{
+ EncodeLogicalInst(6, Rd, Rn, Rm, Shift);
+}
+void ARM64XEmitter::BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift)
+{
+ EncodeLogicalInst(7, Rd, Rn, Rm, Shift);
+}
+
+// MOV is the standard ORR-with-zero-register alias.
+void ARM64XEmitter::MOV(ARM64Reg Rd, ARM64Reg Rm, ArithOption Shift)
+{
+ ORR(Rd, Is64Bit(Rd) ? ZR : WZR, Rm, Shift);
+}
+
+void ARM64XEmitter::MOV(ARM64Reg Rd, ARM64Reg Rm)
+{
+ if (IsGPR(Rd) && IsGPR(Rm))
+ ORR(Rd, Is64Bit(Rd) ? ZR : WZR, Rm, ArithOption(Rm, ST_LSL, 0));
+ else
+ ASSERT_MSG(DYNA_REC, false, "Non-GPRs not supported in MOV");
+}
+void ARM64XEmitter::MVN(ARM64Reg Rd, ARM64Reg Rm)
+{
+ ORN(Rd, Is64Bit(Rd) ? ZR : WZR, Rm, ArithOption(Rm, ST_LSL, 0));
+}
+// Immediate-shift aliases expressed via the bitfield-move instructions,
+// exactly as the A64 architecture defines them.
+void ARM64XEmitter::LSL(ARM64Reg Rd, ARM64Reg Rm, int shift)
+{
+ int bits = Is64Bit(Rd) ? 64 : 32;
+ UBFM(Rd, Rm, (bits - shift) & (bits - 1), bits - shift - 1);
+}
+void ARM64XEmitter::LSR(ARM64Reg Rd, ARM64Reg Rm, int shift)
+{
+ int bits = Is64Bit(Rd) ? 64 : 32;
+ UBFM(Rd, Rm, shift, bits - 1);
+}
+void ARM64XEmitter::ASR(ARM64Reg Rd, ARM64Reg Rm, int shift)
+{
+ int bits = Is64Bit(Rd) ? 64 : 32;
+ SBFM(Rd, Rm, shift, bits - 1);
+}
+// ROR alias: EXTR with both sources equal rotates Rm right by shift.
+void ARM64XEmitter::ROR_(ARM64Reg Rd, ARM64Reg Rm, int shift)
+{
+ EXTR(Rd, Rm, Rm, shift);
+}
+
+// Logical (immediate)
+// immr/imms/invert are the pre-encoded bitmask-immediate fields; TST is
+// the ANDS alias that discards the result into the zero register.
+void ARM64XEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert)
+{
+ EncodeLogicalImmInst(0, Rd, Rn, immr, imms, invert);
+}
+void ARM64XEmitter::ANDS(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert)
+{
+ EncodeLogicalImmInst(3, Rd, Rn, immr, imms, invert);
+}
+void ARM64XEmitter::EOR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert)
+{
+ EncodeLogicalImmInst(2, Rd, Rn, immr, imms, invert);
+}
+void ARM64XEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert)
+{
+ EncodeLogicalImmInst(1, Rd, Rn, immr, imms, invert);
+}
+void ARM64XEmitter::TST(ARM64Reg Rn, u32 immr, u32 imms, bool invert)
+{
+ EncodeLogicalImmInst(3, Is64Bit(Rn) ? ZR : WZR, Rn, immr, imms, invert);
+}
+
+// Add/subtract (immediate)
+// shift selects the optional LSL #12 of the 12-bit immediate.
+void ARM64XEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift)
+{
+ EncodeAddSubImmInst(0, false, shift, imm, Rn, Rd);
+}
+void ARM64XEmitter::ADDS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift)
+{
+ EncodeAddSubImmInst(0, true, shift, imm, Rn, Rd);
+}
+void ARM64XEmitter::SUB(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift)
+{
+ EncodeAddSubImmInst(1, false, shift, imm, Rn, Rd);
+}
+void ARM64XEmitter::SUBS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift)
+{
+ EncodeAddSubImmInst(1, true, shift, imm, Rn, Rd);
+}
+void ARM64XEmitter::CMP(ARM64Reg Rn, u32 imm, bool shift)
+{
+ EncodeAddSubImmInst(1, true, shift, imm, Rn, Is64Bit(Rn) ? SP : WSP);
+}
+
+// Data Processing (Immediate)
+void ARM64XEmitter::MOVZ(ARM64Reg Rd, u32 imm, ShiftAmount pos)
+{
+ EncodeMOVWideInst(2, Rd, imm, pos);
+}
+void ARM64XEmitter::MOVN(ARM64Reg Rd, u32 imm, ShiftAmount pos)
+{
+ EncodeMOVWideInst(0, Rd, imm, pos);
+}
+void ARM64XEmitter::MOVK(ARM64Reg Rd, u32 imm, ShiftAmount pos)
+{
+ EncodeMOVWideInst(3, Rd, imm, pos);
+}
+
+// Bitfield move
+void ARM64XEmitter::BFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms)
+{
+ EncodeBitfieldMOVInst(1, Rd, Rn, immr, imms);
+}
+void ARM64XEmitter::SBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms)
+{
+ EncodeBitfieldMOVInst(0, Rd, Rn, immr, imms);
+}
+void ARM64XEmitter::UBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms)
+{
+ EncodeBitfieldMOVInst(2, Rd, Rn, immr, imms);
+}
+
+// BFI/UBFIZ aliases: insert `width` bits of Rn at bit `lsb` of Rd.
+void ARM64XEmitter::BFI(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width)
+{
+ u32 size = Is64Bit(Rn) ? 64 : 32;
+ ASSERT_MSG(DYNA_REC, (lsb + width) <= size,
+ "%s passed lsb %d and width %d which is greater than the register size!", __func__,
+ lsb, width);
+ EncodeBitfieldMOVInst(1, Rd, Rn, (size - lsb) % size, width - 1);
+}
+void ARM64XEmitter::UBFIZ(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width)
+{
+ u32 size = Is64Bit(Rn) ? 64 : 32;
+ ASSERT_MSG(DYNA_REC, (lsb + width) <= size,
+ "%s passed lsb %d and width %d which is greater than the register size!", __func__,
+ lsb, width);
+ EncodeBitfieldMOVInst(2, Rd, Rn, (size - lsb) % size, width - 1);
+}
+// EXTR: extract a register from a pair — Rd = lower bits of (Rn:Rm) >> shift.
+// ROR #imm is the alias with Rn == Rm. Encoding: sf | 00100111 | N | Rm |
+// imms(shift) | Rn | Rd, with N mirroring sf.
+void ARM64XEmitter::EXTR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u32 shift)
+{
+ bool sf = Is64Bit(Rd);
+ bool N = sf;
+ Rd = DecodeReg(Rd);
+ Rn = DecodeReg(Rn);
+ Rm = DecodeReg(Rm);
+ // BUG FIX: the Rn field (bits 9:5) previously encoded Rm a second time
+ // ("(Rm << 5)"), so any EXTR with Rn != Rm emitted a wrong instruction;
+ // only the ROR_ alias (which passes Rn == Rm) worked by coincidence.
+ Write32((sf << 31) | (0x27 << 23) | (N << 22) | (Rm << 16) | (shift << 10) | (Rn << 5) | Rd);
+}
+// Sign/zero-extend aliases, defined (per the architecture) as bitfield
+// moves over the low 8/16/32 bits.
+void ARM64XEmitter::SXTB(ARM64Reg Rd, ARM64Reg Rn)
+{
+ SBFM(Rd, Rn, 0, 7);
+}
+void ARM64XEmitter::SXTH(ARM64Reg Rd, ARM64Reg Rn)
+{
+ SBFM(Rd, Rn, 0, 15);
+}
+void ARM64XEmitter::SXTW(ARM64Reg Rd, ARM64Reg Rn)
+{
+ ASSERT_MSG(DYNA_REC, Is64Bit(Rd), "%s requires 64bit register as destination", __func__);
+ SBFM(Rd, Rn, 0, 31);
+}
+void ARM64XEmitter::UXTB(ARM64Reg Rd, ARM64Reg Rn)
+{
+ UBFM(Rd, Rn, 0, 7);
+}
+void ARM64XEmitter::UXTH(ARM64Reg Rd, ARM64Reg Rn)
+{
+ UBFM(Rd, Rn, 0, 15);
+}
+
+// Load Register (Literal)
+// PC-relative literal-pool loads; imm is the imm19 word offset.
+void ARM64XEmitter::LDR(ARM64Reg Rt, u32 imm)
+{
+ EncodeLoadRegisterInst(0, Rt, imm);
+}
+void ARM64XEmitter::LDRSW(ARM64Reg Rt, u32 imm)
+{
+ EncodeLoadRegisterInst(2, Rt, imm);
+}
+void ARM64XEmitter::PRFM(ARM64Reg Rt, u32 imm)
+{
+ EncodeLoadRegisterInst(3, Rt, imm);
+}
+
+// Load/Store pair
+void ARM64XEmitter::LDP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm)
+{
+ EncodeLoadStorePair(0, 1, type, Rt, Rt2, Rn, imm);
+}
+void ARM64XEmitter::LDPSW(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm)
+{
+ EncodeLoadStorePair(1, 1, type, Rt, Rt2, Rn, imm);
+}
+void ARM64XEmitter::STP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm)
+{
+ EncodeLoadStorePair(0, 0, type, Rt, Rt2, Rn, imm);
+}
+
+// Load/Store Exclusive
+// SP stands in for "unused field" (encodes as register 31) in the Rs/Rt2
+// slots. NOTE(review): the (Rt, Rn) argument order at these call sites
+// maps to the encoder's (Rn, Rt) parameters — kept as imported from
+// upstream; verify against the A64 exclusive-access encodings.
+void ARM64XEmitter::STXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(0, Rs, SP, Rt, Rn);
+}
+void ARM64XEmitter::STLXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(1, Rs, SP, Rt, Rn);
+}
+void ARM64XEmitter::LDXRB(ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(2, SP, SP, Rt, Rn);
+}
+void ARM64XEmitter::LDAXRB(ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(3, SP, SP, Rt, Rn);
+}
+void ARM64XEmitter::STLRB(ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(4, SP, SP, Rt, Rn);
+}
+void ARM64XEmitter::LDARB(ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(5, SP, SP, Rt, Rn);
+}
+void ARM64XEmitter::STXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(6, Rs, SP, Rt, Rn);
+}
+void ARM64XEmitter::STLXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(7, Rs, SP, Rt, Rn);
+}
+void ARM64XEmitter::LDXRH(ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(8, SP, SP, Rt, Rn);
+}
+void ARM64XEmitter::LDAXRH(ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(9, SP, SP, Rt, Rn);
+}
+void ARM64XEmitter::STLRH(ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(10, SP, SP, Rt, Rn);
+}
+void ARM64XEmitter::LDARH(ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(11, SP, SP, Rt, Rn);
+}
+// Word/doubleword variants: adding Is64Bit(Rt) selects the X-form row of
+// the LoadStoreExcEnc table.
+void ARM64XEmitter::STXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(12 + Is64Bit(Rt), Rs, SP, Rt, Rn);
+}
+void ARM64XEmitter::STLXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(14 + Is64Bit(Rt), Rs, SP, Rt, Rn);
+}
+void ARM64XEmitter::STXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(16 + Is64Bit(Rt), Rs, Rt2, Rt, Rn);
+}
+void ARM64XEmitter::STLXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(18 + Is64Bit(Rt), Rs, Rt2, Rt, Rn);
+}
+void ARM64XEmitter::LDXR(ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(20 + Is64Bit(Rt), SP, SP, Rt, Rn);
+}
+void ARM64XEmitter::LDAXR(ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(22 + Is64Bit(Rt), SP, SP, Rt, Rn);
+}
+void ARM64XEmitter::LDXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(24 + Is64Bit(Rt), SP, Rt2, Rt, Rn);
+}
+void ARM64XEmitter::LDAXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(26 + Is64Bit(Rt), SP, Rt2, Rt, Rn);
+}
+void ARM64XEmitter::STLR(ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(28 + Is64Bit(Rt), SP, SP, Rt, Rn);
+}
+void ARM64XEmitter::LDAR(ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(30 + Is64Bit(Rt), SP, SP, Rt, Rn);
+}
+
// Load/Store no-allocate pair (offset): like STP/LDP but with a non-temporal
// cache hint. 0xA0/0xA1 are the raw op encodings (bit 0 = load).
void ARM64XEmitter::STNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm)
{
  EncodeLoadStorePairedInst(0xA0, Rt, Rt2, Rn, imm);
}
void ARM64XEmitter::LDNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm)
{
  EncodeLoadStorePairedInst(0xA1, Rt, Rt2, Rn, imm);
}
+
// Load/Store register (immediate)
//
// INDEX_UNSIGNED selects the scaled 12-bit unsigned-offset form; the trailing
// argument is the access width used to scale/validate the immediate. All
// other index types use the 9-bit signed form, with op2=1 for post-indexed
// and op2=3 for pre-indexed addressing. The hex constants are the raw
// size/opc encodings for each instruction.
// XXX: Most of these support vectors
void ARM64XEmitter::STRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
{
  if (type == INDEX_UNSIGNED)
    EncodeLoadStoreIndexedInst(0x0E4, Rt, Rn, imm, 8);
  else
    EncodeLoadStoreIndexedInst(0x0E0, type == INDEX_POST ? 1 : 3, Rt, Rn, imm);
}
void ARM64XEmitter::LDRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
{
  if (type == INDEX_UNSIGNED)
    EncodeLoadStoreIndexedInst(0x0E5, Rt, Rn, imm, 8);
  else
    EncodeLoadStoreIndexedInst(0x0E1, type == INDEX_POST ? 1 : 3, Rt, Rn, imm);
}
// Sign-extending loads pick the opcode by destination width: the 64-bit
// variant has the lower opc value.
void ARM64XEmitter::LDRSB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
{
  if (type == INDEX_UNSIGNED)
    EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x0E6 : 0x0E7, Rt, Rn, imm, 8);
  else
    EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x0E2 : 0x0E3, type == INDEX_POST ? 1 : 3, Rt, Rn,
                               imm);
}
void ARM64XEmitter::STRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
{
  if (type == INDEX_UNSIGNED)
    EncodeLoadStoreIndexedInst(0x1E4, Rt, Rn, imm, 16);
  else
    EncodeLoadStoreIndexedInst(0x1E0, type == INDEX_POST ? 1 : 3, Rt, Rn, imm);
}
void ARM64XEmitter::LDRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
{
  if (type == INDEX_UNSIGNED)
    EncodeLoadStoreIndexedInst(0x1E5, Rt, Rn, imm, 16);
  else
    EncodeLoadStoreIndexedInst(0x1E1, type == INDEX_POST ? 1 : 3, Rt, Rn, imm);
}
void ARM64XEmitter::LDRSH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
{
  if (type == INDEX_UNSIGNED)
    EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x1E6 : 0x1E7, Rt, Rn, imm, 16);
  else
    EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x1E2 : 0x1E3, type == INDEX_POST ? 1 : 3, Rt, Rn,
                               imm);
}
void ARM64XEmitter::STR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
{
  if (type == INDEX_UNSIGNED)
    EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E4 : 0x2E4, Rt, Rn, imm, Is64Bit(Rt) ? 64 : 32);
  else
    EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E0 : 0x2E0, type == INDEX_POST ? 1 : 3, Rt, Rn,
                               imm);
}
void ARM64XEmitter::LDR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
{
  if (type == INDEX_UNSIGNED)
    EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E5 : 0x2E5, Rt, Rn, imm, Is64Bit(Rt) ? 64 : 32);
  else
    EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E1 : 0x2E1, type == INDEX_POST ? 1 : 3, Rt, Rn,
                               imm);
}
// LDRSW always sign-extends a 32-bit load into a 64-bit register; the access
// width (and immediate scaling) is 32 bits.
void ARM64XEmitter::LDRSW(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
{
  if (type == INDEX_UNSIGNED)
    EncodeLoadStoreIndexedInst(0x2E6, Rt, Rn, imm, 32);
  else
    EncodeLoadStoreIndexedInst(0x2E2, type == INDEX_POST ? 1 : 3, Rt, Rn, imm);
}
+
// Load/Store register (register offset)
//
// EncodeLoadStoreRegisterOffset(size, opc, ...) — size 0..3 encodes the
// access width 8/16/32/64; opc 0=store, 1=zero-extending load, 2/3 =
// sign-extending load to 64/32 bits (hence "3 - b64Bit" below).
void ARM64XEmitter::STRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
{
  EncodeLoadStoreRegisterOffset(0, 0, Rt, Rn, Rm);
}
void ARM64XEmitter::LDRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
{
  EncodeLoadStoreRegisterOffset(0, 1, Rt, Rn, Rm);
}
void ARM64XEmitter::LDRSB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
{
  bool b64Bit = Is64Bit(Rt);
  EncodeLoadStoreRegisterOffset(0, 3 - b64Bit, Rt, Rn, Rm);
}
void ARM64XEmitter::STRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
{
  EncodeLoadStoreRegisterOffset(1, 0, Rt, Rn, Rm);
}
void ARM64XEmitter::LDRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
{
  EncodeLoadStoreRegisterOffset(1, 1, Rt, Rn, Rm);
}
void ARM64XEmitter::LDRSH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
{
  bool b64Bit = Is64Bit(Rt);
  EncodeLoadStoreRegisterOffset(1, 3 - b64Bit, Rt, Rn, Rm);
}
void ARM64XEmitter::STR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
{
  bool b64Bit = Is64Bit(Rt);
  EncodeLoadStoreRegisterOffset(2 + b64Bit, 0, Rt, Rn, Rm);
}
void ARM64XEmitter::LDR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
{
  bool b64Bit = Is64Bit(Rt);
  EncodeLoadStoreRegisterOffset(2 + b64Bit, 1, Rt, Rn, Rm);
}
void ARM64XEmitter::LDRSW(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
{
  EncodeLoadStoreRegisterOffset(2, 2, Rt, Rn, Rm);
}
// Prefetch with register offset; Rt carries the prefetch operation.
void ARM64XEmitter::PRFM(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
{
  EncodeLoadStoreRegisterOffset(3, 2, Rt, Rn, Rm);
}
+
// Load/Store register (unscaled offset): 9-bit signed byte offset, no
// writeback. Same size/opc scheme as the register-offset forms.
void ARM64XEmitter::STURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm)
{
  EncodeLoadStoreUnscaled(0, 0, Rt, Rn, imm);
}
void ARM64XEmitter::LDURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm)
{
  EncodeLoadStoreUnscaled(0, 1, Rt, Rn, imm);
}
void ARM64XEmitter::LDURSB(ARM64Reg Rt, ARM64Reg Rn, s32 imm)
{
  EncodeLoadStoreUnscaled(0, Is64Bit(Rt) ? 2 : 3, Rt, Rn, imm);
}
void ARM64XEmitter::STURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm)
{
  EncodeLoadStoreUnscaled(1, 0, Rt, Rn, imm);
}
void ARM64XEmitter::LDURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm)
{
  EncodeLoadStoreUnscaled(1, 1, Rt, Rn, imm);
}
void ARM64XEmitter::LDURSH(ARM64Reg Rt, ARM64Reg Rn, s32 imm)
{
  EncodeLoadStoreUnscaled(1, Is64Bit(Rt) ? 2 : 3, Rt, Rn, imm);
}
void ARM64XEmitter::STUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm)
{
  EncodeLoadStoreUnscaled(Is64Bit(Rt) ? 3 : 2, 0, Rt, Rn, imm);
}
void ARM64XEmitter::LDUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm)
{
  EncodeLoadStoreUnscaled(Is64Bit(Rt) ? 3 : 2, 1, Rt, Rn, imm);
}
void ARM64XEmitter::LDURSW(ARM64Reg Rt, ARM64Reg Rn, s32 imm)
{
  // NOTE(review): the assert condition and its message contradict each other:
  // the condition rejects a 64-bit Rt while the text demands one. The
  // architectural LDURSW always sign-extends into a 64-bit result; confirm
  // which register-naming convention callers follow before changing either.
  ASSERT_MSG(DYNA_REC, !Is64Bit(Rt), "%s must have a 64bit destination register!", __func__);
  EncodeLoadStoreUnscaled(2, 2, Rt, Rn, imm);
}
+
+void ARM64XEmitter::LDRGeneric(int size, bool signExtend, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
+{
+ switch (size | signExtend)
+ {
+ case 32: LDR (Rt, Rn, Rm); break;
+ case 33: LDRSW(Rt, Rn, Rm); break;
+ case 16: LDRH (Rt, Rn, Rm); break;
+ case 17: LDRSH(Rt, Rn, Rm); break;
+ case 8: LDRB (Rt, Rn, Rm); break;
+ case 9: LDRSB(Rt, Rn, Rm); break;
+ default: PanicAlert("LDRGeneric(reg): invalid size %d", size); break;
+ }
+}
+void ARM64XEmitter::STRGeneric(int size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
+{
+ switch (size)
+ {
+ case 32: STR (Rt, Rn, Rm); break;
+ case 16: STRH (Rt, Rn, Rm); break;
+ case 8: STRB (Rt, Rn, Rm); break;
+ default: PanicAlert("STRGeneric(reg): invalid size %d", size); break;
+ }
+}
+
+void ARM64XEmitter::LDRGeneric(int size, bool signExtend, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+ switch (size | signExtend)
+ {
+ case 32: LDR (type, Rt, Rn, imm); break;
+ case 33: LDRSW(type, Rt, Rn, imm); break;
+ case 16: LDRH (type, Rt, Rn, imm); break;
+ case 17: LDRSH(type, Rt, Rn, imm); break;
+ case 8: LDRB (type, Rt, Rn, imm); break;
+ case 9: LDRSB(type, Rt, Rn, imm); break;
+ default: PanicAlert("LDRGeneric(imm): invalid size %d", size); break;
+ }
+}
+void ARM64XEmitter::STRGeneric(int size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+ switch (size)
+ {
+ case 32: STR (type, Rt, Rn, imm); break;
+ case 16: STRH (type, Rt, Rn, imm); break;
+ case 8: STRB (type, Rt, Rn, imm); break;
+ default: PanicAlert("STRGeneric(imm): invalid size %d", size); break;
+ }
+}
+
// Address of label/page PC-relative.
// ADR materializes PC + imm (byte offset); ADRP materializes the 4KB page
// base, so the offset is pre-shifted right by 12 before encoding.
void ARM64XEmitter::ADR(ARM64Reg Rd, s32 imm)
{
  EncodeAddressInst(0, Rd, imm);
}
void ARM64XEmitter::ADRP(ARM64Reg Rd, s32 imm)
{
  EncodeAddressInst(1, Rd, imm >> 12);
}
+
// Wrapper around MOVZ+MOVK (and later MOVN)
// Materializes an arbitrary 32/64-bit immediate into Rd using the cheapest
// sequence this code knows: MOVZ for 0, ORN for all-ones, MOVN for small
// negative 32-bit values, ADR/ADRP(+ADD) for addresses near the code buffer,
// and a MOVZ+MOVK chain otherwise. When `optimize` is false, every 16-bit
// part is written even if zero (used when the sequence must have a fixed
// length, e.g. for later patching).
void ARM64XEmitter::MOVI2R(ARM64Reg Rd, u64 imm, bool optimize)
{
  // Number of 16-bit chunks in the destination register.
  unsigned int parts = Is64Bit(Rd) ? 4 : 2;
  BitSet32 upload_part(0);

  // Always start with a movz! Kills the dependency on the register.
  bool use_movz = true;

  if (!imm)
  {
    // Zero immediate, just clear the register. EOR is pointless when we have MOVZ, which looks
    // clearer in disasm too.
    MOVZ(Rd, 0, SHIFT_0);
    return;
  }

  if ((Is64Bit(Rd) && imm == std::numeric_limits<u64>::max()) ||
      (!Is64Bit(Rd) && imm == std::numeric_limits<u32>::max()))
  {
    // Max unsigned value (or if signed, -1)
    // Set to ~ZR
    // Note: register 31 in this operand position is the zero register.
    ARM64Reg ZR = Is64Bit(Rd) ? SP : WSP;
    ORN(Rd, ZR, ZR, ArithOption(ZR, ST_LSL, 0));
    return;
  }

  // TODO: Make some more systemic use of MOVN, but this will take care of most cases.
  // Small negative integer. Use MOVN
  if (!Is64Bit(Rd) && (imm | 0xFFFF0000) == imm)
  {
    MOVN(Rd, ~imm, SHIFT_0);
    return;
  }

  // XXX: Use MOVN when possible.
  // XXX: Optimize more
  // XXX: Support rotating immediates to save instructions
  if (optimize)
  {
    // Mark only the 16-bit chunks that are nonzero; zero chunks are skipped
    // by the MOVZ/MOVK loop at the bottom.
    for (unsigned int i = 0; i < parts; ++i)
    {
      if ((imm >> (i * 16)) & 0xFFFF)
        upload_part[i] = 1;
    }
  }

  // Try PC-relative materialization: if the target is within ADR/ADRP range
  // of the current emit position, one or two instructions suffice.
  u64 aligned_pc = (u64)(m_rxbase + m_code) & ~0xFFF;
  s64 aligned_offset = (s64)imm - (s64)aligned_pc;
  // The offset for ADR/ADRP is an s32, so make sure it can be represented in that
  if (upload_part.Count() > 1 && std::abs(aligned_offset) < 0x7FFFFFFFLL)
  {
    // Immediate we are loading is within 4GB of our aligned range
    // Most likely a address that we can load in one or two instructions
    if (!(std::abs(aligned_offset) & 0xFFF))
    {
      // Aligned ADR
      ADRP(Rd, (s32)aligned_offset);
      return;
    }
    else
    {
      // If the address is within 1MB of PC we can load it in a single instruction still
      s64 offset = (s64)imm - (s64)(m_rxbase + m_code);
      if (offset >= -0xFFFFF && offset <= 0xFFFFF)
      {
        ADR(Rd, (s32)offset);
        return;
      }
      else
      {
        // ADRP gives the page base; ADD fills in the low 12 bits.
        ADRP(Rd, (s32)(aligned_offset & ~0xFFF));
        ADD(Rd, Rd, imm & 0xFFF);
        return;
      }
    }
  }

  // Fallback: MOVZ for the first (nonzero) chunk, MOVK for the rest.
  for (unsigned i = 0; i < parts; ++i)
  {
    if (use_movz && upload_part[i])
    {
      MOVZ(Rd, (imm >> (i * 16)) & 0xFFFF, (ShiftAmount)i);
      use_movz = false;
    }
    else
    {
      if (upload_part[i] || !optimize)
        MOVK(Rd, (imm >> (i * 16)) & 0xFFFF, (ShiftAmount)i);
    }
  }
}
+
// Emits whichever of imm1/imm2 produces the shorter MOVI2R sequence, by
// speculatively emitting both and rewinding the code cursor in between.
// Returns true if imm2 was chosen, false if imm1 was. The caller is expected
// to patch/branch based on which immediate ended up in Rd.
bool ARM64XEmitter::MOVI2R2(ARM64Reg Rd, u64 imm1, u64 imm2)
{
  // TODO: Also optimize for performance, not just for code size.
  ptrdiff_t start_offset = GetCodeOffset();

  // Trial-emit imm1, measure, then rewind.
  MOVI2R(Rd, imm1);
  int size1 = GetCodeOffset() - start_offset;

  SetCodePtrUnsafe(start_offset);

  // Trial-emit imm2, measure, then rewind.
  MOVI2R(Rd, imm2);
  int size2 = GetCodeOffset() - start_offset;

  SetCodePtrUnsafe(start_offset);

  // Keep the shorter sequence (ties go to imm1).
  bool element = size1 > size2;

  MOVI2R(Rd, element ? imm2 : imm1);

  return element;
}
+
// Pushes the given set of general-purpose registers onto the stack.
// The stack stays 16-byte aligned: an odd register count reserves one padding
// slot. The very first store pre-decrements SP by the whole frame so that
// nothing is ever written below SP (an async signal/context switch may
// clobber memory below SP); the remaining pairs then store at positive
// offsets with no inter-store dependency.
void ARM64XEmitter::ABI_PushRegisters(BitSet32 registers)
{
  int num_regs = registers.Count();
  int stack_size = (num_regs + (num_regs & 1)) * 8;
  auto it = registers.begin();

  if (!num_regs)
    return;

  // 8 byte per register, but 16 byte alignment, so we may have to padd one register.
  // Only update the SP on the last write to avoid the dependency between those stores.

  // The first push must adjust the SP, else a context switch may invalidate everything below SP.
  if (num_regs & 1)
  {
    STR(INDEX_PRE, (ARM64Reg)(X0 + *it++), SP, -stack_size);
  }
  else
  {
    ARM64Reg first_reg = (ARM64Reg)(X0 + *it++);
    ARM64Reg second_reg = (ARM64Reg)(X0 + *it++);
    STP(INDEX_PRE, first_reg, second_reg, SP, -stack_size);
  }

  // Fast store for all other registers, this is always an even number.
  for (int i = 0; i < (num_regs - 1) / 2; i++)
  {
    ARM64Reg odd_reg = (ARM64Reg)(X0 + *it++);
    ARM64Reg even_reg = (ARM64Reg)(X0 + *it++);
    STP(INDEX_SIGNED, odd_reg, even_reg, SP, 16 * (i + 1));
  }

  ASSERT_MSG(DYNA_REC, it == registers.end(), "%s registers don't match.", __func__);
}
+
// Pops a register set previously pushed by ABI_PushRegisters (same set, same
// order). Mirrors the push: all but the first one/two registers load at
// positive offsets, and the final post-indexed load both restores the first
// register(s) and releases the whole frame in one instruction, so SP is only
// written once. `ignore_mask` is currently unused here.
void ARM64XEmitter::ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask)
{
  int num_regs = registers.Count();
  int stack_size = (num_regs + (num_regs & 1)) * 8;
  auto it = registers.begin();

  if (!num_regs)
    return;

  // We must adjust the SP in the end, so load the first (two) registers at least.
  ARM64Reg first = (ARM64Reg)(X0 + *it++);
  ARM64Reg second;
  if (!(num_regs & 1))
    second = (ARM64Reg)(X0 + *it++);

  // 8 byte per register, but 16 byte alignment, so we may have to padd one register.
  // Only update the SP on the last load to avoid the dependency between those loads.

  // Fast load for all but the first (two) registers, this is always an even number.
  for (int i = 0; i < (num_regs - 1) / 2; i++)
  {
    ARM64Reg odd_reg = (ARM64Reg)(X0 + *it++);
    ARM64Reg even_reg = (ARM64Reg)(X0 + *it++);
    LDP(INDEX_SIGNED, odd_reg, even_reg, SP, 16 * (i + 1));
  }

  // Post loading the first (two) registers.
  if (num_regs & 1)
    LDR(INDEX_POST, first, SP, stack_size);
  else
    LDP(INDEX_POST, first, second, SP, stack_size);

  ASSERT_MSG(DYNA_REC, it == registers.end(), "%s registers don't match.", __func__);
}
+
// Float Emitter
// FP/SIMD load/store with immediate offset. INDEX_UNSIGNED uses the scaled
// 12-bit unsigned-offset form; other index types use the 9-bit signed
// pre/post-indexed form. 128-bit accesses reuse size bits 00 and are flagged
// via bit 23 instead.
void ARM64FloatEmitter::EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, ARM64Reg Rt,
                                               ARM64Reg Rn, s32 imm)
{
  Rt = DecodeReg(Rt);
  Rn = DecodeReg(Rn);
  u32 encoded_size = 0;
  u32 encoded_imm = 0;

  if (size == 8)
    encoded_size = 0;
  else if (size == 16)
    encoded_size = 1;
  else if (size == 32)
    encoded_size = 2;
  else if (size == 64)
    encoded_size = 3;
  else if (size == 128)
    encoded_size = 0;

  if (type == INDEX_UNSIGNED)
  {
    // For power-of-two sizes 8..128, (size - 1) >> 3 equals (bytes - 1), i.e.
    // the natural-alignment mask for the access.
    ASSERT_MSG(DYNA_REC, !(imm & ((size - 1) >> 3)),
               "%s(INDEX_UNSIGNED) immediate offset must be aligned to size! (%d) (%p)", __func__,
               imm, m_emit->GetCodePtr());
    ASSERT_MSG(DYNA_REC, imm >= 0, "%s(INDEX_UNSIGNED) immediate offset must be positive!",
               __func__);
    // Scale the byte offset down to element units for the 12-bit field.
    if (size == 16)
      imm >>= 1;
    else if (size == 32)
      imm >>= 2;
    else if (size == 64)
      imm >>= 3;
    else if (size == 128)
      imm >>= 4;
    encoded_imm = (imm & 0xFFF);
  }
  else
  {
    ASSERT_MSG(DYNA_REC, !(imm < -256 || imm > 255),
               "%s immediate offset must be within range of -256 to 256!", __func__);
    // 9-bit signed immediate, plus the writeback mode in the low two bits.
    encoded_imm = (imm & 0x1FF) << 2;
    if (type == INDEX_POST)
      encoded_imm |= 1;
    else
      encoded_imm |= 3;
  }

  Write32((encoded_size << 30) | (0xF << 26) | (type == INDEX_UNSIGNED ? (1 << 24) : 0) |
          (size == 128 ? (1 << 23) : 0) | (opc << 22) | (encoded_imm << 10) | (Rn << 5) | Rt);
}
+
// Floating-point data-processing (2 source): scalar ops like FADD/FMUL.
// `type` selects single/double precision.
void ARM64FloatEmitter::EmitScalar2Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd,
                                          ARM64Reg Rn, ARM64Reg Rm)
{
  ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s only supports double and single registers!", __func__);
  Rd = DecodeReg(Rd);
  Rn = DecodeReg(Rn);
  Rm = DecodeReg(Rm);

  Write32((M << 31) | (S << 29) | (0b11110001 << 21) | (type << 22) | (Rm << 16) | (opcode << 12) |
          (1 << 11) | (Rn << 5) | Rd);
}

// Advanced SIMD three-same: element-wise vector ops. The Q bit (bit 30) is
// taken from whether Rt is a quad register; U selects the unsigned/alternate
// opcode space.
void ARM64FloatEmitter::EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn,
                                      ARM64Reg Rm)
{
  ASSERT_MSG(DYNA_REC, !IsSingle(Rd), "%s doesn't support singles!", __func__);
  bool quad = IsQuad(Rd);
  Rd = DecodeReg(Rd);
  Rn = DecodeReg(Rn);
  Rm = DecodeReg(Rm);

  Write32((quad << 30) | (U << 29) | (0b1110001 << 21) | (size << 22) | (Rm << 16) |
          (opcode << 11) | (1 << 10) | (Rn << 5) | Rd);
}

// Advanced SIMD copy (DUP/INS/UMOV/SMOV family); imm5/imm4 encode the
// element size and source/destination indices.
void ARM64FloatEmitter::EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn)
{
  Rd = DecodeReg(Rd);
  Rn = DecodeReg(Rn);

  Write32((Q << 30) | (op << 29) | (0b111 << 25) | (imm5 << 16) | (imm4 << 11) | (1 << 10) |
          (Rn << 5) | Rd);
}

// Advanced SIMD two-register miscellaneous (e.g. NOT, NEG, CNT, FCVT*).
void ARM64FloatEmitter::Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn)
{
  ASSERT_MSG(DYNA_REC, !IsSingle(Rd), "%s doesn't support singles!", __func__);
  Rd = DecodeReg(Rd);
  Rn = DecodeReg(Rn);

  Write32((Q << 30) | (U << 29) | (0b1110001 << 21) | (size << 22) | (opcode << 12) | (1 << 11) |
          (Rn << 5) | Rd);
}
+
// ASIMD load/store single structure, no-offset form (LD1/ST1 lane, LD1R...).
// L=load, R=second replicate opcode bit; opcode/S/size together encode the
// element width and lane index (see the LD1/ST1 wrappers below).
void ARM64FloatEmitter::EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size,
                                                     ARM64Reg Rt, ARM64Reg Rn)
{
  ASSERT_MSG(DYNA_REC, !IsSingle(Rt), "%s doesn't support singles!", __func__);
  bool quad = IsQuad(Rt);
  Rt = DecodeReg(Rt);
  Rn = DecodeReg(Rn);

  Write32((quad << 30) | (0b1101 << 24) | (L << 22) | (R << 21) | (opcode << 13) | (S << 12) |
          (size << 10) | (Rn << 5) | Rt);
}

// Post-indexed variant: Rm supplies the register post-increment.
void ARM64FloatEmitter::EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size,
                                                     ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm)
{
  ASSERT_MSG(DYNA_REC, !IsSingle(Rt), "%s doesn't support singles!", __func__);
  bool quad = IsQuad(Rt);
  Rt = DecodeReg(Rt);
  Rn = DecodeReg(Rn);
  Rm = DecodeReg(Rm);

  Write32((quad << 30) | (0x1B << 23) | (L << 22) | (R << 21) | (Rm << 16) | (opcode << 13) |
          (S << 12) | (size << 10) | (Rn << 5) | Rt);
}

// Floating-point data-processing (1 source): FMOV/FABS/FNEG/FSQRT/FCVT etc.
void ARM64FloatEmitter::Emit1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn)
{
  ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __func__);
  Rd = DecodeReg(Rd);
  Rn = DecodeReg(Rn);

  Write32((M << 31) | (S << 29) | (0xF1 << 21) | (type << 22) | (opcode << 15) | (1 << 14) |
          (Rn << 5) | Rd);
}

// Conversion between general-purpose and FP registers (SCVTF/UCVTF/FMOV...).
// Rn must be a GPR; sf selects 32/64-bit GPR width.
void ARM64FloatEmitter::EmitConversion(bool sf, bool S, u32 type, u32 rmode, u32 opcode,
                                       ARM64Reg Rd, ARM64Reg Rn)
{
  ASSERT_MSG(DYNA_REC, Rn <= SP, "%s only supports GPR as source!", __func__);
  Rd = DecodeReg(Rd);
  Rn = DecodeReg(Rn);

  Write32((sf << 31) | (S << 29) | (0xF1 << 21) | (type << 22) | (rmode << 19) | (opcode << 16) |
          (Rn << 5) | Rd);
}
+
// Float -> integer conversion with explicit rounding mode.
// If Rd is a GPR the FCVT{N,P,M,Z,A}{S,U} (scalar, to-GPR) encoding is used;
// otherwise the result stays in the FP register file via the scalar-SIMD
// FCVT* encoding. `sign` selects signed vs. unsigned conversion.
void ARM64FloatEmitter::EmitConvertScalarToInt(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round,
                                               bool sign)
{
  DEBUG_ASSERT_MSG(DYNA_REC, IsScalar(Rn), "fcvts: Rn must be floating point");
  if (IsGPR(Rd))
  {
    // Use the encoding that transfers the result to a GPR.
    bool sf = Is64Bit(Rd);
    int type = IsDouble(Rn) ? 1 : 0;
    Rd = DecodeReg(Rd);
    Rn = DecodeReg(Rn);
    // rmode/opcode pairs per rounding mode; round-to-nearest-with-ties-away
    // (ROUND_A) lives in a different opcode group (opcode |= 4).
    int opcode = (sign ? 1 : 0);
    int rmode = 0;
    switch (round)
    {
    case ROUND_A:
      rmode = 0;
      opcode |= 4;
      break;
    case ROUND_P:
      rmode = 1;
      break;
    case ROUND_M:
      rmode = 2;
      break;
    case ROUND_Z:
      rmode = 3;
      break;
    case ROUND_N:
      rmode = 0;
      break;
    }
    EmitConversion2(sf, 0, true, type, rmode, opcode, 0, Rd, Rn);
  }
  else
  {
    // Use the encoding (vector, single) that keeps the result in the fp register.
    int sz = IsDouble(Rn);
    Rd = DecodeReg(Rd);
    Rn = DecodeReg(Rn);
    // Here the rounding mode is folded into opcode, and P/Z additionally set
    // the high bit of the sz field.
    int opcode = 0;
    switch (round)
    {
    case ROUND_A:
      opcode = 0x1C;
      break;
    case ROUND_N:
      opcode = 0x1A;
      break;
    case ROUND_M:
      opcode = 0x1B;
      break;
    case ROUND_P:
      opcode = 0x1A;
      sz |= 2;
      break;
    case ROUND_Z:
      opcode = 0x1B;
      sz |= 2;
      break;
    }
    Write32((0x5E << 24) | (sign << 29) | (sz << 22) | (1 << 21) | (opcode << 12) | (2 << 10) |
            (Rn << 5) | Rd);
  }
}

// Float -> signed integer conversion.
void ARM64FloatEmitter::FCVTS(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round)
{
  EmitConvertScalarToInt(Rd, Rn, round, false);
}

// Float -> unsigned integer conversion.
void ARM64FloatEmitter::FCVTU(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round)
{
  EmitConvertScalarToInt(Rd, Rn, round, true);
}
+
// GPR<->FP conversion with a scale field (fixed-point forms); `direction`
// contributes to bit 21 alongside the 0xF0 opcode group.
void ARM64FloatEmitter::EmitConversion2(bool sf, bool S, bool direction, u32 type, u32 rmode,
                                        u32 opcode, int scale, ARM64Reg Rd, ARM64Reg Rn)
{
  Rd = DecodeReg(Rd);
  Rn = DecodeReg(Rn);

  Write32((sf << 31) | (S << 29) | (0xF0 << 21) | (direction << 21) | (type << 22) | (rmode << 19) |
          (opcode << 16) | (scale << 10) | (Rn << 5) | Rd);
}

// Floating-point compare (FCMP/FCMPE family). `opcode2` occupies the low
// five bits where Rd would normally live.
void ARM64FloatEmitter::EmitCompare(bool M, bool S, u32 op, u32 opcode2, ARM64Reg Rn, ARM64Reg Rm)
{
  ASSERT_MSG(DYNA_REC, !IsQuad(Rn), "%s doesn't support vector!", __func__);
  bool is_double = IsDouble(Rn);

  Rn = DecodeReg(Rn);
  Rm = DecodeReg(Rm);

  Write32((M << 31) | (S << 29) | (0xF1 << 21) | (is_double << 22) | (Rm << 16) | (op << 14) |
          (1 << 13) | (Rn << 5) | opcode2);
}

// Floating-point conditional select (FCSEL).
void ARM64FloatEmitter::EmitCondSelect(bool M, bool S, CCFlags cond, ARM64Reg Rd, ARM64Reg Rn,
                                       ARM64Reg Rm)
{
  ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __func__);
  bool is_double = IsDouble(Rd);

  Rd = DecodeReg(Rd);
  Rn = DecodeReg(Rn);
  Rm = DecodeReg(Rm);

  Write32((M << 31) | (S << 29) | (0xF1 << 21) | (is_double << 22) | (Rm << 16) | (cond << 12) |
          (3 << 10) | (Rn << 5) | Rd);
}

// ASIMD permute (ZIP/UZP/TRN); element width 8/16/32/64 maps to size 0..3.
void ARM64FloatEmitter::EmitPermute(u32 size, u32 op, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
  ASSERT_MSG(DYNA_REC, !IsSingle(Rd), "%s doesn't support singles!", __func__);

  bool quad = IsQuad(Rd);

  u32 encoded_size = 0;
  if (size == 16)
    encoded_size = 1;
  else if (size == 32)
    encoded_size = 2;
  else if (size == 64)
    encoded_size = 3;

  Rd = DecodeReg(Rd);
  Rn = DecodeReg(Rn);
  Rm = DecodeReg(Rm);

  Write32((quad << 30) | (7 << 25) | (encoded_size << 22) | (Rm << 16) | (op << 12) | (1 << 11) |
          (Rn << 5) | Rd);
}
+
// Floating-point immediate (FMOV #imm8 family).
void ARM64FloatEmitter::EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm8)
{
  ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __func__);

  bool is_double = !IsSingle(Rd);

  Rd = DecodeReg(Rd);

  // NOTE(review): is_double and type are both shifted to bit 22, so they
  // overlap; this only works if at most one of them is nonzero per call.
  // Verify callers before passing a nonzero `type` here.
  Write32((M << 31) | (S << 29) | (0xF1 << 21) | (is_double << 22) | (type << 22) | (imm8 << 13) |
          (1 << 12) | (imm5 << 5) | Rd);
}

// ASIMD shift by immediate (vector form); the shift amount is encoded in
// immh:immb and immh must be nonzero (immh==0 is a different opcode space).
void ARM64FloatEmitter::EmitShiftImm(bool Q, bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd,
                                     ARM64Reg Rn)
{
  ASSERT_MSG(DYNA_REC, immh, "%s bad encoding! Can't have zero immh", __func__);

  Rd = DecodeReg(Rd);
  Rn = DecodeReg(Rn);

  Write32((Q << 30) | (U << 29) | (0xF << 24) | (immh << 19) | (immb << 16) | (opcode << 11) |
          (1 << 10) | (Rn << 5) | Rd);
}

// Scalar variant of the shift-by-immediate encoding.
void ARM64FloatEmitter::EmitScalarShiftImm(bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd,
                                           ARM64Reg Rn)
{
  Rd = DecodeReg(Rd);
  Rn = DecodeReg(Rn);

  Write32((2 << 30) | (U << 29) | (0x3E << 23) | (immh << 19) | (immb << 16) | (opcode << 11) |
          (1 << 10) | (Rn << 5) | Rd);
}
+
// ASIMD load/store multiple structures (LD1/ST1 with 1-4 registers),
// no-offset form. Element width 8/16/32/64 maps to size bits 0..3.
void ARM64FloatEmitter::EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opcode, ARM64Reg Rt,
                                                       ARM64Reg Rn)
{
  bool quad = IsQuad(Rt);
  u32 encoded_size = 0;

  if (size == 16)
    encoded_size = 1;
  else if (size == 32)
    encoded_size = 2;
  else if (size == 64)
    encoded_size = 3;

  Rt = DecodeReg(Rt);
  Rn = DecodeReg(Rn);

  Write32((quad << 30) | (3 << 26) | (L << 22) | (opcode << 12) | (encoded_size << 10) | (Rn << 5) |
          Rt);
}

// Post-indexed variant: Rm supplies the register post-increment.
void ARM64FloatEmitter::EmitLoadStoreMultipleStructurePost(u32 size, bool L, u32 opcode,
                                                           ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm)
{
  bool quad = IsQuad(Rt);
  u32 encoded_size = 0;

  if (size == 16)
    encoded_size = 1;
  else if (size == 32)
    encoded_size = 2;
  else if (size == 64)
    encoded_size = 3;

  Rt = DecodeReg(Rt);
  Rn = DecodeReg(Rn);
  Rm = DecodeReg(Rm);

  Write32((quad << 30) | (0b11001 << 23) | (L << 22) | (Rm << 16) | (opcode << 12) |
          (encoded_size << 10) | (Rn << 5) | Rt);
}

// Scalar floating-point one-source ops (same layout as Emit1Source).
void ARM64FloatEmitter::EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd,
                                          ARM64Reg Rn)
{
  ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __func__);

  Rd = DecodeReg(Rd);
  Rn = DecodeReg(Rn);

  Write32((M << 31) | (S << 29) | (0xF1 << 21) | (type << 22) | (opcode << 15) | (1 << 14) |
          (Rn << 5) | Rd);
}

// ASIMD vector x indexed-element (e.g. FMUL by element); L and H carry parts
// of the element index.
void ARM64FloatEmitter::EmitVectorxElement(bool U, u32 size, bool L, u32 opcode, bool H,
                                           ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
  bool quad = IsQuad(Rd);

  Rd = DecodeReg(Rd);
  Rn = DecodeReg(Rn);
  Rm = DecodeReg(Rm);

  Write32((quad << 30) | (U << 29) | (0xF << 24) | (size << 22) | (L << 21) | (Rm << 16) |
          (opcode << 12) | (H << 11) | (Rn << 5) | Rd);
}

// FP/SIMD load/store with unscaled 9-bit signed offset (LDUR/STUR family).
// (The assert message says "to 256" but the accepted range is -256..255.)
void ARM64FloatEmitter::EmitLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
{
  ASSERT_MSG(DYNA_REC, !(imm < -256 || imm > 255), "%s received too large offset: %d", __func__,
             imm);
  Rt = DecodeReg(Rt);
  Rn = DecodeReg(Rn);

  Write32((size << 30) | (0xF << 26) | (op << 22) | ((imm & 0x1FF) << 12) | (Rn << 5) | Rt);
}
+
// FP/SIMD load/store pair (LDP/STP of S/D/Q registers). The byte offset must
// be naturally aligned to the register size and is scaled down to element
// units before encoding.
// NOTE: the scaled immediate is truncated to 7 bits (imm & 0x7F); the asserts
// only check alignment, so an out-of-range offset wraps silently.
void ARM64FloatEmitter::EncodeLoadStorePair(u32 size, bool load, IndexType type, ARM64Reg Rt,
                                            ARM64Reg Rt2, ARM64Reg Rn, s32 imm)
{
  u32 type_encode = 0;
  u32 opc = 0;

  switch (type)
  {
  case INDEX_SIGNED:
    type_encode = 0b010;
    break;
  case INDEX_POST:
    type_encode = 0b001;
    break;
  case INDEX_PRE:
    type_encode = 0b011;
    break;
  case INDEX_UNSIGNED:
    ASSERT_MSG(DYNA_REC, false, "%s doesn't support INDEX_UNSIGNED!", __func__);
    break;
  }

  if (size == 128)
  {
    ASSERT_MSG(DYNA_REC, !(imm & 0xF), "%s received invalid offset 0x%x!", __func__, imm);
    opc = 2;
    imm >>= 4;
  }
  else if (size == 64)
  {
    ASSERT_MSG(DYNA_REC, !(imm & 0x7), "%s received invalid offset 0x%x!", __func__, imm);
    opc = 1;
    imm >>= 3;
  }
  else if (size == 32)
  {
    ASSERT_MSG(DYNA_REC, !(imm & 0x3), "%s received invalid offset 0x%x!", __func__, imm);
    opc = 0;
    imm >>= 2;
  }

  Rt = DecodeReg(Rt);
  Rt2 = DecodeReg(Rt2);
  Rn = DecodeReg(Rn);

  Write32((opc << 30) | (0b1011 << 26) | (type_encode << 23) | (load << 22) | ((imm & 0x7F) << 15) |
          (Rt2 << 10) | (Rn << 5) | Rt);
}
+
+void ARM64FloatEmitter::EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn,
+ ArithOption Rm)
+{
+ ASSERT_MSG(DYNA_REC, Rm.GetType() == ArithOption::TYPE_EXTENDEDREG,
+ "%s must contain an extended reg as Rm!", __func__);
+
+ u32 encoded_size = 0;
+ u32 encoded_op = 0;
+
+ if (size == 8)
+ {
+ encoded_size = 0;
+ encoded_op = 0;
+ }
+ else if (size == 16)
+ {
+ encoded_size = 1;
+ encoded_op = 0;
+ }
+ else if (size == 32)
+ {
+ encoded_size = 2;
+ encoded_op = 0;
+ }
+ else if (size == 64)
+ {
+ encoded_size = 3;
+ encoded_op = 0;
+ }
+ else if (size == 128)
+ {
+ encoded_size = 0;
+ encoded_op = 2;
+ }
+
+ if (load)
+ encoded_op |= 1;
+
+ Rt = DecodeReg(Rt);
+ Rn = DecodeReg(Rn);
+ ARM64Reg decoded_Rm = DecodeReg(Rm.GetReg());
+
+ Write32((encoded_size << 30) | (encoded_op << 22) | (0b111100001 << 21) | (decoded_Rm << 16) |
+ Rm.GetData() | (1 << 11) | (Rn << 5) | Rt);
+}
+
+void ARM64FloatEmitter::EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh)
+{
+ union
+ {
+ u8 hex;
+ struct
+ {
+ unsigned defgh : 5;
+ unsigned abc : 3;
+ };
+ } v;
+ v.hex = abcdefgh;
+ Rd = DecodeReg(Rd);
+ Write32((Q << 30) | (op << 29) | (0xF << 24) | (v.abc << 16) | (cmode << 12) | (o2 << 11) |
+ (1 << 10) | (v.defgh << 5) | Rd);
+}
+
// FP/SIMD load/store with immediate offset; opc bit 0 = load.
void ARM64FloatEmitter::LDR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
{
  EmitLoadStoreImmediate(size, 1, type, Rt, Rn, imm);
}
void ARM64FloatEmitter::STR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
{
  EmitLoadStoreImmediate(size, 0, type, Rt, Rn, imm);
}
+
+// Loadstore unscaled
+void ARM64FloatEmitter::LDUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+ u32 encoded_size = 0;
+ u32 encoded_op = 0;
+
+ if (size == 8)
+ {
+ encoded_size = 0;
+ encoded_op = 1;
+ }
+ else if (size == 16)
+ {
+ encoded_size = 1;
+ encoded_op = 1;
+ }
+ else if (size == 32)
+ {
+ encoded_size = 2;
+ encoded_op = 1;
+ }
+ else if (size == 64)
+ {
+ encoded_size = 3;
+ encoded_op = 1;
+ }
+ else if (size == 128)
+ {
+ encoded_size = 0;
+ encoded_op = 3;
+ }
+
+ EmitLoadStoreUnscaled(encoded_size, encoded_op, Rt, Rn, imm);
+}
+void ARM64FloatEmitter::STUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+ u32 encoded_size = 0;
+ u32 encoded_op = 0;
+
+ if (size == 8)
+ {
+ encoded_size = 0;
+ encoded_op = 0;
+ }
+ else if (size == 16)
+ {
+ encoded_size = 1;
+ encoded_op = 0;
+ }
+ else if (size == 32)
+ {
+ encoded_size = 2;
+ encoded_op = 0;
+ }
+ else if (size == 64)
+ {
+ encoded_size = 3;
+ encoded_op = 0;
+ }
+ else if (size == 128)
+ {
+ encoded_size = 0;
+ encoded_op = 2;
+ }
+
+ EmitLoadStoreUnscaled(encoded_size, encoded_op, Rt, Rn, imm);
+}
+
// Loadstore single structure
// LD1 (single lane): load one element of width `size` into lane `index` of
// Rt. The lane index is split across the Q bit (high part), the S bit and
// the size field, per the ASIMD single-structure encoding; the top index bit
// selects the quad form of Rt.
void ARM64FloatEmitter::LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn)
{
  bool S = 0;
  u32 opcode = 0;
  u32 encoded_size = 0;
  ARM64Reg encoded_reg = INVALID_REG;

  if (size == 8)
  {
    // B lane: index bits [3]=Q, [2]=S, [1:0]=size.
    S = (index & 4) != 0;
    opcode = 0;
    encoded_size = index & 3;
    if (index & 8)
      encoded_reg = EncodeRegToQuad(Rt);
    else
      encoded_reg = EncodeRegToDouble(Rt);
  }
  else if (size == 16)
  {
    // H lane: index bits [2]=Q, [1]=S, [0]=size<1>.
    S = (index & 2) != 0;
    opcode = 2;
    encoded_size = (index & 1) << 1;
    if (index & 4)
      encoded_reg = EncodeRegToQuad(Rt);
    else
      encoded_reg = EncodeRegToDouble(Rt);
  }
  else if (size == 32)
  {
    // S lane: index bits [1]=Q, [0]=S.
    S = (index & 1) != 0;
    opcode = 4;
    encoded_size = 0;
    if (index & 2)
      encoded_reg = EncodeRegToQuad(Rt);
    else
      encoded_reg = EncodeRegToDouble(Rt);
  }
  else if (size == 64)
  {
    // D lane: only the Q bit distinguishes lane 0 from lane 1.
    S = 0;
    opcode = 4;
    encoded_size = 1;
    if (index == 1)
      encoded_reg = EncodeRegToQuad(Rt);
    else
      encoded_reg = EncodeRegToDouble(Rt);
  }

  EmitLoadStoreSingleStructure(1, 0, opcode, S, encoded_size, encoded_reg, Rn);
}
+
// LD1 (single lane, post-indexed): same lane encoding as the no-offset LD1
// above, with Rm as the register post-increment.
void ARM64FloatEmitter::LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm)
{
  bool S = 0;
  u32 opcode = 0;
  u32 encoded_size = 0;
  ARM64Reg encoded_reg = INVALID_REG;

  if (size == 8)
  {
    S = (index & 4) != 0;
    opcode = 0;
    encoded_size = index & 3;
    if (index & 8)
      encoded_reg = EncodeRegToQuad(Rt);
    else
      encoded_reg = EncodeRegToDouble(Rt);
  }
  else if (size == 16)
  {
    S = (index & 2) != 0;
    opcode = 2;
    encoded_size = (index & 1) << 1;
    if (index & 4)
      encoded_reg = EncodeRegToQuad(Rt);
    else
      encoded_reg = EncodeRegToDouble(Rt);
  }
  else if (size == 32)
  {
    S = (index & 1) != 0;
    opcode = 4;
    encoded_size = 0;
    if (index & 2)
      encoded_reg = EncodeRegToQuad(Rt);
    else
      encoded_reg = EncodeRegToDouble(Rt);
  }
  else if (size == 64)
  {
    S = 0;
    opcode = 4;
    encoded_size = 1;
    if (index == 1)
      encoded_reg = EncodeRegToQuad(Rt);
    else
      encoded_reg = EncodeRegToDouble(Rt);
  }

  EmitLoadStoreSingleStructure(1, 0, opcode, S, encoded_size, encoded_reg, Rn, Rm);
}
+
// LD1R/LD2R: load one element and replicate it to all lanes (of one or two
// registers). size>>4 maps 8/16/32/64 onto the 0..3 size field; the R bit
// distinguishes LD2R from LD1R.
void ARM64FloatEmitter::LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn)
{
  EmitLoadStoreSingleStructure(1, 0, 6, 0, size >> 4, Rt, Rn);
}
void ARM64FloatEmitter::LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn)
{
  EmitLoadStoreSingleStructure(1, 1, 6, 0, size >> 4, Rt, Rn);
}
// Post-indexed variants (Rm is the register post-increment).
void ARM64FloatEmitter::LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm)
{
  EmitLoadStoreSingleStructure(1, 0, 6, 0, size >> 4, Rt, Rn, Rm);
}
void ARM64FloatEmitter::LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm)
{
  EmitLoadStoreSingleStructure(1, 1, 6, 0, size >> 4, Rt, Rn, Rm);
}
+
// ST1 (single lane): store lane `index` of Rt. Identical lane/index encoding
// to LD1 above, with the L bit clear.
void ARM64FloatEmitter::ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn)
{
  bool S = 0;
  u32 opcode = 0;
  u32 encoded_size = 0;
  ARM64Reg encoded_reg = INVALID_REG;

  if (size == 8)
  {
    S = (index & 4) != 0;
    opcode = 0;
    encoded_size = index & 3;
    if (index & 8)
      encoded_reg = EncodeRegToQuad(Rt);
    else
      encoded_reg = EncodeRegToDouble(Rt);
  }
  else if (size == 16)
  {
    S = (index & 2) != 0;
    opcode = 2;
    encoded_size = (index & 1) << 1;
    if (index & 4)
      encoded_reg = EncodeRegToQuad(Rt);
    else
      encoded_reg = EncodeRegToDouble(Rt);
  }
  else if (size == 32)
  {
    S = (index & 1) != 0;
    opcode = 4;
    encoded_size = 0;
    if (index & 2)
      encoded_reg = EncodeRegToQuad(Rt);
    else
      encoded_reg = EncodeRegToDouble(Rt);
  }
  else if (size == 64)
  {
    S = 0;
    opcode = 4;
    encoded_size = 1;
    if (index == 1)
      encoded_reg = EncodeRegToQuad(Rt);
    else
      encoded_reg = EncodeRegToDouble(Rt);
  }

  EmitLoadStoreSingleStructure(0, 0, opcode, S, encoded_size, encoded_reg, Rn);
}
+
// ST1 (single lane, post-indexed): same as above with Rm as post-increment.
void ARM64FloatEmitter::ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm)
{
  bool S = 0;
  u32 opcode = 0;
  u32 encoded_size = 0;
  ARM64Reg encoded_reg = INVALID_REG;

  if (size == 8)
  {
    S = (index & 4) != 0;
    opcode = 0;
    encoded_size = index & 3;
    if (index & 8)
      encoded_reg = EncodeRegToQuad(Rt);
    else
      encoded_reg = EncodeRegToDouble(Rt);
  }
  else if (size == 16)
  {
    S = (index & 2) != 0;
    opcode = 2;
    encoded_size = (index & 1) << 1;
    if (index & 4)
      encoded_reg = EncodeRegToQuad(Rt);
    else
      encoded_reg = EncodeRegToDouble(Rt);
  }
  else if (size == 32)
  {
    S = (index & 1) != 0;
    opcode = 4;
    encoded_size = 0;
    if (index & 2)
      encoded_reg = EncodeRegToQuad(Rt);
    else
      encoded_reg = EncodeRegToDouble(Rt);
  }
  else if (size == 64)
  {
    S = 0;
    opcode = 4;
    encoded_size = 1;
    if (index == 1)
      encoded_reg = EncodeRegToQuad(Rt);
    else
      encoded_reg = EncodeRegToDouble(Rt);
  }

  EmitLoadStoreSingleStructure(0, 0, opcode, S, encoded_size, encoded_reg, Rn, Rm);
}
+
+// Loadstore multiple structure
+void ARM64FloatEmitter::LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn)
+{
+ ASSERT_MSG(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!",
+ __func__);
+ u32 opcode = 0;
+ if (count == 1)
+ opcode = 0b111;
+ else if (count == 2)
+ opcode = 0b1010;
+ else if (count == 3)
+ opcode = 0b0110;
+ else if (count == 4)
+ opcode = 0b0010;
+ EmitLoadStoreMultipleStructure(size, 1, opcode, Rt, Rn);
+}
+void ARM64FloatEmitter::LD1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn,
+ ARM64Reg Rm)
+{
+ ASSERT_MSG(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!",
+ __func__);
+ ASSERT_MSG(DYNA_REC, type == INDEX_POST, "%s only supports post indexing!", __func__);
+
+ u32 opcode = 0;
+ if (count == 1)
+ opcode = 0b111;
+ else if (count == 2)
+ opcode = 0b1010;
+ else if (count == 3)
+ opcode = 0b0110;
+ else if (count == 4)
+ opcode = 0b0010;
+ EmitLoadStoreMultipleStructurePost(size, 1, opcode, Rt, Rn, Rm);
+}
+void ARM64FloatEmitter::ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn)
+{
+ ASSERT_MSG(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!",
+ __func__);
+ u32 opcode = 0;
+ if (count == 1)
+ opcode = 0b111;
+ else if (count == 2)
+ opcode = 0b1010;
+ else if (count == 3)
+ opcode = 0b0110;
+ else if (count == 4)
+ opcode = 0b0010;
+ EmitLoadStoreMultipleStructure(size, 0, opcode, Rt, Rn);
+}
+void ARM64FloatEmitter::ST1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn,
+ ARM64Reg Rm)
+{
+ ASSERT_MSG(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!",
+ __func__);
+ ASSERT_MSG(DYNA_REC, type == INDEX_POST, "%s only supports post indexing!", __func__);
+
+ u32 opcode = 0;
+ if (count == 1)
+ opcode = 0b111;
+ else if (count == 2)
+ opcode = 0b1010;
+ else if (count == 3)
+ opcode = 0b0110;
+ else if (count == 4)
+ opcode = 0b0010;
+ EmitLoadStoreMultipleStructurePost(size, 0, opcode, Rt, Rn, Rm);
+}
+
+// Scalar - 1 Source
+// FMOV between FP registers, or between a GPR and a scalar single.
+// Scalar<->scalar uses the 1-source FP data-processing form; otherwise a
+// FP<->general-purpose move is emitted. Only the 32-bit single cases are
+// implemented; `top` (upper half of a 128-bit register) is unsupported here.
+void ARM64FloatEmitter::FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top)
+{
+  if (IsScalar(Rd) && IsScalar(Rn))
+  {
+    // FMOV (register): opcode 0 in the scalar 1-source group.
+    EmitScalar1Source(0, 0, IsDouble(Rd), 0, Rd, Rn);
+  }
+  else
+  {
+    ASSERT_MSG(DYNA_REC, !IsQuad(Rd) && !IsQuad(Rn), "FMOV can't move to/from quads");
+    int rmode = 0;
+    int opcode = 6;
+    int sf = 0;
+    if (IsSingle(Rd) && !Is64Bit(Rn) && !top)
+    {
+      // GPR to scalar single
+      opcode |= 1;
+    }
+    else if (!Is64Bit(Rd) && IsSingle(Rn) && !top)
+    {
+      // Scalar single to GPR - defaults are correct
+    }
+    else
+    {
+      // TODO: 64-bit and top-half (e.g. FMOV Xd, Vn.D[1]) variants.
+      ASSERT_MSG(DYNA_REC, 0, "FMOV: Unhandled case");
+    }
+    Rd = DecodeReg(Rd);
+    Rn = DecodeReg(Rn);
+    // Floating-point <-> general-purpose register move encoding.
+    Write32((sf << 31) | (0x1e2 << 20) | (rmode << 19) | (opcode << 16) | (Rn << 5) | Rd);
+  }
+}
+
+// Loadstore paired
+void ARM64FloatEmitter::LDP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn,
+ s32 imm)
+{
+ EncodeLoadStorePair(size, true, type, Rt, Rt2, Rn, imm);
+}
+void ARM64FloatEmitter::STP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn,
+ s32 imm)
+{
+ EncodeLoadStorePair(size, false, type, Rt, Rt2, Rn, imm);
+}
+
+// Loadstore register offset
+void ARM64FloatEmitter::STR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
+{
+ EncodeLoadStoreRegisterOffset(size, false, Rt, Rn, Rm);
+}
+void ARM64FloatEmitter::LDR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
+{
+ EncodeLoadStoreRegisterOffset(size, true, Rt, Rn, Rm);
+}
+
+void ARM64FloatEmitter::FABS(ARM64Reg Rd, ARM64Reg Rn)
+{
+ EmitScalar1Source(0, 0, IsDouble(Rd), 1, Rd, Rn);
+}
+void ARM64FloatEmitter::FNEG(ARM64Reg Rd, ARM64Reg Rn)
+{
+ EmitScalar1Source(0, 0, IsDouble(Rd), 2, Rd, Rn);
+}
+void ARM64FloatEmitter::FSQRT(ARM64Reg Rd, ARM64Reg Rn)
+{
+ EmitScalar1Source(0, 0, IsDouble(Rd), 3, Rd, Rn);
+}
+
+// Scalar - 2 Source
+void ARM64FloatEmitter::FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitScalar2Source(0, 0, IsDouble(Rd), 2, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitScalar2Source(0, 0, IsDouble(Rd), 0, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitScalar2Source(0, 0, IsDouble(Rd), 3, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitScalar2Source(0, 0, IsDouble(Rd), 1, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FMAX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitScalar2Source(0, 0, IsDouble(Rd), 4, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FMIN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitScalar2Source(0, 0, IsDouble(Rd), 5, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FMAXNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitScalar2Source(0, 0, IsDouble(Rd), 6, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FMINNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitScalar2Source(0, 0, IsDouble(Rd), 7, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FNMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitScalar2Source(0, 0, IsDouble(Rd), 8, Rd, Rn, Rm);
+}
+
+void ARM64FloatEmitter::FMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra)
+{
+ EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 0);
+}
+void ARM64FloatEmitter::FMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra)
+{
+ EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 1);
+}
+void ARM64FloatEmitter::FNMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra)
+{
+ EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 2);
+}
+void ARM64FloatEmitter::FNMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra)
+{
+ EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 3);
+}
+
+// Encodes the scalar FP data-processing (3 source) group used by
+// FMADD/FMSUB/FNMADD/FNMSUB. `opcode` bit 1 maps to o1 and bit 0 to o0
+// in the instruction word.
+void ARM64FloatEmitter::EmitScalar3Source(bool isDouble, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm,
+                                          ARM64Reg Ra, int opcode)
+{
+  int type = isDouble ? 1 : 0;  // FP "type" field: 0 = single, 1 = double
+  Rd = DecodeReg(Rd);
+  Rn = DecodeReg(Rn);
+  Rm = DecodeReg(Rm);
+  Ra = DecodeReg(Ra);
+  int o1 = opcode >> 1;
+  int o0 = opcode & 1;
+  m_emit->Write32((0x1F << 24) | (type << 22) | (o1 << 21) | (Rm << 16) | (o0 << 15) | (Ra << 10) |
+                  (Rn << 5) | Rd);
+}
+
+// Scalar floating point immediate
+void ARM64FloatEmitter::FMOV(ARM64Reg Rd, uint8_t imm8)
+{
+ EmitScalarImm(0, 0, 0, 0, Rd, imm8);
+}
+
+// Vector
+void ARM64FloatEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitThreeSame(0, 0, 3, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitThreeSame(1, 1, 3, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index)
+{
+ u32 imm5 = 0;
+
+ if (size == 8)
+ {
+ imm5 = 1;
+ imm5 |= index << 1;
+ }
+ else if (size == 16)
+ {
+ imm5 = 2;
+ imm5 |= index << 2;
+ }
+ else if (size == 32)
+ {
+ imm5 = 4;
+ imm5 |= index << 3;
+ }
+ else if (size == 64)
+ {
+ imm5 = 8;
+ imm5 |= index << 4;
+ }
+
+ EmitCopy(IsQuad(Rd), 0, imm5, 0, Rd, Rn);
+}
+void ARM64FloatEmitter::FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xF, Rd, Rn);
+}
+void ARM64FloatEmitter::FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitThreeSame(0, size >> 6, 0x1A, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitThreeSame(0, size >> 6, 0b11110, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitThreeSame(0, size >> 6, 0x19, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FMIN(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitThreeSame(0, 2 | size >> 6, 0b11110, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(false, 0, size >> 6, 0x17, Rd, Rn);
+}
+void ARM64FloatEmitter::FCVTL2(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(true, 0, size >> 6, 0x17, Rd, Rn);
+}
+void ARM64FloatEmitter::FCVTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 0, dest_size >> 5, 0x16, Rd, Rn);
+}
+void ARM64FloatEmitter::FCVTZS(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0x1B, Rd, Rn);
+}
+void ARM64FloatEmitter::FCVTZU(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0x1B, Rd, Rn);
+}
+void ARM64FloatEmitter::FDIV(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitThreeSame(1, size >> 6, 0x1F, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitThreeSame(1, size >> 6, 0x1B, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FNEG(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0xF, Rd, Rn);
+}
+void ARM64FloatEmitter::FRECPE(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0x1D, Rd, Rn);
+}
+void ARM64FloatEmitter::FRSQRTE(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0x1D, Rd, Rn);
+}
+void ARM64FloatEmitter::FSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitThreeSame(0, 2 | (size >> 6), 0x1A, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FMLS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitThreeSame(0, 2 | (size >> 6), 0x19, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::NOT(ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 1, 0, 5, Rd, Rn);
+}
+void ARM64FloatEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitThreeSame(0, 2, 3, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 0, size >> 4, 1, Rd, Rn);
+}
+void ARM64FloatEmitter::REV32(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 1, size >> 4, 0, Rd, Rn);
+}
+void ARM64FloatEmitter::REV64(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 0, size >> 4, 0, Rd, Rn);
+}
+void ARM64FloatEmitter::SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 0, size >> 6, 0x1D, Rd, Rn);
+}
+void ARM64FloatEmitter::UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 1, size >> 6, 0x1D, Rd, Rn);
+}
+void ARM64FloatEmitter::SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale)
+{
+ int imm = size * 2 - scale;
+ EmitShiftImm(IsQuad(Rd), 0, imm >> 3, imm & 7, 0x1C, Rd, Rn);
+}
+void ARM64FloatEmitter::UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale)
+{
+ int imm = size * 2 - scale;
+ EmitShiftImm(IsQuad(Rd), 1, imm >> 3, imm & 7, 0x1C, Rd, Rn);
+}
+void ARM64FloatEmitter::SQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(false, 0, dest_size >> 4, 0b10100, Rd, Rn);
+}
+void ARM64FloatEmitter::SQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(true, 0, dest_size >> 4, 0b10100, Rd, Rn);
+}
+void ARM64FloatEmitter::UQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(false, 1, dest_size >> 4, 0b10100, Rd, Rn);
+}
+void ARM64FloatEmitter::UQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(true, 1, dest_size >> 4, 0b10100, Rd, Rn);
+}
+void ARM64FloatEmitter::XTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(false, 0, dest_size >> 4, 0b10010, Rd, Rn);
+}
+void ARM64FloatEmitter::XTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(true, 0, dest_size >> 4, 0b10010, Rd, Rn);
+}
+
+// Move
+void ARM64FloatEmitter::DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ u32 imm5 = 0;
+
+ if (size == 8)
+ imm5 = 1;
+ else if (size == 16)
+ imm5 = 2;
+ else if (size == 32)
+ imm5 = 4;
+ else if (size == 64)
+ imm5 = 8;
+
+ EmitCopy(IsQuad(Rd), 0, imm5, 1, Rd, Rn);
+}
+void ARM64FloatEmitter::INS(u8 size, ARM64Reg Rd, u8 index, ARM64Reg Rn)
+{
+ u32 imm5 = 0;
+
+ if (size == 8)
+ {
+ imm5 = 1;
+ imm5 |= index << 1;
+ }
+ else if (size == 16)
+ {
+ imm5 = 2;
+ imm5 |= index << 2;
+ }
+ else if (size == 32)
+ {
+ imm5 = 4;
+ imm5 |= index << 3;
+ }
+ else if (size == 64)
+ {
+ imm5 = 8;
+ imm5 |= index << 4;
+ }
+
+ EmitCopy(1, 0, imm5, 3, Rd, Rn);
+}
+void ARM64FloatEmitter::INS(u8 size, ARM64Reg Rd, u8 index1, ARM64Reg Rn, u8 index2)
+{
+ u32 imm5 = 0, imm4 = 0;
+
+ if (size == 8)
+ {
+ imm5 = 1;
+ imm5 |= index1 << 1;
+ imm4 = index2;
+ }
+ else if (size == 16)
+ {
+ imm5 = 2;
+ imm5 |= index1 << 2;
+ imm4 = index2 << 1;
+ }
+ else if (size == 32)
+ {
+ imm5 = 4;
+ imm5 |= index1 << 3;
+ imm4 = index2 << 2;
+ }
+ else if (size == 64)
+ {
+ imm5 = 8;
+ imm5 |= index1 << 4;
+ imm4 = index2 << 3;
+ }
+
+ EmitCopy(1, 1, imm5, imm4, Rd, Rn);
+}
+
+void ARM64FloatEmitter::UMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index)
+{
+ bool b64Bit = Is64Bit(Rd);
+ ASSERT_MSG(DYNA_REC, Rd < SP, "%s destination must be a GPR!", __func__);
+ ASSERT_MSG(DYNA_REC, !(b64Bit && size != 64),
+ "%s must have a size of 64 when destination is 64bit!", __func__);
+ u32 imm5 = 0;
+
+ if (size == 8)
+ {
+ imm5 = 1;
+ imm5 |= index << 1;
+ }
+ else if (size == 16)
+ {
+ imm5 = 2;
+ imm5 |= index << 2;
+ }
+ else if (size == 32)
+ {
+ imm5 = 4;
+ imm5 |= index << 3;
+ }
+ else if (size == 64)
+ {
+ imm5 = 8;
+ imm5 |= index << 4;
+ }
+
+ EmitCopy(b64Bit, 0, imm5, 7, Rd, Rn);
+}
+void ARM64FloatEmitter::SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index)
+{
+ bool b64Bit = Is64Bit(Rd);
+ ASSERT_MSG(DYNA_REC, Rd < SP, "%s destination must be a GPR!", __func__);
+ ASSERT_MSG(DYNA_REC, size != 64, "%s doesn't support 64bit destination. Use UMOV!", __func__);
+ u32 imm5 = 0;
+
+ if (size == 8)
+ {
+ imm5 = 1;
+ imm5 |= index << 1;
+ }
+ else if (size == 16)
+ {
+ imm5 = 2;
+ imm5 |= index << 2;
+ }
+ else if (size == 32)
+ {
+ imm5 = 4;
+ imm5 |= index << 3;
+ }
+
+ EmitCopy(b64Bit, 0, imm5, 5, Rd, Rn);
+}
+
+// One source
+void ARM64FloatEmitter::FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn)
+{
+ u32 dst_encoding = 0;
+ u32 src_encoding = 0;
+
+ if (size_to == 16)
+ dst_encoding = 3;
+ else if (size_to == 32)
+ dst_encoding = 0;
+ else if (size_to == 64)
+ dst_encoding = 1;
+
+ if (size_from == 16)
+ src_encoding = 3;
+ else if (size_from == 32)
+ src_encoding = 0;
+ else if (size_from == 64)
+ src_encoding = 1;
+
+ Emit1Source(0, 0, src_encoding, 4 | dst_encoding, Rd, Rn);
+}
+
+void ARM64FloatEmitter::SCVTF(ARM64Reg Rd, ARM64Reg Rn)
+{
+ if (IsScalar(Rn))
+ {
+ // Source is in FP register (like destination!). We must use a vector encoding.
+ bool sign = false;
+ Rd = DecodeReg(Rd);
+ Rn = DecodeReg(Rn);
+ int sz = IsDouble(Rn);
+ Write32((0x5e << 24) | (sign << 29) | (sz << 22) | (0x876 << 10) | (Rn << 5) | Rd);
+ }
+ else
+ {
+ bool sf = Is64Bit(Rn);
+ u32 type = 0;
+ if (IsDouble(Rd))
+ type = 1;
+ EmitConversion(sf, 0, type, 0, 2, Rd, Rn);
+ }
+}
+
+void ARM64FloatEmitter::UCVTF(ARM64Reg Rd, ARM64Reg Rn)
+{
+ if (IsScalar(Rn))
+ {
+ // Source is in FP register (like destination!). We must use a vector encoding.
+ bool sign = true;
+ Rd = DecodeReg(Rd);
+ Rn = DecodeReg(Rn);
+ int sz = IsDouble(Rn);
+ Write32((0x5e << 24) | (sign << 29) | (sz << 22) | (0x876 << 10) | (Rn << 5) | Rd);
+ }
+ else
+ {
+ bool sf = Is64Bit(Rn);
+ u32 type = 0;
+ if (IsDouble(Rd))
+ type = 1;
+
+ EmitConversion(sf, 0, type, 0, 3, Rd, Rn);
+ }
+}
+
+void ARM64FloatEmitter::SCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale)
+{
+ bool sf = Is64Bit(Rn);
+ u32 type = 0;
+ if (IsDouble(Rd))
+ type = 1;
+
+ EmitConversion2(sf, 0, false, type, 0, 2, 64 - scale, Rd, Rn);
+}
+
+void ARM64FloatEmitter::UCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale)
+{
+ bool sf = Is64Bit(Rn);
+ u32 type = 0;
+ if (IsDouble(Rd))
+ type = 1;
+
+ EmitConversion2(sf, 0, false, type, 0, 3, 64 - scale, Rd, Rn);
+}
+
+void ARM64FloatEmitter::FCMP(ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitCompare(0, 0, 0, 0, Rn, Rm);
+}
+void ARM64FloatEmitter::FCMP(ARM64Reg Rn)
+{
+ EmitCompare(0, 0, 0, 8, Rn, (ARM64Reg)0);
+}
+void ARM64FloatEmitter::FCMPE(ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitCompare(0, 0, 0, 0x10, Rn, Rm);
+}
+void ARM64FloatEmitter::FCMPE(ARM64Reg Rn)
+{
+ EmitCompare(0, 0, 0, 0x18, Rn, (ARM64Reg)0);
+}
+void ARM64FloatEmitter::FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitThreeSame(0, size >> 6, 0x1C, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xD, Rd, Rn);
+}
+void ARM64FloatEmitter::FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitThreeSame(1, size >> 6, 0x1C, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0x0C, Rd, Rn);
+}
+void ARM64FloatEmitter::FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitThreeSame(1, 2 | (size >> 6), 0x1C, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0x0C, Rd, Rn);
+}
+void ARM64FloatEmitter::FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0xD, Rd, Rn);
+}
+void ARM64FloatEmitter::FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xE, Rd, Rn);
+}
+
+void ARM64FloatEmitter::FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond)
+{
+ EmitCondSelect(0, 0, cond, Rd, Rn, Rm);
+}
+
+// Permute
+void ARM64FloatEmitter::UZP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitPermute(size, 0b001, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::TRN1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitPermute(size, 0b010, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::ZIP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitPermute(size, 0b011, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::UZP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitPermute(size, 0b101, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::TRN2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitPermute(size, 0b110, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitPermute(size, 0b111, Rd, Rn, Rm);
+}
+
+// Shift by immediate
+void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
+{
+ SSHLL(src_size, Rd, Rn, shift, false);
+}
+void ARM64FloatEmitter::SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
+{
+ SSHLL(src_size, Rd, Rn, shift, true);
+}
+void ARM64FloatEmitter::SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
+{
+ SHRN(dest_size, Rd, Rn, shift, false);
+}
+void ARM64FloatEmitter::SHRN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
+{
+ SHRN(dest_size, Rd, Rn, shift, true);
+}
+void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
+{
+ USHLL(src_size, Rd, Rn, shift, false);
+}
+void ARM64FloatEmitter::USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
+{
+ USHLL(src_size, Rd, Rn, shift, true);
+}
+void ARM64FloatEmitter::SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ SXTL(src_size, Rd, Rn, false);
+}
+void ARM64FloatEmitter::SXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ SXTL(src_size, Rd, Rn, true);
+}
+void ARM64FloatEmitter::UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ UXTL(src_size, Rd, Rn, false);
+}
+void ARM64FloatEmitter::UXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ UXTL(src_size, Rd, Rn, true);
+}
+
+// Signed Shift Left Long (SSHLL/SSHLL2): widens each src_size-bit element
+// to 2*src_size bits, shifting left by `shift`. `upper` selects SSHLL2
+// (operates on the upper half of the source vector).
+// Left shifts encode immh:immb = esize + shift, so immb carries the low
+// 3 bits of the shift and the remainder folds into immh.
+void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper)
+{
+  ASSERT_MSG(DYNA_REC, shift < src_size, "%s shift amount must be less than the element size!",
+             __func__);
+  u32 immh = 0;
+  u32 immb = shift & 7;  // immb is a 3-bit field; higher shift bits go into immh below
+
+  if (src_size == 8)
+  {
+    immh = 1;
+  }
+  else if (src_size == 16)
+  {
+    immh = 2 | ((shift >> 3) & 1);
+  }
+  else if (src_size == 32)
+  {
+    immh = 4 | ((shift >> 3) & 3);
+  }
+  EmitShiftImm(upper, 0, immh, immb, 0b10100, Rd, Rn);
+}
+
+// Unsigned Shift Left Long (USHLL/USHLL2): widens each src_size-bit
+// element to 2*src_size bits, shifting left by `shift`. `upper` selects
+// USHLL2. Same immh:immb = esize + shift encoding as SSHLL, with U = 1.
+void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper)
+{
+  ASSERT_MSG(DYNA_REC, shift < src_size, "%s shift amount must be less than the element size!",
+             __func__);
+  u32 immh = 0;
+  u32 immb = shift & 7;  // immb is a 3-bit field; higher shift bits go into immh below
+
+  if (src_size == 8)
+  {
+    immh = 1;
+  }
+  else if (src_size == 16)
+  {
+    immh = 2 | ((shift >> 3) & 1);
+  }
+  else if (src_size == 32)
+  {
+    immh = 4 | ((shift >> 3) & 3);
+  }
+  EmitShiftImm(upper, 1, immh, immb, 0b10100, Rd, Rn);
+}
+
+// Shift Right Narrow (SHRN/SHRN2): shifts each 2*dest_size-bit source
+// element right by `shift` and narrows it to dest_size bits. `upper`
+// selects SHRN2 (writes the upper half of Rd).
+// Fixes vs. the previous version:
+//  - U bit must be 0 for SHRN; U = 1 with opcode 0b10000 encodes SQSHRUN
+//    (compare SSHLL (U=0) / USHLL (U=1) above).
+//  - Narrowing right shifts encode immh:immb = 2*esize - shift (ARMv8 ARM,
+//    "Advanced SIMD shift by immediate"), not esize + shift as left shifts do.
+void ARM64FloatEmitter::SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper)
+{
+  ASSERT_MSG(DYNA_REC, shift > 0 && shift <= dest_size,
+             "%s shift amount must be in [1, element size]!", __func__);
+  u32 imm = (dest_size << 1) - shift;  // immh:immb, dest_size in {8, 16, 32}
+  u32 immh = imm >> 3;
+  u32 immb = imm & 7;
+  EmitShiftImm(upper, 0, immh, immb, 0b10000, Rd, Rn);
+}
+
+void ARM64FloatEmitter::SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper)
+{
+ SSHLL(src_size, Rd, Rn, 0, upper);
+}
+
+void ARM64FloatEmitter::UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper)
+{
+ USHLL(src_size, Rd, Rn, 0, upper);
+}
+
+// vector x indexed element
+void ARM64FloatEmitter::FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index)
+{
+ ASSERT_MSG(DYNA_REC, size == 32 || size == 64, "%s only supports 32bit or 64bit size!", __func__);
+
+ bool L = false;
+ bool H = false;
+ if (size == 32)
+ {
+ L = index & 1;
+ H = (index >> 1) & 1;
+ }
+ else if (size == 64)
+ {
+ H = index == 1;
+ }
+
+ EmitVectorxElement(0, 2 | (size >> 6), L, 0x9, H, Rd, Rn, Rm);
+}
+
+void ARM64FloatEmitter::FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index)
+{
+ ASSERT_MSG(DYNA_REC, size == 32 || size == 64, "%s only supports 32bit or 64bit size!", __func__);
+
+ bool L = false;
+ bool H = false;
+ if (size == 32)
+ {
+ L = index & 1;
+ H = (index >> 1) & 1;
+ }
+ else if (size == 64)
+ {
+ H = index == 1;
+ }
+
+ EmitVectorxElement(0, 2 | (size >> 6), L, 1, H, Rd, Rn, Rm);
+}
+
+// Modified Immediate
+// Vector move immediate (MOVI). Builds the modified-immediate fields
+// (op, cmode, abcdefgh) for the given element size and shift and emits
+// the instruction. For size 64 the immediate must be a per-byte 0x00/0xFF
+// mask, which is packed into the 8 abcdefgh bits.
+// NOTE(review): the size16 path leaves cmode in the 32-bit shifted-imm
+// range (16-bit MOVI uses cmode 10x0 per the ARM ARM) - verify before
+// relying on the 16-bit form.
+void ARM64FloatEmitter::MOVI(u8 size, ARM64Reg Rd, u64 imm, u8 shift)
+{
+  bool Q = IsQuad(Rd);
+  u8 cmode = 0;
+  u8 op = 0;
+  u8 abcdefgh = imm & 0xFF;
+  if (size == 8)
+  {
+    ASSERT_MSG(DYNA_REC, shift == 0, "%s(size8) doesn't support shift!", __func__);
+    ASSERT_MSG(DYNA_REC, !(imm & ~0xFFULL), "%s(size8) only supports 8bit values!", __func__);
+  }
+  else if (size == 16)
+  {
+    ASSERT_MSG(DYNA_REC, shift == 0 || shift == 8, "%s(size16) only supports shift of {0, 8}!",
+               __func__);
+    ASSERT_MSG(DYNA_REC, !(imm & ~0xFFULL), "%s(size16) only supports 8bit values!", __func__);
+
+    if (shift == 8)
+      cmode |= 2;
+  }
+  else if (size == 32)
+  {
+    ASSERT_MSG(DYNA_REC, shift == 0 || shift == 8 || shift == 16 || shift == 24,
+               "%s(size32) only supports shift of {0, 8, 16, 24}!", __func__);
+    // XXX: Implement support for MOVI - shifting ones variant
+    ASSERT_MSG(DYNA_REC, !(imm & ~0xFFULL), "%s(size32) only supports 8bit values!", __func__);
+    // cmode bits 2:1 select the byte lane the immediate is shifted into.
+    switch (shift)
+    {
+    case 8:
+      cmode |= 2;
+      break;
+    case 16:
+      cmode |= 4;
+      break;
+    case 24:
+      cmode |= 6;
+      break;
+    default:
+      break;
+    }
+  }
+  else  // 64
+  {
+    ASSERT_MSG(DYNA_REC, shift == 0, "%s(size64) doesn't support shift!", __func__);
+
+    op = 1;
+    cmode = 0xE;
+    abcdefgh = 0;
+    // Each abcdefgh bit expands to one all-ones (0xFF) or all-zeros byte.
+    for (int i = 0; i < 8; ++i)
+    {
+      u8 tmp = (imm >> (i << 3)) & 0xFF;
+      ASSERT_MSG(DYNA_REC, tmp == 0xFF || tmp == 0, "%s(size64) Invalid immediate!", __func__);
+      if (tmp == 0xFF)
+        abcdefgh |= (1 << i);
+    }
+  }
+  EncodeModImm(Q, op, cmode, 0, Rd, abcdefgh);
+}
+
+void ARM64FloatEmitter::BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift)
+{
+ bool Q = IsQuad(Rd);
+ u8 cmode = 1;
+ u8 op = 1;
+ if (size == 16)
+ {
+ ASSERT_MSG(DYNA_REC, shift == 0 || shift == 8, "%s(size16) only supports shift of {0, 8}!",
+ __func__);
+
+ if (shift == 8)
+ cmode |= 2;
+ }
+ else if (size == 32)
+ {
+ ASSERT_MSG(DYNA_REC, shift == 0 || shift == 8 || shift == 16 || shift == 24,
+ "%s(size32) only supports shift of {0, 8, 16, 24}!", __func__);
+ // XXX: Implement support for MOVI - shifting ones variant
+ switch (shift)
+ {
+ case 8:
+ cmode |= 2;
+ break;
+ case 16:
+ cmode |= 4;
+ break;
+ case 24:
+ cmode |= 6;
+ break;
+ default:
+ break;
+ }
+ }
+ else
+ {
+ ASSERT_MSG(DYNA_REC, false, "%s only supports size of {16, 32}!", __func__);
+ }
+ EncodeModImm(Q, op, cmode, 0, Rd, imm);
+}
+
+// Spills the given Q registers to the stack (one 16-byte slot each).
+// When runs of 2-4 consecutive registers exist and `tmp` is available,
+// the space is reserved in one SUB and the runs are stored with bundled
+// post-indexed ST1 through `tmp`; isolated ("island") registers are then
+// stored pairwise. Otherwise, registers are pushed with pre-indexed
+// STP/STR directly on SP. Must be mirrored exactly by ABI_PopRegisters.
+void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp)
+{
+  bool bundled_loadstore = false;
+
+  // First pass: detect whether any run of 2-4 consecutive registers exists.
+  for (int i = 0; i < 32; ++i)
+  {
+    if (!registers[i])
+      continue;
+
+    int count = 0;
+    while (++count < 4 && (i + count) < 32 && registers[i + count])
+    {
+    }
+    if (count > 1)
+    {
+      bundled_loadstore = true;
+      break;
+    }
+  }
+
+  if (bundled_loadstore && tmp != INVALID_REG)
+  {
+    // Reserve all slots up front, then walk upwards through them via tmp.
+    int num_regs = registers.Count();
+    m_emit->SUB(SP, SP, num_regs * 16);
+    m_emit->ADD(tmp, SP, 0);
+    std::vector<ARM64Reg> island_regs;
+    for (int i = 0; i < 32; ++i)
+    {
+      if (!registers[i])
+        continue;
+
+      int count = 0;
+
+      // 0 = true
+      // 1 < 4 && registers[i + 1] true!
+      // 2 < 4 && registers[i + 2] true!
+      // 3 < 4 && registers[i + 3] true!
+      // 4 < 4 && registers[i + 4] false!
+      while (++count < 4 && (i + count) < 32 && registers[i + count])
+      {
+      }
+
+      if (count == 1)
+        island_regs.push_back((ARM64Reg)(Q0 + i));  // isolated reg: stored below
+      else
+        ST1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), tmp);  // store the whole run
+
+      i += count - 1;  // skip past the registers just handled
+    }
+
+    // Handle island registers
+    std::vector<ARM64Reg> pair_regs;
+    for (auto& it : island_regs)
+    {
+      pair_regs.push_back(it);
+      if (pair_regs.size() == 2)
+      {
+        STP(128, INDEX_POST, pair_regs[0], pair_regs[1], tmp, 32);
+        pair_regs.clear();
+      }
+    }
+    if (pair_regs.size())
+      STR(128, INDEX_POST, pair_regs[0], tmp, 16);
+  }
+  else
+  {
+    // Simple path: pre-indexed pairwise pushes straight on SP,
+    // odd remainder via a single pre-indexed STR.
+    std::vector<ARM64Reg> pair_regs;
+    for (auto it : registers)
+    {
+      pair_regs.push_back((ARM64Reg)(Q0 + it));
+      if (pair_regs.size() == 2)
+      {
+        STP(128, INDEX_PRE, pair_regs[0], pair_regs[1], SP, -32);
+        pair_regs.clear();
+      }
+    }
+    if (pair_regs.size())
+      STR(128, INDEX_PRE, pair_regs[0], SP, -16);
+  }
+}
+// Restores Q registers previously saved by ABI_PushRegisters; the
+// register set and `tmp` availability must match the push exactly, since
+// they select the same (bundled vs. pairwise) layout. The bundled path
+// walks upwards with post-indexed loads; the simple path pops in reverse
+// register order to undo the pre-indexed pushes.
+void ARM64FloatEmitter::ABI_PopRegisters(BitSet32 registers, ARM64Reg tmp)
+{
+  bool bundled_loadstore = false;
+  int num_regs = registers.Count();
+
+  // Same run detection as in ABI_PushRegisters.
+  for (int i = 0; i < 32; ++i)
+  {
+    if (!registers[i])
+      continue;
+
+    int count = 0;
+    while (++count < 4 && (i + count) < 32 && registers[i + count])
+    {
+    }
+    if (count > 1)
+    {
+      bundled_loadstore = true;
+      break;
+    }
+  }
+
+  if (bundled_loadstore && tmp != INVALID_REG)
+  {
+    // The temporary register is only used to indicate that we can use this code path
+    std::vector<ARM64Reg> island_regs;
+    for (int i = 0; i < 32; ++i)
+    {
+      if (!registers[i])
+        continue;
+
+      int count = 0;
+      while (++count < 4 && (i + count) < 32 && registers[i + count])
+      {
+      }
+
+      if (count == 1)
+        island_regs.push_back((ARM64Reg)(Q0 + i));  // isolated reg: loaded below
+      else
+        LD1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), SP);  // reload the whole run
+
+      i += count - 1;  // skip past the registers just handled
+    }
+
+    // Handle island registers
+    std::vector<ARM64Reg> pair_regs;
+    for (auto& it : island_regs)
+    {
+      pair_regs.push_back(it);
+      if (pair_regs.size() == 2)
+      {
+        LDP(128, INDEX_POST, pair_regs[0], pair_regs[1], SP, 32);
+        pair_regs.clear();
+      }
+    }
+    if (pair_regs.size())
+      LDR(128, INDEX_POST, pair_regs[0], SP, 16);
+  }
+  else
+  {
+    bool odd = num_regs % 2;
+    std::vector<ARM64Reg> pair_regs;
+    // Iterate downwards: the push side stored upwards with pre-indexing.
+    for (int i = 31; i >= 0; --i)
+    {
+      if (!registers[i])
+        continue;
+
+      if (odd)
+      {
+        // First load must be a regular LDR if odd
+        odd = false;
+        LDR(128, INDEX_POST, (ARM64Reg)(Q0 + i), SP, 16);
+      }
+      else
+      {
+        pair_regs.push_back((ARM64Reg)(Q0 + i));
+        if (pair_regs.size() == 2)
+        {
+          // Operand order is swapped relative to the STP in the push.
+          LDP(128, INDEX_POST, pair_regs[1], pair_regs[0], SP, 32);
+          pair_regs.clear();
+        }
+      }
+    }
+  }
+}
+
+void ARM64XEmitter::ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch)
+{
+ unsigned int n, imm_s, imm_r;
+ if (!Is64Bit(Rn))
+ imm &= 0xFFFFFFFF;
+ if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r))
+ {
+ AND(Rd, Rn, imm_r, imm_s, n != 0);
+ }
+ else
+ {
+ ASSERT_MSG(DYNA_REC, scratch != INVALID_REG,
+ "ANDI2R - failed to construct logical immediate value from %08x, need scratch",
+ (u32)imm);
+ MOVI2R(scratch, imm);
+ AND(Rd, Rn, scratch);
+ }
+}
+
+void ARM64XEmitter::ORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch)
+{
+ unsigned int n, imm_s, imm_r;
+ if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r))
+ {
+ ORR(Rd, Rn, imm_r, imm_s, n != 0);
+ }
+ else
+ {
+ ASSERT_MSG(DYNA_REC, scratch != INVALID_REG,
+ "ORRI2R - failed to construct logical immediate value from %08x, need scratch",
+ (u32)imm);
+ MOVI2R(scratch, imm);
+ ORR(Rd, Rn, scratch);
+ }
+}
+
+void ARM64XEmitter::EORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch)
+{
+ unsigned int n, imm_s, imm_r;
+ if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r))
+ {
+ EOR(Rd, Rn, imm_r, imm_s, n != 0);
+ }
+ else
+ {
+ ASSERT_MSG(DYNA_REC, scratch != INVALID_REG,
+ "EORI2R - failed to construct logical immediate value from %08x, need scratch",
+ (u32)imm);
+ MOVI2R(scratch, imm);
+ EOR(Rd, Rn, scratch);
+ }
+}
+
+void ARM64XEmitter::ANDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch)
+{
+ unsigned int n, imm_s, imm_r;
+ if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r))
+ {
+ ANDS(Rd, Rn, imm_r, imm_s, n != 0);
+ }
+ else
+ {
+ ASSERT_MSG(DYNA_REC, scratch != INVALID_REG,
+ "ANDSI2R - failed to construct logical immediate value from %08x, need scratch",
+ (u32)imm);
+ MOVI2R(scratch, imm);
+ ANDS(Rd, Rn, scratch);
+ }
+}
+
+// Emits one ADD/ADDS/SUB/SUBS with a 12-bit immediate (optionally shifted
+// left by 12 when `shift` is set). `negative` selects the SUB forms,
+// `flags` selects the flag-setting (S) forms.
+void ARM64XEmitter::AddImmediate(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool shift, bool negative,
+                                 bool flags)
+{
+  switch ((negative << 1) | flags)
+  {
+  case 0:
+    ADD(Rd, Rn, imm, shift);
+    break;
+  case 1:
+    ADDS(Rd, Rn, imm, shift);
+    break;
+  case 2:
+    SUB(Rd, Rn, imm, shift);
+    break;
+  case 3:
+    SUBS(Rd, Rn, imm, shift);
+    break;
+  }
+}
+
+// Materializes Rd = Rn +/- imm, picking the cheapest encoding:
+//  1) a single ADD/SUB with a 12-bit (optionally LSL#12-shifted)
+//     immediate, using either imm or its negation with the opposite op;
+//  2) two chained ADD/SUBs covering 24 bits - disallowed when `flags` is
+//     set, since the split would compute flags on the intermediate value;
+//  3) fallback: build the immediate in `scratch` via MOVI2R2 and use the
+//     register form.
+void ARM64XEmitter::ADDI2R_internal(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool negative, bool flags,
+                                    ARM64Reg scratch)
+{
+  bool has_scratch = scratch != INVALID_REG;
+  u64 imm_neg = Is64Bit(Rd) ? -imm : -imm & 0xFFFFFFFFuLL;  // negated imm, truncated for W regs
+  bool neg_neg = negative ? false : true;  // op to pair with the negated immediate
+
+  // Fast paths, aarch64 immediate instructions
+  // Try them all first
+  if (imm <= 0xFFF)
+  {
+    AddImmediate(Rd, Rn, imm, false, negative, flags);
+    return;
+  }
+  if (imm <= 0xFFFFFF && (imm & 0xFFF) == 0)
+  {
+    AddImmediate(Rd, Rn, imm >> 12, true, negative, flags);
+    return;
+  }
+  if (imm_neg <= 0xFFF)
+  {
+    AddImmediate(Rd, Rn, imm_neg, false, neg_neg, flags);
+    return;
+  }
+  if (imm_neg <= 0xFFFFFF && (imm_neg & 0xFFF) == 0)
+  {
+    AddImmediate(Rd, Rn, imm_neg >> 12, true, neg_neg, flags);
+    return;
+  }
+
+  // ADD+ADD is slower than MOVK+ADD, but inplace.
+  // But it supports a few more bits, so use it to avoid MOVK+MOVK+ADD.
+  // As this splits the addition in two parts, this must not be done on setting flags.
+  if (!flags && (imm >= 0x10000u || !has_scratch) && imm < 0x1000000u)
+  {
+    AddImmediate(Rd, Rn, imm & 0xFFF, false, negative, false);
+    AddImmediate(Rd, Rd, imm >> 12, true, negative, false);
+    return;
+  }
+  if (!flags && (imm_neg >= 0x10000u || !has_scratch) && imm_neg < 0x1000000u)
+  {
+    AddImmediate(Rd, Rn, imm_neg & 0xFFF, false, neg_neg, false);
+    AddImmediate(Rd, Rd, imm_neg >> 12, true, neg_neg, false);
+    return;
+  }
+
+  ASSERT_MSG(DYNA_REC, has_scratch,
+             "ADDI2R - failed to construct arithmetic immediate value from %08x, need scratch",
+             (u32)imm);
+
+  // MOVI2R2 may load the negated value if it is cheaper; its return value
+  // says which one was loaded, so flip the operation accordingly.
+  negative ^= MOVI2R2(scratch, imm, imm_neg);
+  switch ((negative << 1) | flags)
+  {
+  case 0:
+    ADD(Rd, Rn, scratch);
+    break;
+  case 1:
+    ADDS(Rd, Rn, scratch);
+    break;
+  case 2:
+    SUB(Rd, Rn, scratch);
+    break;
+  case 3:
+    SUBS(Rd, Rn, scratch);
+    break;
+  }
+}
+
+void ARM64XEmitter::ADDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch)
+{
+ ADDI2R_internal(Rd, Rn, imm, false, false, scratch);
+}
+
+void ARM64XEmitter::ADDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch)
+{
+ ADDI2R_internal(Rd, Rn, imm, false, true, scratch);
+}
+
+void ARM64XEmitter::SUBI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch)
+{
+ ADDI2R_internal(Rd, Rn, imm, true, false, scratch);
+}
+
+void ARM64XEmitter::SUBSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch)
+{
+ ADDI2R_internal(Rd, Rn, imm, true, true, scratch);
+}
+
+void ARM64XEmitter::CMPI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch)
+{
+ ADDI2R_internal(Is64Bit(Rn) ? ZR : WZR, Rn, imm, true, true, scratch);
+}
+
+bool ARM64XEmitter::TryADDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm)
+{
+ u32 val;
+ bool shift;
+ if (IsImmArithmetic(imm, &val, &shift))
+ ADD(Rd, Rn, val, shift);
+ else
+ return false;
+
+ return true;
+}
+
+bool ARM64XEmitter::TrySUBI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm)
+{
+ u32 val;
+ bool shift;
+ if (IsImmArithmetic(imm, &val, &shift))
+ SUB(Rd, Rn, val, shift);
+ else
+ return false;
+
+ return true;
+}
+
+bool ARM64XEmitter::TryCMPI2R(ARM64Reg Rn, u32 imm)
+{
+ u32 val;
+ bool shift;
+ if (IsImmArithmetic(imm, &val, &shift))
+ CMP(Rn, val, shift);
+ else
+ return false;
+
+ return true;
+}
+
+bool ARM64XEmitter::TryANDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm)
+{
+ u32 n, imm_r, imm_s;
+ if (IsImmLogical(imm, 32, &n, &imm_s, &imm_r))
+ AND(Rd, Rn, imm_r, imm_s, n != 0);
+ else
+ return false;
+
+ return true;
+}
+bool ARM64XEmitter::TryORRI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm)
+{
+ u32 n, imm_r, imm_s;
+ if (IsImmLogical(imm, 32, &n, &imm_s, &imm_r))
+ ORR(Rd, Rn, imm_r, imm_s, n != 0);
+ else
+ return false;
+
+ return true;
+}
+bool ARM64XEmitter::TryEORI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm)
+{
+ u32 n, imm_r, imm_s;
+ if (IsImmLogical(imm, 32, &n, &imm_s, &imm_r))
+ EOR(Rd, Rn, imm_r, imm_s, n != 0);
+ else
+ return false;
+
+ return true;
+}
+
+// Loads a 32-bit float constant into Rd. Tries, in order: zero via an
+// integer-register FMOV, the 8-bit FP immediate form, then a GPR scratch
+// register. When `negate` is set, -value is materialized.
+// Fix: the FP-immediate path previously ignored `negate`, so e.g.
+// MOVI2F(Rd, 1.0f, scratch, true) produced +1.0; the negated value is now
+// tested against the imm8 encoding instead.
+void ARM64FloatEmitter::MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch, bool negate)
+{
+  ASSERT_MSG(DYNA_REC, !IsDouble(Rd), "MOVI2F does not yet support double precision");
+  uint8_t imm8;
+  if (value == 0.0)
+  {
+    FMOV(Rd, IsDouble(Rd) ? ZR : WZR);
+    // Preserve the sign explicitly: FNEG turns +0.0 into -0.0.
+    if (negate)
+      FNEG(Rd, Rd);
+    // TODO: There are some other values we could generate with the float-imm instruction, like
+    // 1.0...
+  }
+  else if (FPImm8FromFloat(negate ? -value : value, &imm8))
+  {
+    FMOV(Rd, imm8);
+  }
+  else
+  {
+    ASSERT_MSG(DYNA_REC, scratch != INVALID_REG,
+               "Failed to find a way to generate FP immediate %f without scratch", value);
+    if (negate)
+      value = -value;
+
+    const u32 ival = Common::BitCast<u32>(value);
+    m_emit->MOVI2R(scratch, ival);
+    FMOV(Rd, scratch);
+  }
+}
+
+// TODO: Quite a few values could be generated easily using the MOVI instruction and friends.
+void ARM64FloatEmitter::MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch)
+{
+ // TODO: Make it work with more element sizes
+ // TODO: Optimize - there are shorter solution for many values
+ ARM64Reg s = (ARM64Reg)(S0 + DecodeReg(Rd));
+ MOVI2F(s, value, scratch);
+ DUP(32, Rd, Rd, 0);
+}
+
+} // namespace Arm64Gen
diff --git a/src/dolphin/Arm64Emitter.h b/src/dolphin/Arm64Emitter.h
new file mode 100644
index 0000000..4cb9ff7
--- /dev/null
+++ b/src/dolphin/Arm64Emitter.h
@@ -0,0 +1,1152 @@
+// Copyright 2015 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <cstring>
+#include <functional>
+
+#include "ArmCommon.h"
+#include "Assert.h"
+#include "BitSet.h"
+#include "Compat.h"
+
+namespace Arm64Gen
+{
+// X30 serves a dual purpose as a link register
+// Encoded as <u3:type><u5:reg>
+// Types:
+// 000 - 32bit GPR
+// 001 - 64bit GPR
+// 010 - VFP single precision
+// 100 - VFP double precision
+// 110 - VFP quad precision
+enum ARM64Reg
+{
+ // 32bit registers
+ W0 = 0,
+ W1,
+ W2,
+ W3,
+ W4,
+ W5,
+ W6,
+ W7,
+ W8,
+ W9,
+ W10,
+ W11,
+ W12,
+ W13,
+ W14,
+ W15,
+ W16,
+ W17,
+ W18,
+ W19,
+ W20,
+ W21,
+ W22,
+ W23,
+ W24,
+ W25,
+ W26,
+ W27,
+ W28,
+ W29,
+ W30,
+
+ WSP, // 32bit stack pointer
+
+ // 64bit registers
+ X0 = 0x20,
+ X1,
+ X2,
+ X3,
+ X4,
+ X5,
+ X6,
+ X7,
+ X8,
+ X9,
+ X10,
+ X11,
+ X12,
+ X13,
+ X14,
+ X15,
+ X16,
+ X17,
+ X18,
+ X19,
+ X20,
+ X21,
+ X22,
+ X23,
+ X24,
+ X25,
+ X26,
+ X27,
+ X28,
+ X29,
+ X30,
+
+ SP, // 64bit stack pointer
+
+ // VFP single precision registers
+ S0 = 0x40,
+ S1,
+ S2,
+ S3,
+ S4,
+ S5,
+ S6,
+ S7,
+ S8,
+ S9,
+ S10,
+ S11,
+ S12,
+ S13,
+ S14,
+ S15,
+ S16,
+ S17,
+ S18,
+ S19,
+ S20,
+ S21,
+ S22,
+ S23,
+ S24,
+ S25,
+ S26,
+ S27,
+ S28,
+ S29,
+ S30,
+ S31,
+
+ // VFP Double Precision registers
+ D0 = 0x80,
+ D1,
+ D2,
+ D3,
+ D4,
+ D5,
+ D6,
+ D7,
+ D8,
+ D9,
+ D10,
+ D11,
+ D12,
+ D13,
+ D14,
+ D15,
+ D16,
+ D17,
+ D18,
+ D19,
+ D20,
+ D21,
+ D22,
+ D23,
+ D24,
+ D25,
+ D26,
+ D27,
+ D28,
+ D29,
+ D30,
+ D31,
+
+ // ASIMD Quad-Word registers
+ Q0 = 0xC0,
+ Q1,
+ Q2,
+ Q3,
+ Q4,
+ Q5,
+ Q6,
+ Q7,
+ Q8,
+ Q9,
+ Q10,
+ Q11,
+ Q12,
+ Q13,
+ Q14,
+ Q15,
+ Q16,
+ Q17,
+ Q18,
+ Q19,
+ Q20,
+ Q21,
+ Q22,
+ Q23,
+ Q24,
+ Q25,
+ Q26,
+ Q27,
+ Q28,
+ Q29,
+ Q30,
+ Q31,
+
+ // For PRFM(prefetch memory) encoding
+ // This is encoded in the Rt register
+ // Data preload
+ PLDL1KEEP = 0,
+ PLDL1STRM,
+ PLDL2KEEP,
+ PLDL2STRM,
+ PLDL3KEEP,
+ PLDL3STRM,
+ // Instruction preload
+ PLIL1KEEP = 8,
+ PLIL1STRM,
+ PLIL2KEEP,
+ PLIL2STRM,
+ PLIL3KEEP,
+ PLIL3STRM,
+ // Prepare for store
+ PLTL1KEEP = 16,
+ PLTL1STRM,
+ PLTL2KEEP,
+ PLTL2STRM,
+ PLTL3KEEP,
+ PLTL3STRM,
+
+ WZR = WSP,
+ ZR = SP,
+
+ INVALID_REG = 0xFFFFFFFF
+};
+
+constexpr bool Is64Bit(ARM64Reg reg)
+{
+ return (reg & 0x20) != 0;
+}
+constexpr bool IsSingle(ARM64Reg reg)
+{
+ return (reg & 0xC0) == 0x40;
+}
+constexpr bool IsDouble(ARM64Reg reg)
+{
+ return (reg & 0xC0) == 0x80;
+}
+constexpr bool IsScalar(ARM64Reg reg)
+{
+ return IsSingle(reg) || IsDouble(reg);
+}
+constexpr bool IsQuad(ARM64Reg reg)
+{
+ return (reg & 0xC0) == 0xC0;
+}
+constexpr bool IsVector(ARM64Reg reg)
+{
+ return (reg & 0xC0) != 0;
+}
+constexpr bool IsGPR(ARM64Reg reg)
+{
+ return static_cast<int>(reg) < 0x40;
+}
+
+constexpr ARM64Reg DecodeReg(ARM64Reg reg)
+{
+ return static_cast<ARM64Reg>(reg & 0x1F);
+}
+constexpr ARM64Reg EncodeRegTo64(ARM64Reg reg)
+{
+ return static_cast<ARM64Reg>(reg | 0x20);
+}
+constexpr ARM64Reg EncodeRegToSingle(ARM64Reg reg)
+{
+ return static_cast<ARM64Reg>(DecodeReg(reg) + S0);
+}
+constexpr ARM64Reg EncodeRegToDouble(ARM64Reg reg)
+{
+ return static_cast<ARM64Reg>((reg & ~0xC0) | 0x80);
+}
+constexpr ARM64Reg EncodeRegToQuad(ARM64Reg reg)
+{
+ return static_cast<ARM64Reg>(reg | 0xC0);
+}
+
+enum OpType
+{
+ TYPE_IMM = 0,
+ TYPE_REG,
+ TYPE_IMMSREG,
+ TYPE_RSR,
+ TYPE_MEM
+};
+
+enum ShiftType
+{
+ ST_LSL = 0,
+ ST_LSR = 1,
+ ST_ASR = 2,
+ ST_ROR = 3,
+};
+
+enum IndexType
+{
+ INDEX_UNSIGNED,
+ INDEX_POST,
+ INDEX_PRE,
+ INDEX_SIGNED, // used in LDP/STP
+};
+
+enum ShiftAmount
+{
+ SHIFT_0 = 0,
+ SHIFT_16 = 1,
+ SHIFT_32 = 2,
+ SHIFT_48 = 3,
+};
+
+enum RoundingMode
+{
+ ROUND_A, // round to nearest, ties to away
+ ROUND_M, // round towards -inf
+ ROUND_N, // round to nearest, ties to even
+ ROUND_P, // round towards +inf
+ ROUND_Z, // round towards zero
+};
+
+struct FixupBranch
+{
+ ptrdiff_t ptr;
+ // Type defines
+ // 0 = CBZ (32bit)
+ // 1 = CBNZ (32bit)
+ // 2 = B (conditional)
+ // 3 = TBZ
+ // 4 = TBNZ
+ // 5 = B (unconditional)
+ // 6 = BL (unconditional)
+ u32 type;
+
+ // Used with B.cond
+ CCFlags cond;
+
+ // Used with TBZ/TBNZ
+ u8 bit;
+
+ // Used with Test/Compare and Branch
+ ARM64Reg reg;
+};
+
+enum PStateField
+{
+ FIELD_SPSel = 0,
+ FIELD_DAIFSet,
+ FIELD_DAIFClr,
+ FIELD_NZCV, // The only system registers accessible from EL0 (user space)
+ FIELD_PMCR_EL0,
+ FIELD_PMCCNTR_EL0,
+ FIELD_FPCR = 0x340,
+ FIELD_FPSR = 0x341,
+};
+
+enum SystemHint
+{
+ HINT_NOP = 0,
+ HINT_YIELD,
+ HINT_WFE,
+ HINT_WFI,
+ HINT_SEV,
+ HINT_SEVL,
+};
+
+enum BarrierType
+{
+ OSHLD = 1,
+ OSHST = 2,
+ OSH = 3,
+ NSHLD = 5,
+ NSHST = 6,
+ NSH = 7,
+ ISHLD = 9,
+ ISHST = 10,
+ ISH = 11,
+ LD = 13,
+ ST = 14,
+ SY = 15,
+};
+
+class ArithOption
+{
+public:
+ enum WidthSpecifier
+ {
+ WIDTH_DEFAULT,
+ WIDTH_32BIT,
+ WIDTH_64BIT,
+ };
+
+ enum ExtendSpecifier
+ {
+ EXTEND_UXTB = 0x0,
+ EXTEND_UXTH = 0x1,
+ EXTEND_UXTW = 0x2, /* Also LSL on 32bit width */
+ EXTEND_UXTX = 0x3, /* Also LSL on 64bit width */
+ EXTEND_SXTB = 0x4,
+ EXTEND_SXTH = 0x5,
+ EXTEND_SXTW = 0x6,
+ EXTEND_SXTX = 0x7,
+ };
+
+ enum TypeSpecifier
+ {
+ TYPE_EXTENDEDREG,
+ TYPE_IMM,
+ TYPE_SHIFTEDREG,
+ };
+
+private:
+ ARM64Reg m_destReg;
+ WidthSpecifier m_width;
+ ExtendSpecifier m_extend;
+ TypeSpecifier m_type;
+ ShiftType m_shifttype;
+ u32 m_shift;
+
+public:
+ ArithOption(ARM64Reg Rd, bool index = false)
+ {
+ // Indexed registers are a certain feature of AARch64
+ // On Loadstore instructions that use a register offset
+ // We can have the register as an index
+ // If we are indexing then the offset register will
+ // be shifted to the left so we are indexing at intervals
+ // of the size of what we are loading
+ // 8-bit: Index does nothing
+ // 16-bit: Index LSL 1
+ // 32-bit: Index LSL 2
+ // 64-bit: Index LSL 3
+ if (index)
+ m_shift = 4;
+ else
+ m_shift = 0;
+
+ m_destReg = Rd;
+ m_type = TYPE_EXTENDEDREG;
+ if (Is64Bit(Rd))
+ {
+ m_width = WIDTH_64BIT;
+ m_extend = EXTEND_UXTX;
+ }
+ else
+ {
+ m_width = WIDTH_32BIT;
+ m_extend = EXTEND_UXTW;
+ }
+ m_shifttype = ST_LSL;
+ }
+ ArithOption(ARM64Reg Rd, ShiftType shift_type, u32 shift)
+ {
+ m_destReg = Rd;
+ m_shift = shift;
+ m_shifttype = shift_type;
+ m_type = TYPE_SHIFTEDREG;
+ if (Is64Bit(Rd))
+ {
+ m_width = WIDTH_64BIT;
+ if (shift == 64)
+ m_shift = 0;
+ }
+ else
+ {
+ m_width = WIDTH_32BIT;
+ if (shift == 32)
+ m_shift = 0;
+ }
+ }
+ TypeSpecifier GetType() const { return m_type; }
+ ARM64Reg GetReg() const { return m_destReg; }
+ u32 GetData() const
+ {
+ switch (m_type)
+ {
+ case TYPE_EXTENDEDREG:
+ return (m_extend << 13) | (m_shift << 10);
+ break;
+ case TYPE_SHIFTEDREG:
+ return (m_shifttype << 22) | (m_shift << 10);
+ break;
+ default:
+ DEBUG_ASSERT_MSG(DYNA_REC, false, "Invalid type in GetData");
+ break;
+ }
+ return 0;
+ }
+};
+
+class ARM64XEmitter
+{
+ friend class ARM64FloatEmitter;
+
+private:
+ ptrdiff_t m_code;
+ ptrdiff_t m_lastCacheFlushEnd;
+ u8* m_rwbase;
+ u8* m_rxbase;
+
+ void AddImmediate(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool shift, bool negative, bool flags);
+ void EncodeCompareBranchInst(u32 op, ARM64Reg Rt, const void* ptr);
+ void EncodeTestBranchInst(u32 op, ARM64Reg Rt, u8 bits, const void* ptr);
+ void EncodeUnconditionalBranchInst(u32 op, const void* ptr);
+ void EncodeUnconditionalBranchInst(u32 opc, u32 op2, u32 op3, u32 op4, ARM64Reg Rn);
+ void EncodeExceptionInst(u32 instenc, u32 imm);
+ void EncodeSystemInst(u32 op0, u32 op1, u32 CRn, u32 CRm, u32 op2, ARM64Reg Rt);
+ void EncodeArithmeticInst(u32 instenc, bool flags, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm,
+ ArithOption Option);
+ void EncodeArithmeticCarryInst(u32 op, bool flags, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void EncodeCondCompareImmInst(u32 op, ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond);
+ void EncodeCondCompareRegInst(u32 op, ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond);
+ void EncodeCondSelectInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);
+ void EncodeData1SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn);
+ void EncodeData2SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void EncodeData3SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
+ void EncodeLogicalInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
+ void EncodeLoadRegisterInst(u32 bitop, ARM64Reg Rt, u32 imm);
+ void EncodeLoadStoreExcInst(u32 instenc, ARM64Reg Rs, ARM64Reg Rt2, ARM64Reg Rn, ARM64Reg Rt);
+ void EncodeLoadStorePairedInst(u32 op, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm);
+ void EncodeLoadStoreIndexedInst(u32 op, u32 op2, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void EncodeLoadStoreIndexedInst(u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm, u8 size);
+ void EncodeMOVWideInst(u32 op, ARM64Reg Rd, u32 imm, ShiftAmount pos);
+ void EncodeBitfieldMOVInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms);
+ void EncodeLoadStoreRegisterOffset(u32 size, u32 opc, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void EncodeAddSubImmInst(u32 op, bool flags, u32 shift, u32 imm, ARM64Reg Rn, ARM64Reg Rd);
+ void EncodeLogicalImmInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, int n);
+ void EncodeLoadStorePair(u32 op, u32 load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn,
+ s32 imm);
+ void EncodeAddressInst(u32 op, ARM64Reg Rd, s32 imm);
+ void EncodeLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+
+protected:
+ // TODO: make this less ugly
+ // used for Switch where memory is executable and writeable and different addresses
+ // we need to take this for relative addressing in account
+
+ void Write32(u32 value);
+
+public:
+ ARM64XEmitter() : m_code(0), m_lastCacheFlushEnd(0), m_rwbase(nullptr), m_rxbase(nullptr) {}
+ ARM64XEmitter(u8* rwbase, u8* rxbase, ptrdiff_t offset)
+ {
+ m_rwbase = rwbase;
+ m_rxbase = rxbase;
+ m_code = offset;
+ m_lastCacheFlushEnd = offset;
+ }
+
+ virtual ~ARM64XEmitter() {}
+ void SetCodePtr(ptrdiff_t ptr);
+ void SetCodePtrUnsafe(ptrdiff_t ptr);
+ void SetCodeBase(u8* rwbase, u8* rxbase);
+ void ReserveCodeSpace(u32 bytes);
+ ptrdiff_t AlignCode16();
+ ptrdiff_t AlignCodePage();
+ ptrdiff_t GetCodeOffset();
+ const u8* GetRWPtr();
+ u8* GetWriteableRWPtr();
+ void* GetRXPtr();
+ void FlushIcache();
+ void FlushIcacheSection(u8* start, u8* end);
+
+ // FixupBranch branching
+ void SetJumpTarget(FixupBranch const& branch);
+ FixupBranch CBZ(ARM64Reg Rt);
+ FixupBranch CBNZ(ARM64Reg Rt);
+ FixupBranch B(CCFlags cond);
+ FixupBranch TBZ(ARM64Reg Rt, u8 bit);
+ FixupBranch TBNZ(ARM64Reg Rt, u8 bit);
+ FixupBranch B();
+ FixupBranch BL();
+
+ // Compare and Branch
+ void CBZ(ARM64Reg Rt, const void* ptr);
+ void CBNZ(ARM64Reg Rt, const void* ptr);
+
+ // Conditional Branch
+ void B(CCFlags cond, const void* ptr);
+
+ // Test and Branch
+ void TBZ(ARM64Reg Rt, u8 bits, const void* ptr);
+ void TBNZ(ARM64Reg Rt, u8 bits, const void* ptr);
+
+ // Unconditional Branch
+ void B(const void* ptr);
+ void BL(const void* ptr);
+
+ // Unconditional Branch (register)
+ void BR(ARM64Reg Rn);
+ void BLR(ARM64Reg Rn);
+ void RET(ARM64Reg Rn = X30);
+ void ERET();
+ void DRPS();
+
+ // Exception generation
+ void SVC(u32 imm);
+ void HVC(u32 imm);
+ void SMC(u32 imm);
+ void BRK(u32 imm);
+ void HLT(u32 imm);
+ void DCPS1(u32 imm);
+ void DCPS2(u32 imm);
+ void DCPS3(u32 imm);
+
+ // System
+ void _MSR(PStateField field, u8 imm);
+ void _MSR(PStateField field, ARM64Reg Rt);
+ void MRS(ARM64Reg Rt, PStateField field);
+ void CNTVCT(ARM64Reg Rt);
+
+ void HINT(SystemHint op);
+ void CLREX();
+ void DSB(BarrierType type);
+ void DMB(BarrierType type);
+ void ISB(BarrierType type);
+
+ // Add/Subtract (Extended/Shifted register)
+ void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option);
+ void ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option);
+ void SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option);
+ void SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option);
+ void CMN(ARM64Reg Rn, ARM64Reg Rm);
+ void CMN(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option);
+ void CMP(ARM64Reg Rn, ARM64Reg Rm);
+ void CMP(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option);
+
+ // Add/Subtract (with carry)
+ void ADC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void ADCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void SBC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void SBCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+
+ // Conditional Compare (immediate)
+ void CCMN(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond);
+ void CCMP(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond);
+
+ // Conditional Compare (register)
+ void CCMN(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond);
+ void CCMP(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond);
+
+ // Conditional Select
+ void CSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);
+ void CSINC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);
+ void CSINV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);
+ void CSNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);
+
+ // Aliases
+ void CSET(ARM64Reg Rd, CCFlags cond)
+ {
+ ARM64Reg zr = Is64Bit(Rd) ? ZR : WZR;
+ CSINC(Rd, zr, zr, (CCFlags)((u32)cond ^ 1));
+ }
+ void CSETM(ARM64Reg Rd, CCFlags cond)
+ {
+ ARM64Reg zr = Is64Bit(Rd) ? ZR : WZR;
+ CSINV(Rd, zr, zr, (CCFlags)((u32)cond ^ 1));
+ }
+ void NEG(ARM64Reg Rd, ARM64Reg Rs) { SUB(Rd, Is64Bit(Rd) ? ZR : WZR, Rs); }
+ // Data-Processing 1 source
+ void RBIT(ARM64Reg Rd, ARM64Reg Rn);
+ void REV16(ARM64Reg Rd, ARM64Reg Rn);
+ void REV32(ARM64Reg Rd, ARM64Reg Rn);
+ void REV64(ARM64Reg Rd, ARM64Reg Rn);
+ void CLZ(ARM64Reg Rd, ARM64Reg Rn);
+ void CLS(ARM64Reg Rd, ARM64Reg Rn);
+
+ // Data-Processing 2 source
+ void UDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void SDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void LSLV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void LSRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void ASRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void RORV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void CRC32B(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void CRC32H(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void CRC32W(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void CRC32CB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void CRC32CH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void CRC32CW(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void CRC32X(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void CRC32CX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+
+ // Data-Processing 3 source
+ void MADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
+ void MSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
+ void SMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
+ void SMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void SMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
+ void SMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void UMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
+ void UMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void UMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
+ void UMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void MUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void MNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+
+ // Logical (shifted register)
+ void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
+ void BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
+ void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
+ void ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
+ void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
+ void EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
+ void ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
+ void BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
+
+ // Wrap the above for saner syntax
+ void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { AND(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); }
+ void BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { BIC(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); }
+ void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { ORR(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); }
+ void ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { ORN(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); }
+ void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { EOR(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); }
+ void EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { EON(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); }
+ void ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { ANDS(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); }
+ void BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { BICS(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); }
+ // Convenience wrappers around ORR. These match the official convenience syntax.
+ void MOV(ARM64Reg Rd, ARM64Reg Rm, ArithOption Shift);
+ void MOV(ARM64Reg Rd, ARM64Reg Rm);
+ void MVN(ARM64Reg Rd, ARM64Reg Rm);
+
+ // Convenience wrappers around UBFM/EXTR.
+ void LSR(ARM64Reg Rd, ARM64Reg Rm, int shift);
+ void LSL(ARM64Reg Rd, ARM64Reg Rm, int shift);
+ void ASR(ARM64Reg Rd, ARM64Reg Rm, int shift);
+ void ROR_(ARM64Reg Rd, ARM64Reg Rm, int shift);
+
+ // Logical (immediate)
+ void AND(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false);
+ void ANDS(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false);
+ void EOR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false);
+ void ORR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false);
+ void TST(ARM64Reg Rn, u32 immr, u32 imms, bool invert = false);
+ void TST(ARM64Reg Rn, ARM64Reg Rm) { ANDS(Is64Bit(Rn) ? ZR : WZR, Rn, Rm); }
+ // Add/subtract (immediate)
+ void ADD(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false);
+ void ADDS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false);
+ void SUB(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false);
+ void SUBS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false);
+ void CMP(ARM64Reg Rn, u32 imm, bool shift = false);
+
+ // Data Processing (Immediate)
+ void MOVZ(ARM64Reg Rd, u32 imm, ShiftAmount pos = SHIFT_0);
+ void MOVN(ARM64Reg Rd, u32 imm, ShiftAmount pos = SHIFT_0);
+ void MOVK(ARM64Reg Rd, u32 imm, ShiftAmount pos = SHIFT_0);
+
+ // Bitfield move
+ void BFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms);
+ void SBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms);
+ void UBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms);
+ void BFI(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width);
+ void UBFIZ(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width);
+
+ // Extract register (ROR with two inputs, if same then faster on A67)
+ void EXTR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u32 shift);
+
+ // Aliases
+ void SXTB(ARM64Reg Rd, ARM64Reg Rn);
+ void SXTH(ARM64Reg Rd, ARM64Reg Rn);
+ void SXTW(ARM64Reg Rd, ARM64Reg Rn);
+ void UXTB(ARM64Reg Rd, ARM64Reg Rn);
+ void UXTH(ARM64Reg Rd, ARM64Reg Rn);
+
+ void UBFX(ARM64Reg Rd, ARM64Reg Rn, int lsb, int width) { UBFM(Rd, Rn, lsb, lsb + width - 1); }
+ // Load Register (Literal)
+ void LDR(ARM64Reg Rt, u32 imm);
+ void LDRSW(ARM64Reg Rt, u32 imm);
+ void PRFM(ARM64Reg Rt, u32 imm);
+
+ // Load/Store Exclusive
+ void STXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn);
+ void STLXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn);
+ void LDXRB(ARM64Reg Rt, ARM64Reg Rn);
+ void LDAXRB(ARM64Reg Rt, ARM64Reg Rn);
+ void STLRB(ARM64Reg Rt, ARM64Reg Rn);
+ void LDARB(ARM64Reg Rt, ARM64Reg Rn);
+ void STXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn);
+ void STLXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn);
+ void LDXRH(ARM64Reg Rt, ARM64Reg Rn);
+ void LDAXRH(ARM64Reg Rt, ARM64Reg Rn);
+ void STLRH(ARM64Reg Rt, ARM64Reg Rn);
+ void LDARH(ARM64Reg Rt, ARM64Reg Rn);
+ void STXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn);
+ void STLXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn);
+ void STXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn);
+ void STLXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn);
+ void LDXR(ARM64Reg Rt, ARM64Reg Rn);
+ void LDAXR(ARM64Reg Rt, ARM64Reg Rn);
+ void LDXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn);
+ void LDAXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn);
+ void STLR(ARM64Reg Rt, ARM64Reg Rn);
+ void LDAR(ARM64Reg Rt, ARM64Reg Rn);
+
+ // Load/Store no-allocate pair (offset)
+ void STNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm);
+ void LDNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm);
+
+ // Load/Store register (immediate indexed)
+ void STRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDRSB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void STRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDRSH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void STR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDRSW(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+
+ // Load/Store register (register offset)
+ void STRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void LDRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void LDRSB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void STRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void LDRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void LDRSH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void STR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void LDR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void LDRSW(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void PRFM(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+
+ // Load/Store register (unscaled offset)
+ void STURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDURSB(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void STURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDURSH(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void STUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDURSW(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+
+ // Load/Store pair
+ void LDP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
+ void LDPSW(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
+ void STP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
+
+ void LDRGeneric(int size, bool signExtend, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void STRGeneric(int size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+
+ void LDRGeneric(int size, bool signExtend, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void STRGeneric(int size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+
+ // Address of label/page PC-relative
+ void ADR(ARM64Reg Rd, s32 imm);
+ void ADRP(ARM64Reg Rd, s32 imm);
+
+ // Wrapper around MOVZ+MOVK
+ void MOVI2R(ARM64Reg Rd, u64 imm, bool optimize = true);
+ bool MOVI2R2(ARM64Reg Rd, u64 imm1, u64 imm2);
+ template <class P>
+ void MOVP2R(ARM64Reg Rd, P* ptr)
+ {
+ ASSERT_MSG(DYNA_REC, Is64Bit(Rd), "Can't store pointers in 32-bit registers");
+ MOVI2R(Rd, (uintptr_t)ptr);
+ }
+
+ // Wrapper around AND x, y, imm etc. If you are sure the imm will work, no need to pass a scratch
+ // register.
+ void ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+ void ANDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+ void TSTI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG)
+ {
+ ANDSI2R(Is64Bit(Rn) ? ZR : WZR, Rn, imm, scratch);
+ }
+ void ORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+ void EORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+ void CMPI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+
+ void ADDI2R_internal(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool negative, bool flags,
+ ARM64Reg scratch);
+ void ADDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+ void ADDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+ void SUBI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+ void SUBSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+
+ bool TryADDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
+ bool TrySUBI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
+ bool TryCMPI2R(ARM64Reg Rn, u32 imm);
+
+ bool TryANDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
+ bool TryORRI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
+ bool TryEORI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
+
+ // ABI related
+ void ABI_PushRegisters(BitSet32 registers);
+ void ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask = BitSet32(0));
+
+ // Utility to generate a call to a std::function object.
+ //
+ // Unfortunately, calling operator() directly is undefined behavior in C++
+ // (this method might be a thunk in the case of multi-inheritance) so we
+ // have to go through a trampoline function.
+ template <typename T, typename... Args>
+ static T CallLambdaTrampoline(const std::function<T(Args...)>* f, Args... args)
+ {
+ return (*f)(args...);
+ }
+
+ // This function expects you to have set up the state.
+ // Overwrites X0 and X30
+ template <typename T, typename... Args>
+ ARM64Reg ABI_SetupLambda(const std::function<T(Args...)>* f)
+ {
+ auto trampoline = &ARM64XEmitter::CallLambdaTrampoline<T, Args...>;
+ MOVI2R(X30, (uintptr_t)trampoline);
+ MOVI2R(X0, (uintptr_t) const_cast<void*>((const void*)f));
+ return X30;
+ }
+
+ void QuickTailCall(ARM64Reg scratchreg, const void* func);
+ template <typename T>
+ void QuickTailCall(ARM64Reg scratchreg, T func)
+ {
+ QuickTailCall(scratchreg, (const void*)func);
+ }
+
+ // Plain function call
+ void QuickCallFunction(ARM64Reg scratchreg, const void* func);
+ template <typename T>
+ void QuickCallFunction(ARM64Reg scratchreg, T func)
+ {
+ QuickCallFunction(scratchreg, (const void*)func);
+ }
+};
+
+class ARM64FloatEmitter
+{
+public:
+ ARM64FloatEmitter(ARM64XEmitter* emit) : m_emit(emit) {}
+ void LDR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void STR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+
+ // Loadstore unscaled
+ void LDUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void STUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+
+ // Loadstore single structure
+ void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn);
+ void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm);
+ void LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn);
+ void LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn);
+ void LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm);
+ void LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm);
+ void ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn);
+ void ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm);
+
+ // Loadstore multiple structure
+ void LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn);
+ void LD1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm = SP);
+ void ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn);
+ void ST1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm = SP);
+
+ // Loadstore paired
+ void LDP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
+ void STP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
+
+ // Loadstore register offset
+ void STR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void LDR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+
+ // Scalar - 1 Source
+ void FABS(ARM64Reg Rd, ARM64Reg Rn);
+ void FNEG(ARM64Reg Rd, ARM64Reg Rn);
+ void FSQRT(ARM64Reg Rd, ARM64Reg Rn);
+ void FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top = false); // Also generalized move between GPR/FP
+
+ // Scalar - 2 Source
+ void FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FMAX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FMIN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FMAXNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FMINNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FNMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+
+ // Scalar - 3 Source. Note - the accumulator is last on ARM!
+ void FMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
+ void FMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
+ void FNMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
+ void FNMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
+
+ // Scalar floating point immediate
+ void FMOV(ARM64Reg Rd, uint8_t imm8);
+
+ // Vector
+ void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
+ void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FMLS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FMIN(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void FCVTL2(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void FCVTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
+ void FCVTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
+ void FCVTZS(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void FCVTZU(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void FDIV(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FNEG(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void FRECPE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void FRSQRTE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void FSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void NOT(ARM64Reg Rd, ARM64Reg Rn);
+ void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void MOV(ARM64Reg Rd, ARM64Reg Rn) { ORR(Rd, Rn, Rn); }
+ void REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void REV32(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void REV64(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale);
+ void UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale);
+ void SQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
+ void SQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
+ void UQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
+ void UQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
+ void XTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
+ void XTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
+
+ // Move
+ void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void INS(u8 size, ARM64Reg Rd, u8 index, ARM64Reg Rn);
+ void INS(u8 size, ARM64Reg Rd, u8 index1, ARM64Reg Rn, u8 index2);
+ void UMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
+ void SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
+
+ // One source
+ void FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn);
+
+ // Scalar convert float to int, in a lot of variants.
+ // Note that the scalar version of this operation has two encodings, one that goes to an integer
+ // register
+ // and one that outputs to a scalar fp register.
+ void FCVTS(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round);
+ void FCVTU(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round);
+
+ // Scalar convert int to float. No rounding mode specifier necessary.
+ void SCVTF(ARM64Reg Rd, ARM64Reg Rn);
+ void UCVTF(ARM64Reg Rd, ARM64Reg Rn);
+
+ // Scalar fixed point to float. scale is the number of fractional bits.
+ void SCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale);
+ void UCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale);
+
+ // Float comparison
+ void FCMP(ARM64Reg Rn, ARM64Reg Rm);
+ void FCMP(ARM64Reg Rn);
+ void FCMPE(ARM64Reg Rn, ARM64Reg Rm);
+ void FCMPE(ARM64Reg Rn);
+ void FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+
+ // Conditional select
+ void FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);
+
+ // Permute
+ void UZP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void TRN1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void ZIP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void UZP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void TRN2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+
+ // Shift by immediate
+ void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
+ void SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
+ void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
+ void USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
+ void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
+ void SHRN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
+ void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn);
+ void SXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn);
+ void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn);
+ void UXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn);
+
+ // vector x indexed element
+ void FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index);
+ void FMLA(u8 esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index);
+
+ // Modified Immediate
+ void MOVI(u8 size, ARM64Reg Rd, u64 imm, u8 shift = 0);
+ void BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift = 0);
+
+ void MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG, bool negate = false);
+ void MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG);
+
+ // ABI related
+ void ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp = INVALID_REG);
+ void ABI_PopRegisters(BitSet32 registers, ARM64Reg tmp = INVALID_REG);
+
+private:
+ ARM64XEmitter* m_emit;
+ inline void Write32(u32 value) { m_emit->Write32(value); }
+ // Emitting functions
+ void EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void EmitScalar2Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn,
+ ARM64Reg Rm);
+ void EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn);
+ void Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
+ void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt,
+ ARM64Reg Rn);
+ void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt,
+ ARM64Reg Rn, ARM64Reg Rm);
+ void Emit1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
+ void EmitConversion(bool sf, bool S, u32 type, u32 rmode, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
+ void EmitConversion2(bool sf, bool S, bool direction, u32 type, u32 rmode, u32 opcode, int scale,
+ ARM64Reg Rd, ARM64Reg Rn);
+ void EmitCompare(bool M, bool S, u32 op, u32 opcode2, ARM64Reg Rn, ARM64Reg Rm);
+ void EmitCondSelect(bool M, bool S, CCFlags cond, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void EmitPermute(u32 size, u32 op, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm8);
+ void EmitShiftImm(bool Q, bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
+ void EmitScalarShiftImm(bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
+ void EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn);
+ void EmitLoadStoreMultipleStructurePost(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn,
+ ARM64Reg Rm);
+ void EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
+ void EmitVectorxElement(bool U, u32 size, bool L, u32 opcode, bool H, ARM64Reg Rd, ARM64Reg Rn,
+ ARM64Reg Rm);
+ void EmitLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void EmitConvertScalarToInt(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round, bool sign);
+ void EmitScalar3Source(bool isDouble, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra,
+ int opcode);
+ void EncodeLoadStorePair(u32 size, bool load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2,
+ ARM64Reg Rn, s32 imm);
+ void EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh);
+
+ void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
+ void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
+ void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
+ void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper);
+ void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper);
+};
+
+} \ No newline at end of file
diff --git a/src/dolphin/ArmCommon.h b/src/dolphin/ArmCommon.h
new file mode 100644
index 0000000..6d82e9d
--- /dev/null
+++ b/src/dolphin/ArmCommon.h
@@ -0,0 +1,27 @@
+// Copyright 2014 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license.txt file included.
+
+#include "../types.h"
+
// ARM condition codes, in hardware encoding order (these values are written
// directly into the 4-bit cond field of emitted instructions).
enum CCFlags
{
  CC_EQ = 0, // Equal
  CC_NEQ, // Not equal
  CC_CS, // Carry Set
  CC_CC, // Carry Clear
  CC_MI, // Minus (Negative)
  CC_PL, // Plus
  CC_VS, // Overflow
  CC_VC, // No Overflow
  CC_HI, // Unsigned higher
  CC_LS, // Unsigned lower or same
  CC_GE, // Signed greater than or equal
  CC_LT, // Signed less than
  CC_GT, // Signed greater than
  CC_LE, // Signed less than or equal
  CC_AL, // Always (unconditional) 14
  CC_HS = CC_CS, // Alias of CC_CS Unsigned higher or same
  CC_LO = CC_CC, // Alias of CC_CC Unsigned lower
};
// CC_AL (0xE) pre-shifted into bits [31:28] of an A32 instruction word,
// i.e. the "no condition attached" pattern.
const u32 NO_COND = 0xE0000000;
diff --git a/src/dolphin/BitUtils.h b/src/dolphin/BitUtils.h
new file mode 100644
index 0000000..8b64a92
--- /dev/null
+++ b/src/dolphin/BitUtils.h
@@ -0,0 +1,254 @@
+// Copyright 2017 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <climits>
+#include <cstddef>
+#include <cstring>
+#include <type_traits>
+
+namespace Common
+{
///
/// Computes the width of a type in bits.
///
/// @tparam T Type whose bit width is queried.
///
/// @return The number of bits T occupies.
///
template <typename T>
constexpr size_t BitSize() noexcept
{
  return CHAR_BIT * sizeof(T);
}
+
///
/// Reads a single bit out of a value.
///
/// @param src The value to read from.
/// @param bit Zero-based index of the bit to read.
///
/// @tparam T The type of the value.
///
/// @return The bit's value (0 or 1), as type T.
///
template <typename T>
constexpr T ExtractBit(const T src, const size_t bit) noexcept
{
  return static_cast<T>(1) & static_cast<T>(src >> bit);
}
+
///
/// Reads the bit at a compile-time-known position.
///
/// @param src The value to read from.
///
/// @tparam bit Zero-based index of the bit to read.
/// @tparam T The type of the value.
///
/// @return The bit's value (0 or 1), as type T.
///
template <size_t bit, typename T>
constexpr T ExtractBit(const T src) noexcept
{
  // The bound check keeps the shift below well-defined.
  static_assert(bit < sizeof(T) * CHAR_BIT, "Specified bit must be within T's bit width.");

  return (src >> bit) & static_cast<T>(1);
}
+
///
/// Extracts a range of bits from a value.
///
/// @param src The value to extract the bits from.
/// @param begin The beginning of the bit range. This is inclusive.
/// @param end The ending of the bit range. This is inclusive.
///
/// @tparam T The type of the value.
/// @tparam Result The returned result type. This is the unsigned analog
/// of a signed type if a signed type is passed as T.
///
/// @return The extracted bits.
///
template <typename T, typename Result = std::make_unsigned_t<T>>
constexpr Result ExtractBits(const T src, const size_t begin, const size_t end) noexcept
{
  constexpr size_t bit_width = sizeof(T) * CHAR_BIT; // == BitSize<T>()

  // Shift the range up so that bit `end` lands on the top bit, truncate back
  // to Result's width, then shift the range down so bit `begin` lands on bit 0.
  //
  // The truncating cast on the intermediate fixes a bug for types narrower
  // than int: integer promotion used to keep the bits above `end` alive
  // through the right shift, so e.g. ExtractBits((u8)0xFF, 0, 3) yielded
  // 0xFF instead of 0x0F. For int-width and wider types the result is
  // unchanged.
  const Result top_aligned =
      static_cast<Result>(static_cast<Result>(src) << (bit_width - 1 - end));
  return static_cast<Result>(top_aligned >> (bit_width - 1 - end + begin));
}
+
+///
+/// Extracts a range of bits from a value.
+///
+/// @param src The value to extract the bits from.
+///
+/// @tparam begin The beginning of the bit range. This is inclusive.
+/// @tparam end The ending of the bit range. This is inclusive.
+/// @tparam T The type of the value.
+/// @tparam Result The returned result type. This is the unsigned analog
+/// of a signed type if a signed type is passed as T.
+///
+/// @return The extracted bits.
+///
+template <size_t begin, size_t end, typename T, typename Result = std::make_unsigned_t<T>>
+constexpr Result ExtractBits(const T src) noexcept
+{
+ static_assert(begin < end, "Beginning bit must be less than the ending bit.");
+ static_assert(begin < BitSize<T>(), "Beginning bit is larger than T's bit width.");
+ static_assert(end < BitSize<T>(), "Ending bit is larger than T's bit width.");
+
+ return ExtractBits<T, Result>(src, begin, end);
+}
+
///
/// Rotates a value left (ROL).
///
/// @param value The value to rotate.
/// @param amount The number of bits to rotate the value.
/// @tparam T An unsigned type.
///
/// @return The rotated value.
///
template <typename T>
constexpr T RotateLeft(const T value, size_t amount) noexcept
{
  static_assert(std::is_unsigned<T>(), "Can only rotate unsigned types left.");

  constexpr size_t width = sizeof(T) * CHAR_BIT; // == BitSize<T>()
  amount %= width;

  // A rotation by 0 is the identity; it also avoids the undefined
  // full-width shift in the expression below.
  return amount == 0 ? value
                     : static_cast<T>((value << amount) | (value >> (width - amount)));
}
+
///
/// Rotates a value right (ROR).
///
/// @param value The value to rotate.
/// @param amount The number of bits to rotate the value.
/// @tparam T An unsigned type.
///
/// @return The rotated value.
///
template <typename T>
constexpr T RotateRight(const T value, size_t amount) noexcept
{
  static_assert(std::is_unsigned<T>(), "Can only rotate unsigned types right.");

  constexpr size_t width = sizeof(T) * CHAR_BIT; // == BitSize<T>()
  amount %= width;

  // A rotation by 0 is the identity; it also avoids the undefined
  // full-width shift in the expression below.
  return amount == 0 ? value
                     : static_cast<T>((value >> amount) | (value << (width - amount)));
}
+
///
/// Verifies whether the supplied value is a valid bit mask of the form 0b00...0011...11.
/// Both edge cases of all zeros and all ones are considered valid masks, too.
///
/// @param mask The mask value to test for validity.
///
/// @tparam T The type of the value.
///
/// @return A bool indicating whether the mask is valid.
///
template <typename T>
constexpr bool IsValidLowMask(const T mask) noexcept
{
  static_assert(std::is_integral<T>::value, "Mask must be an integral type.");
  static_assert(std::is_unsigned<T>::value, "Signed masks can introduce hard to find bugs.");

  // Adding one to a low mask carries through every set low bit, leaving no
  // bit in common with the original value. This is the counterpart to the
  // classic power-of-two test
  // (https://graphics.stanford.edu/~seander/bithacks.html#DetermineIfPowerOf2)
  // and needs no special casing for either edge case.
  return !((mask + 1) & mask);
}
+
///
/// Reinterpret an object of one type as another by copying its object
/// representation, never by aliasing it through an incompatible pointer.
///
/// @remark Mirrors the reference implementation of std::bit_cast slated for
/// C++2a (http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0476r2.html).
/// Unlike the eventual standard version it cannot be constexpr, since
/// that requires compiler support.
///
/// @param source The source object to convert to another representation.
///
/// @tparam To The type to reinterpret source as.
/// @tparam From The initial type representation of source.
///
/// @return The representation of type From as type To.
///
/// @pre Both To and From types must be the same size
/// @pre Both To and From types must satisfy the TriviallyCopyable concept.
///
template <typename To, typename From>
inline To BitCast(const From& source) noexcept
{
  static_assert(sizeof(From) == sizeof(To),
                "BitCast source and destination types must be equal in size.");
  static_assert(std::is_trivially_copyable<From>(),
                "BitCast source type must be trivially copyable.");
  static_assert(std::is_trivially_copyable<To>(),
                "BitCast destination type must be trivially copyable.");

  // Raw storage (rather than a To object) so To need not be
  // default-constructible; memcpy keeps the conversion aliasing-safe.
  std::aligned_storage_t<sizeof(To), alignof(To)> bits;
  std::memcpy(&bits, &source, sizeof(bits));
  return reinterpret_cast<To&>(bits);
}
+
// Proxy object returned by BitCastPtr: reads and writes a T through a
// PtrType* purely via std::memcpy, so it never dereferences the pointee
// through an incompatible type (which would violate strict aliasing).
template <typename T, typename PtrType>
class BitCastPtrType
{
public:
  static_assert(std::is_trivially_copyable<PtrType>(),
                "BitCastPtr source type must be trivially copyable.");
  static_assert(std::is_trivially_copyable<T>(),
                "BitCastPtr destination type must be trivially copyable.");

  explicit BitCastPtrType(PtrType* ptr) : m_ptr(ptr) {}

  // Enable operator= only for pointers to non-const data
  // (SFINAE: S must be exactly T, and PtrType must not be const-qualified).
  template <typename S>
  inline typename std::enable_if<std::is_same<S, T>() && !std::is_const<PtrType>()>::type
  operator=(const S& source)
  {
    // Write: copy sizeof(T) bytes of source over the pointee.
    std::memcpy(m_ptr, &source, sizeof(source));
  }

  // Read: implicit conversion copies sizeof(T) bytes out of the pointee.
  inline operator T() const
  {
    T result;
    std::memcpy(&result, m_ptr, sizeof(result));
    return result;
  }

private:
  PtrType* m_ptr; // Non-owning; caller guarantees at least sizeof(T) valid bytes.
};

// Provides an aliasing-safe alternative to reinterpret_cast'ing pointers to structs
// Conversion constructor and operator= provided for a convenient syntax.
// Usage: MyStruct s = BitCastPtr<MyStruct>(some_ptr);
// BitCastPtr<MyStruct>(some_ptr) = s;
template <typename T, typename PtrType>
inline auto BitCastPtr(PtrType* ptr) noexcept -> BitCastPtrType<T, PtrType>
{
  return BitCastPtrType<T, PtrType>{ptr};
}
+
///
/// Sets or clears a single bit of an unsigned value in place.
///
/// @param value The value to modify.
/// @param bit_number Zero-based index of the bit to change.
/// @param bit_value true to set the bit, false to clear it.
///
template <typename T>
void SetBit(T& value, size_t bit_number, bool bit_value)
{
  static_assert(std::is_unsigned<T>(), "SetBit is only sane on unsigned types.");

  const T mask = static_cast<T>(T{1} << bit_number);
  if (bit_value)
    value |= mask;
  else
    value &= static_cast<T>(~mask);
}
+
+} // namespace Common
diff --git a/src/dolphin/Compat.h b/src/dolphin/Compat.h
index f2f52a5..787d505 100644
--- a/src/dolphin/Compat.h
+++ b/src/dolphin/Compat.h
@@ -61,3 +61,15 @@
{ \
printf(fmt "\n", ## __VA_ARGS__); \
} while (false)
+
#if __cplusplus < 201703L
// Pre-C++17 fallback for std::clamp.
//
// NOTE(review): adding declarations to namespace std is technically undefined
// behavior; this is a deliberate "cheat" kept only for toolchains that lack
// C++17. The signature now mirrors the real std::clamp (constexpr, returns a
// reference to one of the arguments, uses only operator<) so behavior is
// identical whichever definition ends up being used.
namespace std
{
template <typename T>
constexpr const T& clamp(const T& v, const T& lo, const T& hi)
{
  return v < lo ? lo : (hi < v ? hi : v);
}
}
#endif
diff --git a/src/dolphin/MathUtil.cpp b/src/dolphin/MathUtil.cpp
new file mode 100644
index 0000000..70f2ede
--- /dev/null
+++ b/src/dolphin/MathUtil.cpp
@@ -0,0 +1,13 @@
+// Copyright 2008 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license.txt file included.
+
+#include "MathUtil.h"
+
+#include <numeric>
+
// Calculate sum of a float list
float MathFloatVectorSum(const std::vector<float>& Vec)
{
  // Plain left-to-right accumulation starting from 0.0f — the same
  // evaluation order std::accumulate would use.
  float total = 0.0f;
  for (float v : Vec)
    total += v;
  return total;
}
diff --git a/src/dolphin/MathUtil.h b/src/dolphin/MathUtil.h
new file mode 100644
index 0000000..b1dbbae
--- /dev/null
+++ b/src/dolphin/MathUtil.h
@@ -0,0 +1,121 @@
+// Copyright 2008 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <algorithm>
+#include <vector>
+
+#include "Compat.h"
+
+#include "../types.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
namespace MathUtil
{
// Tau = 2*pi, a full turn in radians; PI is derived from it so the two
// constants always agree.
constexpr double TAU = 6.2831853071795865;
constexpr double PI = TAU / 2;

// Signum: negative, zero or positive result matching val's sign,
// computed branch-free from two comparisons.
template <typename T>
constexpr auto Sign(const T& val) -> decltype((T{} < val) - (val < T{}))
{
  return (T{} < val) - (val < T{});
}

// Linear interpolation between x and y: a == 0 yields x, a == 1 yields y.
template <typename T, typename F>
constexpr auto Lerp(const T& x, const T& y, const F& a) -> decltype(x + (y - x) * a)
{
  return x + (y - x) * a;
}

// True iff imm is a positive power of two (exactly one bit set).
template <typename T>
constexpr bool IsPow2(T imm)
{
  return imm > 0 && (imm & (imm - 1)) == 0;
}

// Rounds up to the next power of two via the classic bit-smearing trick.
// A value that is already a power of two is returned unchanged; 0 yields 0.
constexpr u32 NextPowerOf2(u32 value)
{
  --value;
  value |= value >> 1;
  value |= value >> 2;
  value |= value >> 4;
  value |= value >> 8;
  value |= value >> 16;
  ++value;

  return value;
}

// Axis-aligned rectangle; supports both lower-left-origin and
// upper-left-origin coordinate systems (see the two Clamp variants).
template <class T>
struct Rectangle
{
  T left{};
  T top{};
  T right{};
  T bottom{};

  constexpr Rectangle() = default;

  constexpr Rectangle(T theLeft, T theTop, T theRight, T theBottom)
      : left(theLeft), top(theTop), right(theRight), bottom(theBottom)
  {
  }

  constexpr bool operator==(const Rectangle& r) const
  {
    return left == r.left && top == r.top && right == r.right && bottom == r.bottom;
  }

  // NOTE(review): unqualified abs() resolves to the C int overload for
  // non-integral T — confirm this template is only instantiated with
  // integer coordinate types.
  T GetWidth() const { return abs(right - left); }
  T GetHeight() const { return abs(bottom - top); }
  // If the rectangle is in a coordinate system with a lower-left origin, use
  // this Clamp.
  void ClampLL(T x1, T y1, T x2, T y2)
  {
    left = std::clamp(left, x1, x2);
    right = std::clamp(right, x1, x2);
    // y grows upward here, so y1 is the upper bound and y2 the lower one.
    top = std::clamp(top, y2, y1);
    bottom = std::clamp(bottom, y2, y1);
  }

  // If the rectangle is in a coordinate system with an upper-left origin,
  // use this Clamp.
  void ClampUL(T x1, T y1, T x2, T y2)
  {
    left = std::clamp(left, x1, x2);
    right = std::clamp(right, x1, x2);
    top = std::clamp(top, y1, y2);
    bottom = std::clamp(bottom, y1, y2);
  }
};

} // namespace MathUtil
+
+float MathFloatVectorSum(const std::vector<float>&);
+
// Rounds down. 0 -> undefined
// (floor(log2(val)), i.e. the index of the highest set bit.)
inline int IntLog2(u64 val)
{
#if defined(__GNUC__)
  // __builtin_clzll has undefined behavior for 0 — hence "0 -> undefined".
  return 63 - __builtin_clzll(val);

#elif defined(_MSC_VER)
  // NOTE(review): _BitScanReverse64 exists only on 64-bit MSVC targets, and
  // ULONG_MAX needs <climits> — confirm both reach here (presumably via
  // Compat.h / intrin.h). result is left at ULONG_MAX when val == 0.
  unsigned long result = ULONG_MAX;
  _BitScanReverse64(&result, val);
  return result;

#else
  // Portable fallback: count shifts until val is exhausted; -1 input state
  // makes a lone bit 0 map to result 0.
  int result = -1;
  while (val != 0)
  {
    val >>= 1;
    ++result;
  }
  return result;
#endif
}