aboutsummaryrefslogtreecommitdiff
path: root/src/dolphin
diff options
context:
space:
mode:
Diffstat (limited to 'src/dolphin')
-rw-r--r--src/dolphin/Align.h24
-rw-r--r--src/dolphin/Arm64Emitter.cpp4466
-rw-r--r--src/dolphin/Arm64Emitter.h1151
-rw-r--r--src/dolphin/ArmCommon.h27
-rw-r--r--src/dolphin/BitSet.h218
-rw-r--r--src/dolphin/BitUtils.h254
-rw-r--r--src/dolphin/CPUDetect.h76
-rw-r--r--src/dolphin/CommonFuncs.cpp52
-rw-r--r--src/dolphin/CommonFuncs.h58
-rw-r--r--src/dolphin/Compat.h75
-rw-r--r--src/dolphin/MathUtil.cpp13
-rw-r--r--src/dolphin/MathUtil.h121
-rw-r--r--src/dolphin/license_dolphin.txt339
-rw-r--r--src/dolphin/x64ABI.cpp119
-rw-r--r--src/dolphin/x64ABI.h58
-rw-r--r--src/dolphin/x64CPUDetect.cpp273
-rw-r--r--src/dolphin/x64Emitter.cpp3399
-rw-r--r--src/dolphin/x64Emitter.h1169
-rw-r--r--src/dolphin/x64Reg.h96
19 files changed, 11988 insertions, 0 deletions
diff --git a/src/dolphin/Align.h b/src/dolphin/Align.h
new file mode 100644
index 0000000..40c4576
--- /dev/null
+++ b/src/dolphin/Align.h
@@ -0,0 +1,24 @@
+// This file is under the public domain.
+
+#pragma once
+
+#include <cstddef>
+#include <type_traits>
+
+namespace Common
+{
+template <typename T>
+constexpr T AlignUp(T value, size_t size)
+{
+ static_assert(std::is_unsigned<T>(), "T must be an unsigned value.");
+ return static_cast<T>(value + (size - value % size) % size);
+}
+
+template <typename T>
+constexpr T AlignDown(T value, size_t size)
+{
+ static_assert(std::is_unsigned<T>(), "T must be an unsigned value.");
+ return static_cast<T>(value - value % size);
+}
+
+} // namespace Common
diff --git a/src/dolphin/Arm64Emitter.cpp b/src/dolphin/Arm64Emitter.cpp
new file mode 100644
index 0000000..dd2416b
--- /dev/null
+++ b/src/dolphin/Arm64Emitter.cpp
@@ -0,0 +1,4466 @@
+// Copyright 2015 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <array>
+#include <cinttypes>
+#include <cstring>
+#include <vector>
+
+#include "Compat.h"
+#include "Align.h"
+#include "Arm64Emitter.h"
+#include "BitUtils.h"
+#include "../types.h"
+#include "MathUtil.h"
+
+namespace Arm64Gen
+{
+namespace
+{
+const int kWRegSizeInBits = 32;
+const int kXRegSizeInBits = 64;
+
+// The below few functions are taken from V8.
+int CountLeadingZeros(uint64_t value, int width)
+{
+ // TODO(jbramley): Optimize this for ARM64 hosts.
+ int count = 0;
+ uint64_t bit_test = 1ULL << (width - 1);
+ while ((count < width) && ((bit_test & value) == 0))
+ {
+ count++;
+ bit_test >>= 1;
+ }
+ return count;
+}
+
+uint64_t LargestPowerOf2Divisor(uint64_t value)
+{
+ return value & -(int64_t)value;
+}
+
+// For ADD/SUB
+bool IsImmArithmetic(uint64_t input, u32* val, bool* shift)
+{
+ if (input < 4096)
+ {
+ *val = input;
+ *shift = false;
+ return true;
+ }
+ else if ((input & 0xFFF000) == input)
+ {
+ *val = input >> 12;
+ *shift = true;
+ return true;
+ }
+ return false;
+}
+
+// For AND/TST/ORR/EOR etc
+bool IsImmLogical(uint64_t value, unsigned int width, unsigned int* n, unsigned int* imm_s,
+ unsigned int* imm_r)
+{
+ // DCHECK((n != NULL) && (imm_s != NULL) && (imm_r != NULL));
+ // DCHECK((width == kWRegSizeInBits) || (width == kXRegSizeInBits));
+
+ bool negate = false;
+
+ // Logical immediates are encoded using parameters n, imm_s and imm_r using
+ // the following table:
+ //
+ // N imms immr size S R
+ // 1 ssssss rrrrrr 64 UInt(ssssss) UInt(rrrrrr)
+ // 0 0sssss xrrrrr 32 UInt(sssss) UInt(rrrrr)
+ // 0 10ssss xxrrrr 16 UInt(ssss) UInt(rrrr)
+ // 0 110sss xxxrrr 8 UInt(sss) UInt(rrr)
+ // 0 1110ss xxxxrr 4 UInt(ss) UInt(rr)
+ // 0 11110s xxxxxr 2 UInt(s) UInt(r)
+ // (s bits must not be all set)
+ //
+ // A pattern is constructed of size bits, where the least significant S+1 bits
+ // are set. The pattern is rotated right by R, and repeated across a 32 or
+ // 64-bit value, depending on destination register width.
+ //
+ // Put another way: the basic format of a logical immediate is a single
+ // contiguous stretch of 1 bits, repeated across the whole word at intervals
+ // given by a power of 2. To identify them quickly, we first locate the
+ // lowest stretch of 1 bits, then the next 1 bit above that; that combination
+ // is different for every logical immediate, so it gives us all the
+ // information we need to identify the only logical immediate that our input
+ // could be, and then we simply check if that's the value we actually have.
+ //
+ // (The rotation parameter does give the possibility of the stretch of 1 bits
+ // going 'round the end' of the word. To deal with that, we observe that in
+ // any situation where that happens the bitwise NOT of the value is also a
+ // valid logical immediate. So we simply invert the input whenever its low bit
+ // is set, and then we know that the rotated case can't arise.)
+
+ if (value & 1)
+ {
+ // If the low bit is 1, negate the value, and set a flag to remember that we
+ // did (so that we can adjust the return values appropriately).
+ negate = true;
+ value = ~value;
+ }
+
+ if (width == kWRegSizeInBits)
+ {
+ // To handle 32-bit logical immediates, the very easiest thing is to repeat
+ // the input value twice to make a 64-bit word. The correct encoding of that
+ // as a logical immediate will also be the correct encoding of the 32-bit
+ // value.
+
+ // The most-significant 32 bits may not be zero (ie. negate is true) so
+ // shift the value left before duplicating it.
+ value <<= kWRegSizeInBits;
+ value |= value >> kWRegSizeInBits;
+ }
+
+ // The basic analysis idea: imagine our input word looks like this.
+ //
+ // 0011111000111110001111100011111000111110001111100011111000111110
+ // c b a
+ // |<--d-->|
+ //
+ // We find the lowest set bit (as an actual power-of-2 value, not its index)
+ // and call it a. Then we add a to our original number, which wipes out the
+ // bottommost stretch of set bits and replaces it with a 1 carried into the
+ // next zero bit. Then we look for the new lowest set bit, which is in
+ // position b, and subtract it, so now our number is just like the original
+ // but with the lowest stretch of set bits completely gone. Now we find the
+ // lowest set bit again, which is position c in the diagram above. Then we'll
+ // measure the distance d between bit positions a and c (using CLZ), and that
+ // tells us that the only valid logical immediate that could possibly be equal
+ // to this number is the one in which a stretch of bits running from a to just
+ // below b is replicated every d bits.
+ uint64_t a = LargestPowerOf2Divisor(value);
+ uint64_t value_plus_a = value + a;
+ uint64_t b = LargestPowerOf2Divisor(value_plus_a);
+ uint64_t value_plus_a_minus_b = value_plus_a - b;
+ uint64_t c = LargestPowerOf2Divisor(value_plus_a_minus_b);
+
+ int d, clz_a, out_n;
+ uint64_t mask;
+
+ if (c != 0)
+ {
+ // The general case, in which there is more than one stretch of set bits.
+ // Compute the repeat distance d, and set up a bitmask covering the basic
+ // unit of repetition (i.e. a word with the bottom d bits set). Also, in all
+ // of these cases the N bit of the output will be zero.
+ clz_a = CountLeadingZeros(a, kXRegSizeInBits);
+ int clz_c = CountLeadingZeros(c, kXRegSizeInBits);
+ d = clz_a - clz_c;
+ mask = ((UINT64_C(1) << d) - 1);
+ out_n = 0;
+ }
+ else
+ {
+ // Handle degenerate cases.
+ //
+ // If any of those 'find lowest set bit' operations didn't find a set bit at
+ // all, then the word will have been zero thereafter, so in particular the
+ // last lowest_set_bit operation will have returned zero. So we can test for
+ // all the special case conditions in one go by seeing if c is zero.
+ if (a == 0)
+ {
+ // The input was zero (or all 1 bits, which will come to here too after we
+ // inverted it at the start of the function), for which we just return
+ // false.
+ return false;
+ }
+ else
+ {
+ // Otherwise, if c was zero but a was not, then there's just one stretch
+ // of set bits in our word, meaning that we have the trivial case of
+ // d == 64 and only one 'repetition'. Set up all the same variables as in
+ // the general case above, and set the N bit in the output.
+ clz_a = CountLeadingZeros(a, kXRegSizeInBits);
+ d = 64;
+ mask = ~UINT64_C(0);
+ out_n = 1;
+ }
+ }
+
+ // If the repeat period d is not a power of two, it can't be encoded.
+ if (!MathUtil::IsPow2<u64>(d))
+ return false;
+
+ // If the bit stretch (b - a) does not fit within the mask derived from the
+ // repeat period, then fail.
+ if (((b - a) & ~mask) != 0)
+ return false;
+
+ // The only possible option is b - a repeated every d bits. Now we're going to
+ // actually construct the valid logical immediate derived from that
+ // specification, and see if it equals our original input.
+ //
+ // To repeat a value every d bits, we multiply it by a number of the form
+ // (1 + 2^d + 2^(2d) + ...), i.e. 0x0001000100010001 or similar. These can
+ // be derived using a table lookup on CLZ(d).
+ static const std::array<uint64_t, 6> multipliers = {{
+ 0x0000000000000001UL,
+ 0x0000000100000001UL,
+ 0x0001000100010001UL,
+ 0x0101010101010101UL,
+ 0x1111111111111111UL,
+ 0x5555555555555555UL,
+ }};
+
+ int multiplier_idx = CountLeadingZeros(d, kXRegSizeInBits) - 57;
+
+ // Ensure that the index to the multipliers array is within bounds.
+ DEBUG_ASSERT((multiplier_idx >= 0) && (static_cast<size_t>(multiplier_idx) < multipliers.size()));
+
+ uint64_t multiplier = multipliers[multiplier_idx];
+ uint64_t candidate = (b - a) * multiplier;
+
+ // The candidate pattern doesn't match our input value, so fail.
+ if (value != candidate)
+ return false;
+
+ // We have a match! This is a valid logical immediate, so now we have to
+ // construct the bits and pieces of the instruction encoding that generates
+ // it.
+
+ // Count the set bits in our basic stretch. The special case of clz(0) == -1
+ // makes the answer come out right for stretches that reach the very top of
+ // the word (e.g. numbers like 0xffffc00000000000).
+ int clz_b = (b == 0) ? -1 : CountLeadingZeros(b, kXRegSizeInBits);
+ int s = clz_a - clz_b;
+
+ // Decide how many bits to rotate right by, to put the low bit of that basic
+ // stretch in position a.
+ int r;
+ if (negate)
+ {
+ // If we inverted the input right at the start of this function, here's
+ // where we compensate: the number of set bits becomes the number of clear
+ // bits, and the rotation count is based on position b rather than position
+ // a (since b is the location of the 'lowest' 1 bit after inversion).
+ s = d - s;
+ r = (clz_b + 1) & (d - 1);
+ }
+ else
+ {
+ r = (clz_a + 1) & (d - 1);
+ }
+
+ // Now we're done, except for having to encode the S output in such a way that
+ // it gives both the number of set bits and the length of the repeated
+ // segment. The s field is encoded like this:
+ //
+ // imms size S
+ // ssssss 64 UInt(ssssss)
+ // 0sssss 32 UInt(sssss)
+ // 10ssss 16 UInt(ssss)
+ // 110sss 8 UInt(sss)
+ // 1110ss 4 UInt(ss)
+ // 11110s 2 UInt(s)
+ //
+ // So we 'or' (-d << 1) with our computed s to form imms.
+ *n = out_n;
+ *imm_s = ((-d << 1) | (s - 1)) & 0x3f;
+ *imm_r = r;
+
+ return true;
+}
+
+float FPImm8ToFloat(u8 bits)
+{
+ const u32 sign = bits >> 7;
+ const u32 bit6 = (bits >> 6) & 1;
+ const u32 exp = ((!bit6) << 7) | (0x7C * bit6) | ((bits >> 4) & 3);
+ const u32 mantissa = (bits & 0xF) << 19;
+ const u32 f = (sign << 31) | (exp << 23) | mantissa;
+
+ return Common::BitCast<float>(f);
+}
+
+bool FPImm8FromFloat(float value, u8* imm_out)
+{
+ const u32 f = Common::BitCast<u32>(value);
+ const u32 mantissa4 = (f & 0x7FFFFF) >> 19;
+ const u32 exponent = (f >> 23) & 0xFF;
+ const u32 sign = f >> 31;
+
+ if ((exponent >> 7) == ((exponent >> 6) & 1))
+ return false;
+
+ const u8 imm8 = (sign << 7) | ((!(exponent >> 7)) << 6) | ((exponent & 3) << 4) | mantissa4;
+ const float new_float = FPImm8ToFloat(imm8);
+ if (new_float == value)
+ *imm_out = imm8;
+ else
+ return false;
+
+ return true;
+}
+} // Anonymous namespace
+
+void ARM64XEmitter::SetCodePtrUnsafe(ptrdiff_t ptr)
+{
+ m_code = ptr;
+}
+
+void ARM64XEmitter::SetCodePtr(ptrdiff_t ptr)
+{
+ SetCodePtrUnsafe(ptr);
+ m_lastCacheFlushEnd = ptr;
+}
+
+void ARM64XEmitter::SetCodeBase(u8* rwbase, u8* rxbase)
+{
+ m_code = 0;
+ m_lastCacheFlushEnd = 0;
+ m_rwbase = rwbase;
+ m_rxbase = rxbase;
+}
+
+ptrdiff_t ARM64XEmitter::GetCodeOffset()
+{
+ return m_code;
+}
+
+const u8* ARM64XEmitter::GetRWPtr()
+{
+ return m_rwbase + m_code;
+}
+
+u8* ARM64XEmitter::GetWriteableRWPtr()
+{
+ return m_rwbase + m_code;
+}
+
+void* ARM64XEmitter::GetRXPtr()
+{
+ return m_rxbase + m_code;
+}
+
+void ARM64XEmitter::ReserveCodeSpace(u32 bytes)
+{
+ for (u32 i = 0; i < bytes / 4; i++)
+ BRK(0);
+}
+
+ptrdiff_t ARM64XEmitter::AlignCode16()
+{
+ int c = int((u64)m_code & 15);
+ if (c)
+ ReserveCodeSpace(16 - c);
+ return m_code;
+}
+
+ptrdiff_t ARM64XEmitter::AlignCodePage()
+{
+ int c = int((u64)m_code & 4095);
+ if (c)
+ ReserveCodeSpace(4096 - c);
+ return m_code;
+}
+
+void ARM64XEmitter::Write32(u32 value)
+{
+ std::memcpy(m_rwbase + m_code, &value, sizeof(u32));
+ m_code += sizeof(u32);
+}
+
+void ARM64XEmitter::FlushIcache()
+{
+ FlushIcacheSection(m_rxbase + m_lastCacheFlushEnd, m_rxbase + m_code);
+ m_lastCacheFlushEnd = m_code;
+}
+
+void ARM64XEmitter::FlushIcacheSection(u8* start, u8* end)
+{
+ if (start == end)
+ return;
+
+#if defined(IOS)
+ // Header file says this is equivalent to: sys_icache_invalidate(start, end - start);
+ sys_cache_control(kCacheFunctionPrepareForExecution, start, end - start);
+#else
+ // Don't rely on GCC's __clear_cache implementation, as it caches
+ // icache/dcache cache line sizes, that can vary between cores on
+ // big.LITTLE architectures.
+ u64 addr, ctr_el0;
+ static size_t icache_line_size = 0xffff, dcache_line_size = 0xffff;
+ size_t isize, dsize;
+
+ __asm__ volatile("mrs %0, ctr_el0" : "=r"(ctr_el0));
+ isize = 4 << ((ctr_el0 >> 0) & 0xf);
+ dsize = 4 << ((ctr_el0 >> 16) & 0xf);
+
+ // use the global minimum cache line size
+ icache_line_size = isize = icache_line_size < isize ? icache_line_size : isize;
+ dcache_line_size = dsize = dcache_line_size < dsize ? dcache_line_size : dsize;
+
+ addr = (u64)start & ~(u64)(dsize - 1);
+ for (; addr < (u64)end; addr += dsize)
+ // use "civac" instead of "cvau", as this is the suggested workaround for
+ // Cortex-A53 errata 819472, 826319, 827319 and 824069.
+ __asm__ volatile("dc civac, %0" : : "r"(addr) : "memory");
+ __asm__ volatile("dsb ish" : : : "memory");
+
+ addr = (u64)start & ~(u64)(isize - 1);
+ for (; addr < (u64)end; addr += isize)
+ __asm__ volatile("ic ivau, %0" : : "r"(addr) : "memory");
+
+ __asm__ volatile("dsb ish" : : : "memory");
+ __asm__ volatile("isb" : : : "memory");
+#endif
+}
+
+// Exception generation
+static const u32 ExcEnc[][3] = {
+ {0, 0, 1}, // SVC
+ {0, 0, 2}, // HVC
+ {0, 0, 3}, // SMC
+ {1, 0, 0}, // BRK
+ {2, 0, 0}, // HLT
+ {5, 0, 1}, // DCPS1
+ {5, 0, 2}, // DCPS2
+ {5, 0, 3}, // DCPS3
+};
+
+// Arithmetic generation
+static const u32 ArithEnc[] = {
+ 0x058, // ADD
+ 0x258, // SUB
+};
+
+// Conditional Select
+static const u32 CondSelectEnc[][2] = {
+ {0, 0}, // CSEL
+ {0, 1}, // CSINC
+ {1, 0}, // CSINV
+ {1, 1}, // CSNEG
+};
+
+// Data-Processing (1 source)
+static const u32 Data1SrcEnc[][2] = {
+ {0, 0}, // RBIT
+ {0, 1}, // REV16
+ {0, 2}, // REV32
+ {0, 3}, // REV64
+ {0, 4}, // CLZ
+ {0, 5}, // CLS
+};
+
+// Data-Processing (2 source)
+static const u32 Data2SrcEnc[] = {
+ 0x02, // UDIV
+ 0x03, // SDIV
+ 0x08, // LSLV
+ 0x09, // LSRV
+ 0x0A, // ASRV
+ 0x0B, // RORV
+ 0x10, // CRC32B
+ 0x11, // CRC32H
+ 0x12, // CRC32W
+ 0x14, // CRC32CB
+ 0x15, // CRC32CH
+ 0x16, // CRC32CW
+ 0x13, // CRC32X (64bit Only)
+ 0x17, // XRC32CX (64bit Only)
+};
+
+// Data-Processing (3 source)
+static const u32 Data3SrcEnc[][2] = {
+ {0, 0}, // MADD
+ {0, 1}, // MSUB
+ {1, 0}, // SMADDL (64Bit Only)
+ {1, 1}, // SMSUBL (64Bit Only)
+ {2, 0}, // SMULH (64Bit Only)
+ {5, 0}, // UMADDL (64Bit Only)
+ {5, 1}, // UMSUBL (64Bit Only)
+ {6, 0}, // UMULH (64Bit Only)
+};
+
+// Logical (shifted register)
+static const u32 LogicalEnc[][2] = {
+ {0, 0}, // AND
+ {0, 1}, // BIC
+ {1, 0}, // OOR
+ {1, 1}, // ORN
+ {2, 0}, // EOR
+ {2, 1}, // EON
+ {3, 0}, // ANDS
+ {3, 1}, // BICS
+};
+
+// Load/Store Exclusive
+static const u32 LoadStoreExcEnc[][5] = {
+ {0, 0, 0, 0, 0}, // STXRB
+ {0, 0, 0, 0, 1}, // STLXRB
+ {0, 0, 1, 0, 0}, // LDXRB
+ {0, 0, 1, 0, 1}, // LDAXRB
+ {0, 1, 0, 0, 1}, // STLRB
+ {0, 1, 1, 0, 1}, // LDARB
+ {1, 0, 0, 0, 0}, // STXRH
+ {1, 0, 0, 0, 1}, // STLXRH
+ {1, 0, 1, 0, 0}, // LDXRH
+ {1, 0, 1, 0, 1}, // LDAXRH
+ {1, 1, 0, 0, 1}, // STLRH
+ {1, 1, 1, 0, 1}, // LDARH
+ {2, 0, 0, 0, 0}, // STXR
+ {3, 0, 0, 0, 0}, // (64bit) STXR
+ {2, 0, 0, 0, 1}, // STLXR
+ {3, 0, 0, 0, 1}, // (64bit) STLXR
+ {2, 0, 0, 1, 0}, // STXP
+ {3, 0, 0, 1, 0}, // (64bit) STXP
+ {2, 0, 0, 1, 1}, // STLXP
+ {3, 0, 0, 1, 1}, // (64bit) STLXP
+ {2, 0, 1, 0, 0}, // LDXR
+ {3, 0, 1, 0, 0}, // (64bit) LDXR
+ {2, 0, 1, 0, 1}, // LDAXR
+ {3, 0, 1, 0, 1}, // (64bit) LDAXR
+ {2, 0, 1, 1, 0}, // LDXP
+ {3, 0, 1, 1, 0}, // (64bit) LDXP
+ {2, 0, 1, 1, 1}, // LDAXP
+ {3, 0, 1, 1, 1}, // (64bit) LDAXP
+ {2, 1, 0, 0, 1}, // STLR
+ {3, 1, 0, 0, 1}, // (64bit) STLR
+ {2, 1, 1, 0, 1}, // LDAR
+ {3, 1, 1, 0, 1}, // (64bit) LDAR
+};
+
+void ARM64XEmitter::EncodeCompareBranchInst(u32 op, ARM64Reg Rt, const void* ptr)
+{
+ bool b64Bit = Is64Bit(Rt);
+ s64 distance = (s64)ptr - (s64)(m_rxbase + m_code);
+
+ ASSERT_MSG(DYNA_REC, !(distance & 0x3), "%s: distance must be a multiple of 4: %" PRIx64,
+ __func__, distance);
+
+ distance >>= 2;
+
+ ASSERT_MSG(DYNA_REC, distance >= -0x40000 && distance <= 0x3FFFF,
+ "%s: Received too large distance: %" PRIx64, __func__, distance);
+
+ Rt = DecodeReg(Rt);
+ Write32((b64Bit << 31) | (0x34 << 24) | (op << 24) | (((u32)distance << 5) & 0xFFFFE0) | Rt);
+}
+
+void ARM64XEmitter::EncodeTestBranchInst(u32 op, ARM64Reg Rt, u8 bits, const void* ptr)
+{
+ bool b64Bit = Is64Bit(Rt);
+ s64 distance = (s64)ptr - (s64)(m_rxbase + m_code);
+
+ ASSERT_MSG(DYNA_REC, !(distance & 0x3), "%s: distance must be a multiple of 4: %" PRIx64,
+ __func__, distance);
+
+ distance >>= 2;
+
+ ASSERT_MSG(DYNA_REC, distance >= -0x3FFF && distance < 0x3FFF,
+ "%s: Received too large distance: %" PRIx64, __func__, distance);
+
+ Rt = DecodeReg(Rt);
+ Write32((b64Bit << 31) | (0x36 << 24) | (op << 24) | (bits << 19) |
+ (((u32)distance << 5) & 0x7FFE0) | Rt);
+}
+
+void ARM64XEmitter::EncodeUnconditionalBranchInst(u32 op, const void* ptr)
+{
+ s64 distance = (s64)ptr - s64(m_rxbase + m_code);
+
+ ASSERT_MSG(DYNA_REC, !(distance & 0x3), "%s: distance must be a multiple of 4: %" PRIx64,
+ __func__, distance);
+
+ distance >>= 2;
+
+ ASSERT_MSG(DYNA_REC, distance >= -0x2000000LL && distance <= 0x1FFFFFFLL,
+ "%s: Received too large distance: %" PRIx64, __func__, distance);
+
+ Write32((op << 31) | (0x5 << 26) | (distance & 0x3FFFFFF));
+}
+
+void ARM64XEmitter::EncodeUnconditionalBranchInst(u32 opc, u32 op2, u32 op3, u32 op4, ARM64Reg Rn)
+{
+ Rn = DecodeReg(Rn);
+ Write32((0x6B << 25) | (opc << 21) | (op2 << 16) | (op3 << 10) | (Rn << 5) | op4);
+}
+
+void ARM64XEmitter::EncodeExceptionInst(u32 instenc, u32 imm)
+{
+ ASSERT_MSG(DYNA_REC, !(imm & ~0xFFFF), "%s: Exception instruction too large immediate: %d",
+ __func__, imm);
+
+ Write32((0xD4 << 24) | (ExcEnc[instenc][0] << 21) | (imm << 5) | (ExcEnc[instenc][1] << 2) |
+ ExcEnc[instenc][2]);
+}
+
+void ARM64XEmitter::EncodeSystemInst(u32 op0, u32 op1, u32 CRn, u32 CRm, u32 op2, ARM64Reg Rt)
+{
+ Write32((0x354 << 22) | (op0 << 19) | (op1 << 16) | (CRn << 12) | (CRm << 8) | (op2 << 5) | Rt);
+}
+
+void ARM64XEmitter::EncodeArithmeticInst(u32 instenc, bool flags, ARM64Reg Rd, ARM64Reg Rn,
+ ARM64Reg Rm, ArithOption Option)
+{
+ bool b64Bit = Is64Bit(Rd);
+
+ Rd = DecodeReg(Rd);
+ Rn = DecodeReg(Rn);
+ Rm = DecodeReg(Rm);
+ Write32((b64Bit << 31) | (flags << 29) | (ArithEnc[instenc] << 21) |
+ (Option.GetType() == ArithOption::TYPE_EXTENDEDREG ? (1 << 21) : 0) | (Rm << 16) |
+ Option.GetData() | (Rn << 5) | Rd);
+}
+
+void ARM64XEmitter::EncodeArithmeticCarryInst(u32 op, bool flags, ARM64Reg Rd, ARM64Reg Rn,
+ ARM64Reg Rm)
+{
+ bool b64Bit = Is64Bit(Rd);
+
+ Rd = DecodeReg(Rd);
+ Rm = DecodeReg(Rm);
+ Rn = DecodeReg(Rn);
+ Write32((b64Bit << 31) | (op << 30) | (flags << 29) | (0xD0 << 21) | (Rm << 16) | (Rn << 5) | Rd);
+}
+
+void ARM64XEmitter::EncodeCondCompareImmInst(u32 op, ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond)
+{
+ bool b64Bit = Is64Bit(Rn);
+
+ ASSERT_MSG(DYNA_REC, !(imm & ~0x1F), "%s: too large immediate: %d", __func__, imm);
+ ASSERT_MSG(DYNA_REC, !(nzcv & ~0xF), "%s: Flags out of range: %d", __func__, nzcv);
+
+ Rn = DecodeReg(Rn);
+ Write32((b64Bit << 31) | (op << 30) | (1 << 29) | (0xD2 << 21) | (imm << 16) | (cond << 12) |
+ (1 << 11) | (Rn << 5) | nzcv);
+}
+
+void ARM64XEmitter::EncodeCondCompareRegInst(u32 op, ARM64Reg Rn, ARM64Reg Rm, u32 nzcv,
+ CCFlags cond)
+{
+ bool b64Bit = Is64Bit(Rm);
+
+ ASSERT_MSG(DYNA_REC, !(nzcv & ~0xF), "%s: Flags out of range: %d", __func__, nzcv);
+
+ Rm = DecodeReg(Rm);
+ Rn = DecodeReg(Rn);
+ Write32((b64Bit << 31) | (op << 30) | (1 << 29) | (0xD2 << 21) | (Rm << 16) | (cond << 12) |
+ (Rn << 5) | nzcv);
+}
+
+void ARM64XEmitter::EncodeCondSelectInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm,
+ CCFlags cond)
+{
+ bool b64Bit = Is64Bit(Rd);
+
+ Rd = DecodeReg(Rd);
+ Rm = DecodeReg(Rm);
+ Rn = DecodeReg(Rn);
+ Write32((b64Bit << 31) | (CondSelectEnc[instenc][0] << 30) | (0xD4 << 21) | (Rm << 16) |
+ (cond << 12) | (CondSelectEnc[instenc][1] << 10) | (Rn << 5) | Rd);
+}
+
+void ARM64XEmitter::EncodeData1SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn)
+{
+ bool b64Bit = Is64Bit(Rd);
+
+ Rd = DecodeReg(Rd);
+ Rn = DecodeReg(Rn);
+ Write32((b64Bit << 31) | (0x2D6 << 21) | (Data1SrcEnc[instenc][0] << 16) |
+ (Data1SrcEnc[instenc][1] << 10) | (Rn << 5) | Rd);
+}
+
+void ARM64XEmitter::EncodeData2SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ bool b64Bit = Is64Bit(Rd);
+
+ Rd = DecodeReg(Rd);
+ Rm = DecodeReg(Rm);
+ Rn = DecodeReg(Rn);
+ Write32((b64Bit << 31) | (0x0D6 << 21) | (Rm << 16) | (Data2SrcEnc[instenc] << 10) | (Rn << 5) |
+ Rd);
+}
+
+void ARM64XEmitter::EncodeData3SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm,
+ ARM64Reg Ra)
+{
+ bool b64Bit = Is64Bit(Rd);
+
+ Rd = DecodeReg(Rd);
+ Rm = DecodeReg(Rm);
+ Rn = DecodeReg(Rn);
+ Ra = DecodeReg(Ra);
+ Write32((b64Bit << 31) | (0xD8 << 21) | (Data3SrcEnc[instenc][0] << 21) | (Rm << 16) |
+ (Data3SrcEnc[instenc][1] << 15) | (Ra << 10) | (Rn << 5) | Rd);
+}
+
+void ARM64XEmitter::EncodeLogicalInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm,
+ ArithOption Shift)
+{
+ bool b64Bit = Is64Bit(Rd);
+
+ Rd = DecodeReg(Rd);
+ Rm = DecodeReg(Rm);
+ Rn = DecodeReg(Rn);
+ Write32((b64Bit << 31) | (LogicalEnc[instenc][0] << 29) | (0x5 << 25) |
+ (LogicalEnc[instenc][1] << 21) | Shift.GetData() | (Rm << 16) | (Rn << 5) | Rd);
+}
+
+void ARM64XEmitter::EncodeLoadRegisterInst(u32 bitop, ARM64Reg Rt, u32 imm)
+{
+ bool b64Bit = Is64Bit(Rt);
+ bool bVec = IsVector(Rt);
+
+ ASSERT_MSG(DYNA_REC, !(imm & 0xFFFFF), "%s: offset too large %d", __func__, imm);
+
+ Rt = DecodeReg(Rt);
+ if (b64Bit && bitop != 0x2) // LDRSW(0x2) uses 64bit reg, doesn't have 64bit bit set
+ bitop |= 0x1;
+ Write32((bitop << 30) | (bVec << 26) | (0x18 << 24) | (imm << 5) | Rt);
+}
+
+void ARM64XEmitter::EncodeLoadStoreExcInst(u32 instenc, ARM64Reg Rs, ARM64Reg Rt2, ARM64Reg Rn,
+ ARM64Reg Rt)
+{
+ Rs = DecodeReg(Rs);
+ Rt2 = DecodeReg(Rt2);
+ Rn = DecodeReg(Rn);
+ Rt = DecodeReg(Rt);
+ Write32((LoadStoreExcEnc[instenc][0] << 30) | (0x8 << 24) | (LoadStoreExcEnc[instenc][1] << 23) |
+ (LoadStoreExcEnc[instenc][2] << 22) | (LoadStoreExcEnc[instenc][3] << 21) | (Rs << 16) |
+ (LoadStoreExcEnc[instenc][4] << 15) | (Rt2 << 10) | (Rn << 5) | Rt);
+}
+
+void ARM64XEmitter::EncodeLoadStorePairedInst(u32 op, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn,
+ u32 imm)
+{
+ bool b64Bit = Is64Bit(Rt);
+ bool b128Bit = IsQuad(Rt);
+ bool bVec = IsVector(Rt);
+
+ if (b128Bit)
+ imm >>= 4;
+ else if (b64Bit)
+ imm >>= 3;
+ else
+ imm >>= 2;
+
+ ASSERT_MSG(DYNA_REC, !(imm & ~0xF), "%s: offset too large %d", __func__, imm);
+
+ u32 opc = 0;
+ if (b128Bit)
+ opc = 2;
+ else if (b64Bit && bVec)
+ opc = 1;
+ else if (b64Bit && !bVec)
+ opc = 2;
+
+ Rt = DecodeReg(Rt);
+ Rt2 = DecodeReg(Rt2);
+ Rn = DecodeReg(Rn);
+ Write32((opc << 30) | (bVec << 26) | (op << 22) | (imm << 15) | (Rt2 << 10) | (Rn << 5) | Rt);
+}
+
+void ARM64XEmitter::EncodeLoadStoreIndexedInst(u32 op, u32 op2, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+ bool b64Bit = Is64Bit(Rt);
+ bool bVec = IsVector(Rt);
+
+ u32 offset = imm & 0x1FF;
+
+ ASSERT_MSG(DYNA_REC, !(imm < -256 || imm > 255), "%s: offset too large %d", __func__, imm);
+
+ Rt = DecodeReg(Rt);
+ Rn = DecodeReg(Rn);
+ Write32((b64Bit << 30) | (op << 22) | (bVec << 26) | (offset << 12) | (op2 << 10) | (Rn << 5) |
+ Rt);
+}
+
+void ARM64XEmitter::EncodeLoadStoreIndexedInst(u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm, u8 size)
+{
+ bool b64Bit = Is64Bit(Rt);
+ bool bVec = IsVector(Rt);
+
+ if (size == 64)
+ imm >>= 3;
+ else if (size == 32)
+ imm >>= 2;
+ else if (size == 16)
+ imm >>= 1;
+
+ ASSERT_MSG(DYNA_REC, imm >= 0, "%s(INDEX_UNSIGNED): offset must be positive %d", __func__, imm);
+ ASSERT_MSG(DYNA_REC, !(imm & ~0xFFF), "%s(INDEX_UNSIGNED): offset too large %d", __func__, imm);
+
+ Rt = DecodeReg(Rt);
+ Rn = DecodeReg(Rn);
+ Write32((b64Bit << 30) | (op << 22) | (bVec << 26) | (imm << 10) | (Rn << 5) | Rt);
+}
+
+void ARM64XEmitter::EncodeMOVWideInst(u32 op, ARM64Reg Rd, u32 imm, ShiftAmount pos)
+{
+ bool b64Bit = Is64Bit(Rd);
+
+ ASSERT_MSG(DYNA_REC, !(imm & ~0xFFFF), "%s: immediate out of range: %d", __func__, imm);
+
+ Rd = DecodeReg(Rd);
+ Write32((b64Bit << 31) | (op << 29) | (0x25 << 23) | (pos << 21) | (imm << 5) | Rd);
+}
+
+void ARM64XEmitter::EncodeBitfieldMOVInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms)
+{
+ bool b64Bit = Is64Bit(Rd);
+
+ Rd = DecodeReg(Rd);
+ Rn = DecodeReg(Rn);
+ Write32((b64Bit << 31) | (op << 29) | (0x26 << 23) | (b64Bit << 22) | (immr << 16) |
+ (imms << 10) | (Rn << 5) | Rd);
+}
+
+void ARM64XEmitter::EncodeLoadStoreRegisterOffset(u32 size, u32 opc, ARM64Reg Rt, ARM64Reg Rn,
+ ArithOption Rm)
+{
+ Rt = DecodeReg(Rt);
+ Rn = DecodeReg(Rn);
+ ARM64Reg decoded_Rm = DecodeReg(Rm.GetReg());
+
+ Write32((size << 30) | (opc << 22) | (0x1C1 << 21) | (decoded_Rm << 16) | Rm.GetData() |
+ (1 << 11) | (Rn << 5) | Rt);
+}
+
+void ARM64XEmitter::EncodeAddSubImmInst(u32 op, bool flags, u32 shift, u32 imm, ARM64Reg Rn,
+ ARM64Reg Rd)
+{
+ bool b64Bit = Is64Bit(Rd);
+
+ ASSERT_MSG(DYNA_REC, !(imm & ~0xFFF), "%s: immediate too large: %x", __func__, imm);
+
+ Rd = DecodeReg(Rd);
+ Rn = DecodeReg(Rn);
+ Write32((b64Bit << 31) | (op << 30) | (flags << 29) | (0x11 << 24) | (shift << 22) | (imm << 10) |
+ (Rn << 5) | Rd);
+}
+
+void ARM64XEmitter::EncodeLogicalImmInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms,
+ int n)
+{
+ // Sometimes Rd is fixed to SP, but can still be 32bit or 64bit.
+ // Use Rn to determine bitness here.
+ bool b64Bit = Is64Bit(Rn);
+
+ Rd = DecodeReg(Rd);
+ Rn = DecodeReg(Rn);
+
+ Write32((b64Bit << 31) | (op << 29) | (0x24 << 23) | (n << 22) | (immr << 16) | (imms << 10) |
+ (Rn << 5) | Rd);
+}
+
+void ARM64XEmitter::EncodeLoadStorePair(u32 op, u32 load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2,
+ ARM64Reg Rn, s32 imm)
+{
+ bool b64Bit = Is64Bit(Rt);
+ u32 type_encode = 0;
+
+ switch (type)
+ {
+ case INDEX_SIGNED:
+ type_encode = 0b010;
+ break;
+ case INDEX_POST:
+ type_encode = 0b001;
+ break;
+ case INDEX_PRE:
+ type_encode = 0b011;
+ break;
+ case INDEX_UNSIGNED:
+ ASSERT_MSG(DYNA_REC, false, "%s doesn't support INDEX_UNSIGNED!", __func__);
+ break;
+ }
+
+ if (b64Bit)
+ {
+ op |= 0b10;
+ imm >>= 3;
+ }
+ else
+ {
+ imm >>= 2;
+ }
+
+ Rt = DecodeReg(Rt);
+ Rt2 = DecodeReg(Rt2);
+ Rn = DecodeReg(Rn);
+
+ Write32((op << 30) | (0b101 << 27) | (type_encode << 23) | (load << 22) | ((imm & 0x7F) << 15) |
+ (Rt2 << 10) | (Rn << 5) | Rt);
+}
+void ARM64XEmitter::EncodeAddressInst(u32 op, ARM64Reg Rd, s32 imm)
+{
+ Rd = DecodeReg(Rd);
+
+ Write32((op << 31) | ((imm & 0x3) << 29) | (0x10 << 24) | ((imm & 0x1FFFFC) << 3) | Rd);
+}
+
+void ARM64XEmitter::EncodeLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+ ASSERT_MSG(DYNA_REC, !(imm < -256 || imm > 255), "%s received too large offset: %d", __func__,
+ imm);
+ Rt = DecodeReg(Rt);
+ Rn = DecodeReg(Rn);
+
+ Write32((size << 30) | (0b111 << 27) | (op << 22) | ((imm & 0x1FF) << 12) | (Rn << 5) | Rt);
+}
+
+static constexpr bool IsInRangeImm19(s64 distance)
+{
+ return (distance >= -0x40000 && distance <= 0x3FFFF);
+}
+
+static constexpr bool IsInRangeImm14(s64 distance)
+{
+ return (distance >= -0x2000 && distance <= 0x1FFF);
+}
+
+static constexpr bool IsInRangeImm26(s64 distance)
+{
+ return (distance >= -0x2000000 && distance <= 0x1FFFFFF);
+}
+
+static constexpr u32 MaskImm19(s64 distance)
+{
+ return distance & 0x7FFFF;
+}
+
+static constexpr u32 MaskImm14(s64 distance)
+{
+ return distance & 0x3FFF;
+}
+
+static constexpr u32 MaskImm26(s64 distance)
+{
+ return distance & 0x3FFFFFF;
+}
+
+// FixupBranch branching
+void ARM64XEmitter::SetJumpTarget(FixupBranch const& branch)
+{
+ bool Not = false;
+ u32 inst = 0;
+ s64 distance = (s64)(m_code - branch.ptr);
+ distance >>= 2;
+
+ switch (branch.type)
+ {
+ case 1: // CBNZ
+ Not = true;
+ case 0: // CBZ
+ {
+ ASSERT_MSG(DYNA_REC, IsInRangeImm19(distance), "%s(%d): Received too large distance: %" PRIx64,
+ __func__, branch.type, distance);
+ bool b64Bit = Is64Bit(branch.reg);
+ ARM64Reg reg = DecodeReg(branch.reg);
+ inst = (b64Bit << 31) | (0x1A << 25) | (Not << 24) | (MaskImm19(distance) << 5) | reg;
+ }
+ break;
+ case 2: // B (conditional)
+ ASSERT_MSG(DYNA_REC, IsInRangeImm19(distance), "%s(%d): Received too large distance: %" PRIx64,
+ __func__, branch.type, distance);
+ inst = (0x2A << 25) | (MaskImm19(distance) << 5) | branch.cond;
+ break;
+ case 4: // TBNZ
+ Not = true;
+ case 3: // TBZ
+ {
+ ASSERT_MSG(DYNA_REC, IsInRangeImm14(distance), "%s(%d): Received too large distance: %" PRIx64,
+ __func__, branch.type, distance);
+ ARM64Reg reg = DecodeReg(branch.reg);
+ inst = ((branch.bit & 0x20) << 26) | (0x1B << 25) | (Not << 24) | ((branch.bit & 0x1F) << 19) |
+ (MaskImm14(distance) << 5) | reg;
+ }
+ break;
+ case 5: // B (uncoditional)
+ ASSERT_MSG(DYNA_REC, IsInRangeImm26(distance), "%s(%d): Received too large distance: %" PRIx64,
+ __func__, branch.type, distance);
+ inst = (0x5 << 26) | MaskImm26(distance);
+ break;
+ case 6: // BL (unconditional)
+ ASSERT_MSG(DYNA_REC, IsInRangeImm26(distance), "%s(%d): Received too large distance: %" PRIx64,
+ __func__, branch.type, distance);
+ inst = (0x25 << 26) | MaskImm26(distance);
+ break;
+ }
+
+ std::memcpy(m_rwbase + branch.ptr, &inst, sizeof(inst));
+}
+
+FixupBranch ARM64XEmitter::CBZ(ARM64Reg Rt)
+{
+ FixupBranch branch;
+ branch.ptr = m_code;
+ branch.type = 0;
+ branch.reg = Rt;
+ HINT(HINT_NOP);
+ return branch;
+}
+FixupBranch ARM64XEmitter::CBNZ(ARM64Reg Rt)
+{
+ FixupBranch branch;
+ branch.ptr = m_code;
+ branch.type = 1;
+ branch.reg = Rt;
+ HINT(HINT_NOP);
+ return branch;
+}
+FixupBranch ARM64XEmitter::B(CCFlags cond)
+{
+ FixupBranch branch;
+ branch.ptr = m_code;
+ branch.type = 2;
+ branch.cond = cond;
+ HINT(HINT_NOP);
+ return branch;
+}
+FixupBranch ARM64XEmitter::TBZ(ARM64Reg Rt, u8 bit)
+{
+ FixupBranch branch;
+ branch.ptr = m_code;
+ branch.type = 3;
+ branch.reg = Rt;
+ branch.bit = bit;
+ HINT(HINT_NOP);
+ return branch;
+}
+FixupBranch ARM64XEmitter::TBNZ(ARM64Reg Rt, u8 bit)
+{
+ FixupBranch branch;
+ branch.ptr = m_code;
+ branch.type = 4;
+ branch.reg = Rt;
+ branch.bit = bit;
+ HINT(HINT_NOP);
+ return branch;
+}
+FixupBranch ARM64XEmitter::B()
+{
+ FixupBranch branch;
+ branch.ptr = m_code;
+ branch.type = 5;
+ HINT(HINT_NOP);
+ return branch;
+}
+FixupBranch ARM64XEmitter::BL()
+{
+ FixupBranch branch;
+ branch.ptr = m_code;
+ branch.type = 6;
+ HINT(HINT_NOP);
+ return branch;
+}
+
+// Compare and Branch
+void ARM64XEmitter::CBZ(ARM64Reg Rt, const void* ptr)
+{
+ EncodeCompareBranchInst(0, Rt, ptr);
+}
+void ARM64XEmitter::CBNZ(ARM64Reg Rt, const void* ptr)
+{
+ EncodeCompareBranchInst(1, Rt, ptr);
+}
+
+// Conditional Branch
+void ARM64XEmitter::B(CCFlags cond, const void* ptr)
+{
+ s64 distance = (s64)ptr - (s64)(m_rxbase + m_code);
+
+ distance >>= 2;
+
+ ASSERT_MSG(DYNA_REC, IsInRangeImm19(distance),
+ "%s: Received too large distance: %p->%p %" PRIi64 " %" PRIx64, __func__, m_execcode, ptr,
+ distance, distance);
+ Write32((0x54 << 24) | (MaskImm19(distance) << 5) | cond);
+}
+
+// Test and Branch
+void ARM64XEmitter::TBZ(ARM64Reg Rt, u8 bits, const void* ptr)
+{
+ EncodeTestBranchInst(0, Rt, bits, ptr);
+}
+void ARM64XEmitter::TBNZ(ARM64Reg Rt, u8 bits, const void* ptr)
+{
+ EncodeTestBranchInst(1, Rt, bits, ptr);
+}
+
+// Unconditional Branch
+void ARM64XEmitter::B(const void* ptr)
+{
+ EncodeUnconditionalBranchInst(0, ptr);
+}
+void ARM64XEmitter::BL(const void* ptr)
+{
+ EncodeUnconditionalBranchInst(1, ptr);
+}
+
+void ARM64XEmitter::QuickCallFunction(ARM64Reg scratchreg, const void* func)
+{
+ s64 distance = (s64)func - (s64)(m_rxbase + m_code);
+ distance >>= 2; // Can only branch to opcode-aligned (4) addresses
+ if (!IsInRangeImm26(distance))
+ {
+ // WARN_LOG(DYNA_REC, "Distance too far in function call (%p to %p)! Using scratch.", m_code,
+ // func);
+ MOVI2R(scratchreg, (uintptr_t)func);
+ BLR(scratchreg);
+ }
+ else
+ {
+ BL(func);
+ }
+}
+
+void ARM64XEmitter::QuickTailCall(ARM64Reg scratchreg, const void* func)
+{
+ s64 distance = (s64)func - (s64)(m_rxbase + m_code);
+ distance >>= 2; // Can only branch to opcode-aligned (4) addresses
+ if (!IsInRangeImm26(distance))
+ {
+ // WARN_LOG(DYNA_REC, "Distance too far in function call (%p to %p)! Using scratch.", m_code,
+ // func);
+ MOVI2R(scratchreg, (uintptr_t)func);
+ BR(scratchreg);
+ }
+ else
+ {
+ B(func);
+ }
+}
+
+// Unconditional Branch (register)
+void ARM64XEmitter::BR(ARM64Reg Rn)
+{
+ EncodeUnconditionalBranchInst(0, 0x1F, 0, 0, Rn);
+}
+void ARM64XEmitter::BLR(ARM64Reg Rn)
+{
+ EncodeUnconditionalBranchInst(1, 0x1F, 0, 0, Rn);
+}
+void ARM64XEmitter::RET(ARM64Reg Rn)
+{
+ EncodeUnconditionalBranchInst(2, 0x1F, 0, 0, Rn);
+}
+void ARM64XEmitter::ERET()
+{
+ EncodeUnconditionalBranchInst(4, 0x1F, 0, 0, SP);
+}
+void ARM64XEmitter::DRPS()
+{
+ EncodeUnconditionalBranchInst(5, 0x1F, 0, 0, SP);
+}
+
+// Exception generation
+void ARM64XEmitter::SVC(u32 imm)
+{
+ EncodeExceptionInst(0, imm);
+}
+
+void ARM64XEmitter::HVC(u32 imm)
+{
+ EncodeExceptionInst(1, imm);
+}
+
+void ARM64XEmitter::SMC(u32 imm)
+{
+ EncodeExceptionInst(2, imm);
+}
+
+void ARM64XEmitter::BRK(u32 imm)
+{
+ EncodeExceptionInst(3, imm);
+}
+
+void ARM64XEmitter::HLT(u32 imm)
+{
+ EncodeExceptionInst(4, imm);
+}
+
+void ARM64XEmitter::DCPS1(u32 imm)
+{
+ EncodeExceptionInst(5, imm);
+}
+
+void ARM64XEmitter::DCPS2(u32 imm)
+{
+ EncodeExceptionInst(6, imm);
+}
+
+void ARM64XEmitter::DCPS3(u32 imm)
+{
+ EncodeExceptionInst(7, imm);
+}
+
+// System
+void ARM64XEmitter::_MSR(PStateField field, u8 imm)
+{
+ u32 op1 = 0, op2 = 0;
+ switch (field)
+ {
+ case FIELD_SPSel:
+ op1 = 0;
+ op2 = 5;
+ break;
+ case FIELD_DAIFSet:
+ op1 = 3;
+ op2 = 6;
+ break;
+ case FIELD_DAIFClr:
+ op1 = 3;
+ op2 = 7;
+ break;
+ default:
+ ASSERT_MSG(DYNA_REC, false, "Invalid PStateField to do a imm move to");
+ break;
+ }
+ EncodeSystemInst(0, op1, 4, imm, op2, WSP);
+}
+
+static void GetSystemReg(PStateField field, int& o0, int& op1, int& CRn, int& CRm, int& op2)
+{
+ switch (field)
+ {
+ case FIELD_NZCV:
+ o0 = 3;
+ op1 = 3;
+ CRn = 4;
+ CRm = 2;
+ op2 = 0;
+ break;
+ case FIELD_FPCR:
+ o0 = 3;
+ op1 = 3;
+ CRn = 4;
+ CRm = 4;
+ op2 = 0;
+ break;
+ case FIELD_FPSR:
+ o0 = 3;
+ op1 = 3;
+ CRn = 4;
+ CRm = 4;
+ op2 = 1;
+ break;
+ case FIELD_PMCR_EL0:
+ o0 = 3;
+ op1 = 3;
+ CRn = 9;
+ CRm = 6;
+ op2 = 0;
+ break;
+ case FIELD_PMCCNTR_EL0:
+ o0 = 3;
+ op1 = 3;
+ CRn = 9;
+ CRm = 7;
+ op2 = 0;
+ break;
+ default:
+ ASSERT_MSG(DYNA_REC, false, "Invalid PStateField to do a register move from/to");
+ break;
+ }
+}
+
+void ARM64XEmitter::_MSR(PStateField field, ARM64Reg Rt)
+{
+ int o0 = 0, op1 = 0, CRn = 0, CRm = 0, op2 = 0;
+ ASSERT_MSG(DYNA_REC, Is64Bit(Rt), "MSR: Rt must be 64-bit");
+ GetSystemReg(field, o0, op1, CRn, CRm, op2);
+ EncodeSystemInst(o0, op1, CRn, CRm, op2, DecodeReg(Rt));
+}
+
+void ARM64XEmitter::MRS(ARM64Reg Rt, PStateField field)
+{
+ int o0 = 0, op1 = 0, CRn = 0, CRm = 0, op2 = 0;
+ ASSERT_MSG(DYNA_REC, Is64Bit(Rt), "MRS: Rt must be 64-bit");
+ GetSystemReg(field, o0, op1, CRn, CRm, op2);
+ EncodeSystemInst(o0 | 4, op1, CRn, CRm, op2, DecodeReg(Rt));
+}
+
+void ARM64XEmitter::CNTVCT(Arm64Gen::ARM64Reg Rt)
+{
+ ASSERT_MSG(DYNA_REC, Is64Bit(Rt), "CNTVCT: Rt must be 64-bit");
+
+ // MRS <Xt>, CNTVCT_EL0 ; Read CNTVCT_EL0 into Xt
+ EncodeSystemInst(3 | 4, 3, 0xe, 0, 2, DecodeReg(Rt));
+}
+
+void ARM64XEmitter::HINT(SystemHint op)
+{
+ EncodeSystemInst(0, 3, 2, 0, op, WSP);
+}
+void ARM64XEmitter::CLREX()
+{
+ EncodeSystemInst(0, 3, 3, 0, 2, WSP);
+}
+void ARM64XEmitter::DSB(BarrierType type)
+{
+ EncodeSystemInst(0, 3, 3, type, 4, WSP);
+}
+void ARM64XEmitter::DMB(BarrierType type)
+{
+ EncodeSystemInst(0, 3, 3, type, 5, WSP);
+}
+void ARM64XEmitter::ISB(BarrierType type)
+{
+ EncodeSystemInst(0, 3, 3, type, 6, WSP);
+}
+
+// Add/Subtract (extended register)
+void ARM64XEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ ADD(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0));
+}
+
+void ARM64XEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option)
+{
+ EncodeArithmeticInst(0, false, Rd, Rn, Rm, Option);
+}
+
+void ARM64XEmitter::ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeArithmeticInst(0, true, Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0));
+}
+
+void ARM64XEmitter::ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option)
+{
+ EncodeArithmeticInst(0, true, Rd, Rn, Rm, Option);
+}
+
+void ARM64XEmitter::SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ SUB(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0));
+}
+
+void ARM64XEmitter::SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option)
+{
+ EncodeArithmeticInst(1, false, Rd, Rn, Rm, Option);
+}
+
+void ARM64XEmitter::SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeArithmeticInst(1, true, Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0));
+}
+
+void ARM64XEmitter::SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option)
+{
+ EncodeArithmeticInst(1, true, Rd, Rn, Rm, Option);
+}
+
+void ARM64XEmitter::CMN(ARM64Reg Rn, ARM64Reg Rm)
+{
+ CMN(Rn, Rm, ArithOption(Rn, ST_LSL, 0));
+}
+
+void ARM64XEmitter::CMN(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option)
+{
+ EncodeArithmeticInst(0, true, Is64Bit(Rn) ? ZR : WZR, Rn, Rm, Option);
+}
+
+void ARM64XEmitter::CMP(ARM64Reg Rn, ARM64Reg Rm)
+{
+ CMP(Rn, Rm, ArithOption(Rn, ST_LSL, 0));
+}
+
+void ARM64XEmitter::CMP(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option)
+{
+ EncodeArithmeticInst(1, true, Is64Bit(Rn) ? ZR : WZR, Rn, Rm, Option);
+}
+
+// Add/Subtract (with carry)
+void ARM64XEmitter::ADC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeArithmeticCarryInst(0, false, Rd, Rn, Rm);
+}
+void ARM64XEmitter::ADCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeArithmeticCarryInst(0, true, Rd, Rn, Rm);
+}
+void ARM64XEmitter::SBC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeArithmeticCarryInst(1, false, Rd, Rn, Rm);
+}
+void ARM64XEmitter::SBCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeArithmeticCarryInst(1, true, Rd, Rn, Rm);
+}
+
+// Conditional Compare (immediate)
+void ARM64XEmitter::CCMN(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond)
+{
+ EncodeCondCompareImmInst(0, Rn, imm, nzcv, cond);
+}
+void ARM64XEmitter::CCMP(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond)
+{
+ EncodeCondCompareImmInst(1, Rn, imm, nzcv, cond);
+}
+
+// Conditiona Compare (register)
+void ARM64XEmitter::CCMN(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond)
+{
+ EncodeCondCompareRegInst(0, Rn, Rm, nzcv, cond);
+}
+void ARM64XEmitter::CCMP(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond)
+{
+ EncodeCondCompareRegInst(1, Rn, Rm, nzcv, cond);
+}
+
+// Conditional Select
+void ARM64XEmitter::CSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond)
+{
+ EncodeCondSelectInst(0, Rd, Rn, Rm, cond);
+}
+void ARM64XEmitter::CSINC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond)
+{
+ EncodeCondSelectInst(1, Rd, Rn, Rm, cond);
+}
+void ARM64XEmitter::CSINV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond)
+{
+ EncodeCondSelectInst(2, Rd, Rn, Rm, cond);
+}
+void ARM64XEmitter::CSNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond)
+{
+ EncodeCondSelectInst(3, Rd, Rn, Rm, cond);
+}
+
+// Data-Processing 1 source
+void ARM64XEmitter::RBIT(ARM64Reg Rd, ARM64Reg Rn)
+{
+ EncodeData1SrcInst(0, Rd, Rn);
+}
+void ARM64XEmitter::REV16(ARM64Reg Rd, ARM64Reg Rn)
+{
+ EncodeData1SrcInst(1, Rd, Rn);
+}
+void ARM64XEmitter::REV32(ARM64Reg Rd, ARM64Reg Rn)
+{
+ EncodeData1SrcInst(2, Rd, Rn);
+}
+void ARM64XEmitter::REV64(ARM64Reg Rd, ARM64Reg Rn)
+{
+ EncodeData1SrcInst(3, Rd, Rn);
+}
+void ARM64XEmitter::CLZ(ARM64Reg Rd, ARM64Reg Rn)
+{
+ EncodeData1SrcInst(4, Rd, Rn);
+}
+void ARM64XEmitter::CLS(ARM64Reg Rd, ARM64Reg Rn)
+{
+ EncodeData1SrcInst(5, Rd, Rn);
+}
+
+// Data-Processing 2 source
+void ARM64XEmitter::UDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData2SrcInst(0, Rd, Rn, Rm);
+}
+void ARM64XEmitter::SDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData2SrcInst(1, Rd, Rn, Rm);
+}
+void ARM64XEmitter::LSLV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData2SrcInst(2, Rd, Rn, Rm);
+}
+void ARM64XEmitter::LSRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData2SrcInst(3, Rd, Rn, Rm);
+}
+void ARM64XEmitter::ASRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData2SrcInst(4, Rd, Rn, Rm);
+}
+void ARM64XEmitter::RORV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData2SrcInst(5, Rd, Rn, Rm);
+}
+void ARM64XEmitter::CRC32B(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData2SrcInst(6, Rd, Rn, Rm);
+}
+void ARM64XEmitter::CRC32H(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData2SrcInst(7, Rd, Rn, Rm);
+}
+void ARM64XEmitter::CRC32W(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData2SrcInst(8, Rd, Rn, Rm);
+}
+void ARM64XEmitter::CRC32CB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData2SrcInst(9, Rd, Rn, Rm);
+}
+void ARM64XEmitter::CRC32CH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData2SrcInst(10, Rd, Rn, Rm);
+}
+void ARM64XEmitter::CRC32CW(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData2SrcInst(11, Rd, Rn, Rm);
+}
+void ARM64XEmitter::CRC32X(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData2SrcInst(12, Rd, Rn, Rm);
+}
+void ARM64XEmitter::CRC32CX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData2SrcInst(13, Rd, Rn, Rm);
+}
+
+// Data-Processing 3 source
+void ARM64XEmitter::MADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra)
+{
+ EncodeData3SrcInst(0, Rd, Rn, Rm, Ra);
+}
+void ARM64XEmitter::MSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra)
+{
+ EncodeData3SrcInst(1, Rd, Rn, Rm, Ra);
+}
+void ARM64XEmitter::SMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra)
+{
+ EncodeData3SrcInst(2, Rd, Rn, Rm, Ra);
+}
+void ARM64XEmitter::SMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ SMADDL(Rd, Rn, Rm, SP);
+}
+void ARM64XEmitter::SMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra)
+{
+ EncodeData3SrcInst(3, Rd, Rn, Rm, Ra);
+}
+void ARM64XEmitter::SMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData3SrcInst(4, Rd, Rn, Rm, SP);
+}
+void ARM64XEmitter::UMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra)
+{
+ EncodeData3SrcInst(5, Rd, Rn, Rm, Ra);
+}
+void ARM64XEmitter::UMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ UMADDL(Rd, Rn, Rm, SP);
+}
+void ARM64XEmitter::UMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra)
+{
+ EncodeData3SrcInst(6, Rd, Rn, Rm, Ra);
+}
+void ARM64XEmitter::UMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData3SrcInst(7, Rd, Rn, Rm, SP);
+}
+void ARM64XEmitter::MUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData3SrcInst(0, Rd, Rn, Rm, SP);
+}
+void ARM64XEmitter::MNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EncodeData3SrcInst(1, Rd, Rn, Rm, SP);
+}
+
+// Logical (shifted register)
+void ARM64XEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift)
+{
+ EncodeLogicalInst(0, Rd, Rn, Rm, Shift);
+}
+void ARM64XEmitter::BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift)
+{
+ EncodeLogicalInst(1, Rd, Rn, Rm, Shift);
+}
+void ARM64XEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift)
+{
+ EncodeLogicalInst(2, Rd, Rn, Rm, Shift);
+}
+void ARM64XEmitter::ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift)
+{
+ EncodeLogicalInst(3, Rd, Rn, Rm, Shift);
+}
+void ARM64XEmitter::EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift)
+{
+ EncodeLogicalInst(4, Rd, Rn, Rm, Shift);
+}
+void ARM64XEmitter::EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift)
+{
+ EncodeLogicalInst(5, Rd, Rn, Rm, Shift);
+}
+void ARM64XEmitter::ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift)
+{
+ EncodeLogicalInst(6, Rd, Rn, Rm, Shift);
+}
+void ARM64XEmitter::BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift)
+{
+ EncodeLogicalInst(7, Rd, Rn, Rm, Shift);
+}
+
+void ARM64XEmitter::MOV(ARM64Reg Rd, ARM64Reg Rm, ArithOption Shift)
+{
+ ORR(Rd, Is64Bit(Rd) ? ZR : WZR, Rm, Shift);
+}
+
+void ARM64XEmitter::MOV(ARM64Reg Rd, ARM64Reg Rm)
+{
+ if (IsGPR(Rd) && IsGPR(Rm))
+ ORR(Rd, Is64Bit(Rd) ? ZR : WZR, Rm, ArithOption(Rm, ST_LSL, 0));
+ else
+ ASSERT_MSG(DYNA_REC, false, "Non-GPRs not supported in MOV");
+}
+void ARM64XEmitter::MVN(ARM64Reg Rd, ARM64Reg Rm)
+{
+ ORN(Rd, Is64Bit(Rd) ? ZR : WZR, Rm, ArithOption(Rm, ST_LSL, 0));
+}
+void ARM64XEmitter::LSL(ARM64Reg Rd, ARM64Reg Rm, int shift)
+{
+ int bits = Is64Bit(Rd) ? 64 : 32;
+ UBFM(Rd, Rm, (bits - shift) & (bits - 1), bits - shift - 1);
+}
+void ARM64XEmitter::LSR(ARM64Reg Rd, ARM64Reg Rm, int shift)
+{
+ int bits = Is64Bit(Rd) ? 64 : 32;
+ UBFM(Rd, Rm, shift, bits - 1);
+}
+void ARM64XEmitter::ASR(ARM64Reg Rd, ARM64Reg Rm, int shift)
+{
+ int bits = Is64Bit(Rd) ? 64 : 32;
+ SBFM(Rd, Rm, shift, bits - 1);
+}
+void ARM64XEmitter::ROR_(ARM64Reg Rd, ARM64Reg Rm, int shift)
+{
+ EXTR(Rd, Rm, Rm, shift);
+}
+
+// Logical (immediate)
+void ARM64XEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert)
+{
+ EncodeLogicalImmInst(0, Rd, Rn, immr, imms, invert);
+}
+void ARM64XEmitter::ANDS(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert)
+{
+ EncodeLogicalImmInst(3, Rd, Rn, immr, imms, invert);
+}
+void ARM64XEmitter::EOR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert)
+{
+ EncodeLogicalImmInst(2, Rd, Rn, immr, imms, invert);
+}
+void ARM64XEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert)
+{
+ EncodeLogicalImmInst(1, Rd, Rn, immr, imms, invert);
+}
+void ARM64XEmitter::TST(ARM64Reg Rn, u32 immr, u32 imms, bool invert)
+{
+ EncodeLogicalImmInst(3, Is64Bit(Rn) ? ZR : WZR, Rn, immr, imms, invert);
+}
+
+// Add/subtract (immediate)
+void ARM64XEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift)
+{
+ EncodeAddSubImmInst(0, false, shift, imm, Rn, Rd);
+}
+void ARM64XEmitter::ADDS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift)
+{
+ EncodeAddSubImmInst(0, true, shift, imm, Rn, Rd);
+}
+void ARM64XEmitter::SUB(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift)
+{
+ EncodeAddSubImmInst(1, false, shift, imm, Rn, Rd);
+}
+void ARM64XEmitter::SUBS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift)
+{
+ EncodeAddSubImmInst(1, true, shift, imm, Rn, Rd);
+}
+void ARM64XEmitter::CMP(ARM64Reg Rn, u32 imm, bool shift)
+{
+ EncodeAddSubImmInst(1, true, shift, imm, Rn, Is64Bit(Rn) ? SP : WSP);
+}
+
+// Data Processing (Immediate)
+void ARM64XEmitter::MOVZ(ARM64Reg Rd, u32 imm, ShiftAmount pos)
+{
+ EncodeMOVWideInst(2, Rd, imm, pos);
+}
+void ARM64XEmitter::MOVN(ARM64Reg Rd, u32 imm, ShiftAmount pos)
+{
+ EncodeMOVWideInst(0, Rd, imm, pos);
+}
+void ARM64XEmitter::MOVK(ARM64Reg Rd, u32 imm, ShiftAmount pos)
+{
+ EncodeMOVWideInst(3, Rd, imm, pos);
+}
+
+// Bitfield move
+void ARM64XEmitter::BFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms)
+{
+ EncodeBitfieldMOVInst(1, Rd, Rn, immr, imms);
+}
+void ARM64XEmitter::SBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms)
+{
+ EncodeBitfieldMOVInst(0, Rd, Rn, immr, imms);
+}
+void ARM64XEmitter::UBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms)
+{
+ EncodeBitfieldMOVInst(2, Rd, Rn, immr, imms);
+}
+
+void ARM64XEmitter::BFI(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width)
+{
+ u32 size = Is64Bit(Rn) ? 64 : 32;
+ ASSERT_MSG(DYNA_REC, (lsb + width) <= size,
+ "%s passed lsb %d and width %d which is greater than the register size!", __func__,
+ lsb, width);
+ EncodeBitfieldMOVInst(1, Rd, Rn, (size - lsb) % size, width - 1);
+}
+void ARM64XEmitter::UBFIZ(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width)
+{
+ u32 size = Is64Bit(Rn) ? 64 : 32;
+ ASSERT_MSG(DYNA_REC, (lsb + width) <= size,
+ "%s passed lsb %d and width %d which is greater than the register size!", __func__,
+ lsb, width);
+ EncodeBitfieldMOVInst(2, Rd, Rn, (size - lsb) % size, width - 1);
+}
+void ARM64XEmitter::EXTR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u32 shift)
+{
+ bool sf = Is64Bit(Rd);
+ bool N = sf;
+ Rd = DecodeReg(Rd);
+ Rn = DecodeReg(Rn);
+ Rm = DecodeReg(Rm);
+ Write32((sf << 31) | (0x27 << 23) | (N << 22) | (Rm << 16) | (shift << 10) | (Rm << 5) | Rd);
+}
+void ARM64XEmitter::SXTB(ARM64Reg Rd, ARM64Reg Rn)
+{
+ SBFM(Rd, Rn, 0, 7);
+}
+void ARM64XEmitter::SXTH(ARM64Reg Rd, ARM64Reg Rn)
+{
+ SBFM(Rd, Rn, 0, 15);
+}
+void ARM64XEmitter::SXTW(ARM64Reg Rd, ARM64Reg Rn)
+{
+ ASSERT_MSG(DYNA_REC, Is64Bit(Rd), "%s requires 64bit register as destination", __func__);
+ SBFM(Rd, Rn, 0, 31);
+}
+void ARM64XEmitter::UXTB(ARM64Reg Rd, ARM64Reg Rn)
+{
+ UBFM(Rd, Rn, 0, 7);
+}
+void ARM64XEmitter::UXTH(ARM64Reg Rd, ARM64Reg Rn)
+{
+ UBFM(Rd, Rn, 0, 15);
+}
+
+// Load Register (Literal)
+void ARM64XEmitter::LDR(ARM64Reg Rt, u32 imm)
+{
+ EncodeLoadRegisterInst(0, Rt, imm);
+}
+void ARM64XEmitter::LDRSW(ARM64Reg Rt, u32 imm)
+{
+ EncodeLoadRegisterInst(2, Rt, imm);
+}
+void ARM64XEmitter::PRFM(ARM64Reg Rt, u32 imm)
+{
+ EncodeLoadRegisterInst(3, Rt, imm);
+}
+
+// Load/Store pair
+void ARM64XEmitter::LDP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm)
+{
+ EncodeLoadStorePair(0, 1, type, Rt, Rt2, Rn, imm);
+}
+void ARM64XEmitter::LDPSW(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm)
+{
+ EncodeLoadStorePair(1, 1, type, Rt, Rt2, Rn, imm);
+}
+void ARM64XEmitter::STP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm)
+{
+ EncodeLoadStorePair(0, 0, type, Rt, Rt2, Rn, imm);
+}
+
+// Load/Store Exclusive
+void ARM64XEmitter::STXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(0, Rs, SP, Rt, Rn);
+}
+void ARM64XEmitter::STLXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(1, Rs, SP, Rt, Rn);
+}
+void ARM64XEmitter::LDXRB(ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(2, SP, SP, Rt, Rn);
+}
+void ARM64XEmitter::LDAXRB(ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(3, SP, SP, Rt, Rn);
+}
+void ARM64XEmitter::STLRB(ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(4, SP, SP, Rt, Rn);
+}
+void ARM64XEmitter::LDARB(ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(5, SP, SP, Rt, Rn);
+}
+void ARM64XEmitter::STXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(6, Rs, SP, Rt, Rn);
+}
+void ARM64XEmitter::STLXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(7, Rs, SP, Rt, Rn);
+}
+void ARM64XEmitter::LDXRH(ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(8, SP, SP, Rt, Rn);
+}
+void ARM64XEmitter::LDAXRH(ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(9, SP, SP, Rt, Rn);
+}
+void ARM64XEmitter::STLRH(ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(10, SP, SP, Rt, Rn);
+}
+void ARM64XEmitter::LDARH(ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(11, SP, SP, Rt, Rn);
+}
+void ARM64XEmitter::STXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(12 + Is64Bit(Rt), Rs, SP, Rt, Rn);
+}
+void ARM64XEmitter::STLXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(14 + Is64Bit(Rt), Rs, SP, Rt, Rn);
+}
+void ARM64XEmitter::STXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(16 + Is64Bit(Rt), Rs, Rt2, Rt, Rn);
+}
+void ARM64XEmitter::STLXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(18 + Is64Bit(Rt), Rs, Rt2, Rt, Rn);
+}
+void ARM64XEmitter::LDXR(ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(20 + Is64Bit(Rt), SP, SP, Rt, Rn);
+}
+void ARM64XEmitter::LDAXR(ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(22 + Is64Bit(Rt), SP, SP, Rt, Rn);
+}
+void ARM64XEmitter::LDXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(24 + Is64Bit(Rt), SP, Rt2, Rt, Rn);
+}
+void ARM64XEmitter::LDAXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(26 + Is64Bit(Rt), SP, Rt2, Rt, Rn);
+}
+void ARM64XEmitter::STLR(ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(28 + Is64Bit(Rt), SP, SP, Rt, Rn);
+}
+void ARM64XEmitter::LDAR(ARM64Reg Rt, ARM64Reg Rn)
+{
+ EncodeLoadStoreExcInst(30 + Is64Bit(Rt), SP, SP, Rt, Rn);
+}
+
+// Load/Store no-allocate pair (offset)
+void ARM64XEmitter::STNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm)
+{
+ EncodeLoadStorePairedInst(0xA0, Rt, Rt2, Rn, imm);
+}
+void ARM64XEmitter::LDNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm)
+{
+ EncodeLoadStorePairedInst(0xA1, Rt, Rt2, Rn, imm);
+}
+
+// Load/Store register (immediate post-indexed)
+// XXX: Most of these support vectors
+void ARM64XEmitter::STRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+ if (type == INDEX_UNSIGNED)
+ EncodeLoadStoreIndexedInst(0x0E4, Rt, Rn, imm, 8);
+ else
+ EncodeLoadStoreIndexedInst(0x0E0, type == INDEX_POST ? 1 : 3, Rt, Rn, imm);
+}
+void ARM64XEmitter::LDRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+ if (type == INDEX_UNSIGNED)
+ EncodeLoadStoreIndexedInst(0x0E5, Rt, Rn, imm, 8);
+ else
+ EncodeLoadStoreIndexedInst(0x0E1, type == INDEX_POST ? 1 : 3, Rt, Rn, imm);
+}
+void ARM64XEmitter::LDRSB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+ if (type == INDEX_UNSIGNED)
+ EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x0E6 : 0x0E7, Rt, Rn, imm, 8);
+ else
+ EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x0E2 : 0x0E3, type == INDEX_POST ? 1 : 3, Rt, Rn,
+ imm);
+}
+void ARM64XEmitter::STRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+ if (type == INDEX_UNSIGNED)
+ EncodeLoadStoreIndexedInst(0x1E4, Rt, Rn, imm, 16);
+ else
+ EncodeLoadStoreIndexedInst(0x1E0, type == INDEX_POST ? 1 : 3, Rt, Rn, imm);
+}
+void ARM64XEmitter::LDRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+ if (type == INDEX_UNSIGNED)
+ EncodeLoadStoreIndexedInst(0x1E5, Rt, Rn, imm, 16);
+ else
+ EncodeLoadStoreIndexedInst(0x1E1, type == INDEX_POST ? 1 : 3, Rt, Rn, imm);
+}
+void ARM64XEmitter::LDRSH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+ if (type == INDEX_UNSIGNED)
+ EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x1E6 : 0x1E7, Rt, Rn, imm, 16);
+ else
+ EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x1E2 : 0x1E3, type == INDEX_POST ? 1 : 3, Rt, Rn,
+ imm);
+}
+void ARM64XEmitter::STR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+ if (type == INDEX_UNSIGNED)
+ EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E4 : 0x2E4, Rt, Rn, imm, Is64Bit(Rt) ? 64 : 32);
+ else
+ EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E0 : 0x2E0, type == INDEX_POST ? 1 : 3, Rt, Rn,
+ imm);
+}
+void ARM64XEmitter::LDR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+ if (type == INDEX_UNSIGNED)
+ EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E5 : 0x2E5, Rt, Rn, imm, Is64Bit(Rt) ? 64 : 32);
+ else
+ EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E1 : 0x2E1, type == INDEX_POST ? 1 : 3, Rt, Rn,
+ imm);
+}
+void ARM64XEmitter::LDRSW(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+ if (type == INDEX_UNSIGNED)
+ EncodeLoadStoreIndexedInst(0x2E6, Rt, Rn, imm, 32);
+ else
+ EncodeLoadStoreIndexedInst(0x2E2, type == INDEX_POST ? 1 : 3, Rt, Rn, imm);
+}
+
+// Load/Store register (register offset)
+void ARM64XEmitter::STRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
+{
+ EncodeLoadStoreRegisterOffset(0, 0, Rt, Rn, Rm);
+}
+void ARM64XEmitter::LDRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
+{
+ EncodeLoadStoreRegisterOffset(0, 1, Rt, Rn, Rm);
+}
+void ARM64XEmitter::LDRSB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
+{
+ bool b64Bit = Is64Bit(Rt);
+ EncodeLoadStoreRegisterOffset(0, 3 - b64Bit, Rt, Rn, Rm);
+}
+void ARM64XEmitter::STRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
+{
+ EncodeLoadStoreRegisterOffset(1, 0, Rt, Rn, Rm);
+}
+void ARM64XEmitter::LDRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
+{
+ EncodeLoadStoreRegisterOffset(1, 1, Rt, Rn, Rm);
+}
+void ARM64XEmitter::LDRSH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
+{
+ bool b64Bit = Is64Bit(Rt);
+ EncodeLoadStoreRegisterOffset(1, 3 - b64Bit, Rt, Rn, Rm);
+}
+void ARM64XEmitter::STR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
+{
+ bool b64Bit = Is64Bit(Rt);
+ EncodeLoadStoreRegisterOffset(2 + b64Bit, 0, Rt, Rn, Rm);
+}
+void ARM64XEmitter::LDR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
+{
+ bool b64Bit = Is64Bit(Rt);
+ EncodeLoadStoreRegisterOffset(2 + b64Bit, 1, Rt, Rn, Rm);
+}
+void ARM64XEmitter::LDRSW(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
+{
+ EncodeLoadStoreRegisterOffset(2, 2, Rt, Rn, Rm);
+}
+void ARM64XEmitter::PRFM(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
+{
+ EncodeLoadStoreRegisterOffset(3, 2, Rt, Rn, Rm);
+}
+
+// Load/Store register (unscaled offset)
+void ARM64XEmitter::STURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+ EncodeLoadStoreUnscaled(0, 0, Rt, Rn, imm);
+}
+void ARM64XEmitter::LDURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+ EncodeLoadStoreUnscaled(0, 1, Rt, Rn, imm);
+}
+void ARM64XEmitter::LDURSB(ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+ EncodeLoadStoreUnscaled(0, Is64Bit(Rt) ? 2 : 3, Rt, Rn, imm);
+}
+void ARM64XEmitter::STURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+ EncodeLoadStoreUnscaled(1, 0, Rt, Rn, imm);
+}
+void ARM64XEmitter::LDURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+ EncodeLoadStoreUnscaled(1, 1, Rt, Rn, imm);
+}
+void ARM64XEmitter::LDURSH(ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+ EncodeLoadStoreUnscaled(1, Is64Bit(Rt) ? 2 : 3, Rt, Rn, imm);
+}
+void ARM64XEmitter::STUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+ EncodeLoadStoreUnscaled(Is64Bit(Rt) ? 3 : 2, 0, Rt, Rn, imm);
+}
+void ARM64XEmitter::LDUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+ EncodeLoadStoreUnscaled(Is64Bit(Rt) ? 3 : 2, 1, Rt, Rn, imm);
+}
+void ARM64XEmitter::LDURSW(ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+ ASSERT_MSG(DYNA_REC, !Is64Bit(Rt), "%s must have a 64bit destination register!", __func__);
+ EncodeLoadStoreUnscaled(2, 2, Rt, Rn, imm);
+}
+
+void ARM64XEmitter::LDRGeneric(int size, bool signExtend, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
+{
+ switch (size | signExtend)
+ {
+ case 32: LDR (Rt, Rn, Rm); break;
+ case 33: LDRSW(Rt, Rn, Rm); break;
+ case 16: LDRH (Rt, Rn, Rm); break;
+ case 17: LDRSH(Rt, Rn, Rm); break;
+ case 8: LDRB (Rt, Rn, Rm); break;
+ case 9: LDRSB(Rt, Rn, Rm); break;
+ default: PanicAlert("LDRGeneric(reg): invalid size %d", size); break;
+ }
+}
+void ARM64XEmitter::STRGeneric(int size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
+{
+ switch (size)
+ {
+ case 32: STR (Rt, Rn, Rm); break;
+ case 16: STRH (Rt, Rn, Rm); break;
+ case 8: STRB (Rt, Rn, Rm); break;
+ default: PanicAlert("STRGeneric(reg): invalid size %d", size); break;
+ }
+}
+
+void ARM64XEmitter::LDRGeneric(int size, bool signExtend, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+ switch (size | signExtend)
+ {
+ case 32: LDR (type, Rt, Rn, imm); break;
+ case 33: LDRSW(type, Rt, Rn, imm); break;
+ case 16: LDRH (type, Rt, Rn, imm); break;
+ case 17: LDRSH(type, Rt, Rn, imm); break;
+ case 8: LDRB (type, Rt, Rn, imm); break;
+ case 9: LDRSB(type, Rt, Rn, imm); break;
+ default: PanicAlert("LDRGeneric(imm): invalid size %d", size); break;
+ }
+}
+void ARM64XEmitter::STRGeneric(int size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+ switch (size)
+ {
+ case 32: STR (type, Rt, Rn, imm); break;
+ case 16: STRH (type, Rt, Rn, imm); break;
+ case 8: STRB (type, Rt, Rn, imm); break;
+ default: PanicAlert("STRGeneric(imm): invalid size %d", size); break;
+ }
+}
+
+// Address of label/page PC-relative
+void ARM64XEmitter::ADR(ARM64Reg Rd, s32 imm)
+{
+ EncodeAddressInst(0, Rd, imm);
+}
+void ARM64XEmitter::ADRP(ARM64Reg Rd, s32 imm)
+{
+ EncodeAddressInst(1, Rd, imm >> 12);
+}
+
+// Wrapper around MOVZ+MOVK (and later MOVN)
+void ARM64XEmitter::MOVI2R(ARM64Reg Rd, u64 imm, bool optimize)
+{
+ unsigned int parts = Is64Bit(Rd) ? 4 : 2;
+ BitSet32 upload_part(0);
+
+ // Always start with a movz! Kills the dependency on the register.
+ bool use_movz = true;
+
+ if (!imm)
+ {
+ // Zero immediate, just clear the register. EOR is pointless when we have MOVZ, which looks
+ // clearer in disasm too.
+ MOVZ(Rd, 0, SHIFT_0);
+ return;
+ }
+
+ if ((Is64Bit(Rd) && imm == std::numeric_limits<u64>::max()) ||
+ (!Is64Bit(Rd) && imm == std::numeric_limits<u32>::max()))
+ {
+ // Max unsigned value (or if signed, -1)
+ // Set to ~ZR
+ ARM64Reg ZR = Is64Bit(Rd) ? SP : WSP;
+ ORN(Rd, ZR, ZR, ArithOption(ZR, ST_LSL, 0));
+ return;
+ }
+
+ // TODO: Make some more systemic use of MOVN, but this will take care of most cases.
+ // Small negative integer. Use MOVN
+ if (!Is64Bit(Rd) && (imm | 0xFFFF0000) == imm)
+ {
+ MOVN(Rd, ~imm, SHIFT_0);
+ return;
+ }
+
+ // XXX: Use MOVN when possible.
+ // XXX: Optimize more
+ // XXX: Support rotating immediates to save instructions
+ if (optimize)
+ {
+ for (unsigned int i = 0; i < parts; ++i)
+ {
+ if ((imm >> (i * 16)) & 0xFFFF)
+ upload_part[i] = 1;
+ }
+ }
+
+ u64 aligned_pc = (u64)(m_rxbase + m_code) & ~0xFFF;
+s64 aligned_offset = (s64)imm - (s64)aligned_pc;
+ // The offset for ADR/ADRP is an s32, so make sure it can be represented in that
+ if (upload_part.Count() > 1 && std::abs(aligned_offset) < 0x7FFFFFFFLL)
+ {
+ // Immediate we are loading is within 4GB of our aligned range
+ // Most likely a address that we can load in one or two instructions
+ if (!(std::abs(aligned_offset) & 0xFFF))
+ {
+ // Aligned ADR
+ ADRP(Rd, (s32)aligned_offset);
+ return;
+ }
+ else
+ {
+ // If the address is within 1MB of PC we can load it in a single instruction still
+ s64 offset = (s64)imm - (s64)(m_rxbase + m_code);
+ if (offset >= -0xFFFFF && offset <= 0xFFFFF)
+ {
+ ADR(Rd, (s32)offset);
+ return;
+ }
+ else
+ {
+ ADRP(Rd, (s32)(aligned_offset & ~0xFFF));
+ ADD(Rd, Rd, imm & 0xFFF);
+ return;
+ }
+ }
+ }
+
+ for (unsigned i = 0; i < parts; ++i)
+ {
+ if (use_movz && upload_part[i])
+ {
+ MOVZ(Rd, (imm >> (i * 16)) & 0xFFFF, (ShiftAmount)i);
+ use_movz = false;
+ }
+ else
+ {
+ if (upload_part[i] || !optimize)
+ MOVK(Rd, (imm >> (i * 16)) & 0xFFFF, (ShiftAmount)i);
+ }
+ }
+}
+
+bool ARM64XEmitter::MOVI2R2(ARM64Reg Rd, u64 imm1, u64 imm2)
+{
+ // TODO: Also optimize for performance, not just for code size.
+ ptrdiff_t start_offset = GetCodeOffset();
+
+ MOVI2R(Rd, imm1);
+ int size1 = GetCodeOffset() - start_offset;
+
+ SetCodePtrUnsafe(start_offset);
+
+ MOVI2R(Rd, imm2);
+ int size2 = GetCodeOffset() - start_offset;
+
+ SetCodePtrUnsafe(start_offset);
+
+ bool element = size1 > size2;
+
+ MOVI2R(Rd, element ? imm2 : imm1);
+
+ return element;
+}
+
+void ARM64XEmitter::ABI_PushRegisters(BitSet32 registers)
+{
+ int num_regs = registers.Count();
+ int stack_size = (num_regs + (num_regs & 1)) * 8;
+ auto it = registers.begin();
+
+ if (!num_regs)
+ return;
+
+ // 8 byte per register, but 16 byte alignment, so we may have to padd one register.
+ // Only update the SP on the last write to avoid the dependency between those stores.
+
+ // The first push must adjust the SP, else a context switch may invalidate everything below SP.
+ if (num_regs & 1)
+ {
+ STR(INDEX_PRE, (ARM64Reg)(X0 + *it++), SP, -stack_size);
+ }
+ else
+ {
+ ARM64Reg first_reg = (ARM64Reg)(X0 + *it++);
+ ARM64Reg second_reg = (ARM64Reg)(X0 + *it++);
+ STP(INDEX_PRE, first_reg, second_reg, SP, -stack_size);
+ }
+
+ // Fast store for all other registers, this is always an even number.
+ for (int i = 0; i < (num_regs - 1) / 2; i++)
+ {
+ ARM64Reg odd_reg = (ARM64Reg)(X0 + *it++);
+ ARM64Reg even_reg = (ARM64Reg)(X0 + *it++);
+ STP(INDEX_SIGNED, odd_reg, even_reg, SP, 16 * (i + 1));
+ }
+
+ ASSERT_MSG(DYNA_REC, it == registers.end(), "%s registers don't match.", __func__);
+}
+
+void ARM64XEmitter::ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask)
+{
+ int num_regs = registers.Count();
+ int stack_size = (num_regs + (num_regs & 1)) * 8;
+ auto it = registers.begin();
+
+ if (!num_regs)
+ return;
+
+ // We must adjust the SP in the end, so load the first (two) registers at least.
+ ARM64Reg first = (ARM64Reg)(X0 + *it++);
+ ARM64Reg second;
+ if (!(num_regs & 1))
+ second = (ARM64Reg)(X0 + *it++);
+
+ // 8 byte per register, but 16 byte alignment, so we may have to padd one register.
+ // Only update the SP on the last load to avoid the dependency between those loads.
+
+ // Fast load for all but the first (two) registers, this is always an even number.
+ for (int i = 0; i < (num_regs - 1) / 2; i++)
+ {
+ ARM64Reg odd_reg = (ARM64Reg)(X0 + *it++);
+ ARM64Reg even_reg = (ARM64Reg)(X0 + *it++);
+ LDP(INDEX_SIGNED, odd_reg, even_reg, SP, 16 * (i + 1));
+ }
+
+ // Post loading the first (two) registers.
+ if (num_regs & 1)
+ LDR(INDEX_POST, first, SP, stack_size);
+ else
+ LDP(INDEX_POST, first, second, SP, stack_size);
+
+ ASSERT_MSG(DYNA_REC, it == registers.end(), "%s registers don't match.", __func__);
+}
+
+// Float Emitter
+void ARM64FloatEmitter::EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, ARM64Reg Rt,
+ ARM64Reg Rn, s32 imm)
+{
+ Rt = DecodeReg(Rt);
+ Rn = DecodeReg(Rn);
+ u32 encoded_size = 0;
+ u32 encoded_imm = 0;
+
+ if (size == 8)
+ encoded_size = 0;
+ else if (size == 16)
+ encoded_size = 1;
+ else if (size == 32)
+ encoded_size = 2;
+ else if (size == 64)
+ encoded_size = 3;
+ else if (size == 128)
+ encoded_size = 0;
+
+ if (type == INDEX_UNSIGNED)
+ {
+ ASSERT_MSG(DYNA_REC, !(imm & ((size - 1) >> 3)),
+ "%s(INDEX_UNSIGNED) immediate offset must be aligned to size! (%d) (%p)", __func__,
+ imm, m_emit->GetCodePtr());
+ ASSERT_MSG(DYNA_REC, imm >= 0, "%s(INDEX_UNSIGNED) immediate offset must be positive!",
+ __func__);
+ if (size == 16)
+ imm >>= 1;
+ else if (size == 32)
+ imm >>= 2;
+ else if (size == 64)
+ imm >>= 3;
+ else if (size == 128)
+ imm >>= 4;
+ encoded_imm = (imm & 0xFFF);
+ }
+ else
+ {
+ ASSERT_MSG(DYNA_REC, !(imm < -256 || imm > 255),
+ "%s immediate offset must be within range of -256 to 256!", __func__);
+ encoded_imm = (imm & 0x1FF) << 2;
+ if (type == INDEX_POST)
+ encoded_imm |= 1;
+ else
+ encoded_imm |= 3;
+ }
+
+ Write32((encoded_size << 30) | (0xF << 26) | (type == INDEX_UNSIGNED ? (1 << 24) : 0) |
+ (size == 128 ? (1 << 23) : 0) | (opc << 22) | (encoded_imm << 10) | (Rn << 5) | Rt);
+}
+
+void ARM64FloatEmitter::EmitScalar2Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd,
+ ARM64Reg Rn, ARM64Reg Rm)
+{
+ ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s only supports double and single registers!", __func__);
+ Rd = DecodeReg(Rd);
+ Rn = DecodeReg(Rn);
+ Rm = DecodeReg(Rm);
+
+ Write32((M << 31) | (S << 29) | (0b11110001 << 21) | (type << 22) | (Rm << 16) | (opcode << 12) |
+ (1 << 11) | (Rn << 5) | Rd);
+}
+
+void ARM64FloatEmitter::EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn,
+ ARM64Reg Rm)
+{
+ ASSERT_MSG(DYNA_REC, !IsSingle(Rd), "%s doesn't support singles!", __func__);
+ bool quad = IsQuad(Rd);
+ Rd = DecodeReg(Rd);
+ Rn = DecodeReg(Rn);
+ Rm = DecodeReg(Rm);
+
+ Write32((quad << 30) | (U << 29) | (0b1110001 << 21) | (size << 22) | (Rm << 16) |
+ (opcode << 11) | (1 << 10) | (Rn << 5) | Rd);
+}
+
+void ARM64FloatEmitter::EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Rd = DecodeReg(Rd);
+ Rn = DecodeReg(Rn);
+
+ Write32((Q << 30) | (op << 29) | (0b111 << 25) | (imm5 << 16) | (imm4 << 11) | (1 << 10) |
+ (Rn << 5) | Rd);
+}
+
+void ARM64FloatEmitter::Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn)
+{
+ ASSERT_MSG(DYNA_REC, !IsSingle(Rd), "%s doesn't support singles!", __func__);
+ Rd = DecodeReg(Rd);
+ Rn = DecodeReg(Rn);
+
+ Write32((Q << 30) | (U << 29) | (0b1110001 << 21) | (size << 22) | (opcode << 12) | (1 << 11) |
+ (Rn << 5) | Rd);
+}
+
+void ARM64FloatEmitter::EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size,
+ ARM64Reg Rt, ARM64Reg Rn)
+{
+ ASSERT_MSG(DYNA_REC, !IsSingle(Rt), "%s doesn't support singles!", __func__);
+ bool quad = IsQuad(Rt);
+ Rt = DecodeReg(Rt);
+ Rn = DecodeReg(Rn);
+
+ Write32((quad << 30) | (0b1101 << 24) | (L << 22) | (R << 21) | (opcode << 13) | (S << 12) |
+ (size << 10) | (Rn << 5) | Rt);
+}
+
+void ARM64FloatEmitter::EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size,
+ ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm)
+{
+ ASSERT_MSG(DYNA_REC, !IsSingle(Rt), "%s doesn't support singles!", __func__);
+ bool quad = IsQuad(Rt);
+ Rt = DecodeReg(Rt);
+ Rn = DecodeReg(Rn);
+ Rm = DecodeReg(Rm);
+
+ Write32((quad << 30) | (0x1B << 23) | (L << 22) | (R << 21) | (Rm << 16) | (opcode << 13) |
+ (S << 12) | (size << 10) | (Rn << 5) | Rt);
+}
+
+void ARM64FloatEmitter::Emit1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn)
+{
+ ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __func__);
+ Rd = DecodeReg(Rd);
+ Rn = DecodeReg(Rn);
+
+ Write32((M << 31) | (S << 29) | (0xF1 << 21) | (type << 22) | (opcode << 15) | (1 << 14) |
+ (Rn << 5) | Rd);
+}
+
+void ARM64FloatEmitter::EmitConversion(bool sf, bool S, u32 type, u32 rmode, u32 opcode,
+ ARM64Reg Rd, ARM64Reg Rn)
+{
+ ASSERT_MSG(DYNA_REC, Rn <= SP, "%s only supports GPR as source!", __func__);
+ Rd = DecodeReg(Rd);
+ Rn = DecodeReg(Rn);
+
+ Write32((sf << 31) | (S << 29) | (0xF1 << 21) | (type << 22) | (rmode << 19) | (opcode << 16) |
+ (Rn << 5) | Rd);
+}
+
+void ARM64FloatEmitter::EmitConvertScalarToInt(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round,
+ bool sign)
+{
+ DEBUG_ASSERT_MSG(DYNA_REC, IsScalar(Rn), "fcvts: Rn must be floating point");
+ if (IsGPR(Rd))
+ {
+ // Use the encoding that transfers the result to a GPR.
+ bool sf = Is64Bit(Rd);
+ int type = IsDouble(Rn) ? 1 : 0;
+ Rd = DecodeReg(Rd);
+ Rn = DecodeReg(Rn);
+ int opcode = (sign ? 1 : 0);
+ int rmode = 0;
+ switch (round)
+ {
+ case ROUND_A:
+ rmode = 0;
+ opcode |= 4;
+ break;
+ case ROUND_P:
+ rmode = 1;
+ break;
+ case ROUND_M:
+ rmode = 2;
+ break;
+ case ROUND_Z:
+ rmode = 3;
+ break;
+ case ROUND_N:
+ rmode = 0;
+ break;
+ }
+ EmitConversion2(sf, 0, true, type, rmode, opcode, 0, Rd, Rn);
+ }
+ else
+ {
+ // Use the encoding (vector, single) that keeps the result in the fp register.
+ int sz = IsDouble(Rn);
+ Rd = DecodeReg(Rd);
+ Rn = DecodeReg(Rn);
+ int opcode = 0;
+ switch (round)
+ {
+ case ROUND_A:
+ opcode = 0x1C;
+ break;
+ case ROUND_N:
+ opcode = 0x1A;
+ break;
+ case ROUND_M:
+ opcode = 0x1B;
+ break;
+ case ROUND_P:
+ opcode = 0x1A;
+ sz |= 2;
+ break;
+ case ROUND_Z:
+ opcode = 0x1B;
+ sz |= 2;
+ break;
+ }
+ Write32((0x5E << 24) | (sign << 29) | (sz << 22) | (1 << 21) | (opcode << 12) | (2 << 10) |
+ (Rn << 5) | Rd);
+ }
+}
+
+void ARM64FloatEmitter::FCVTS(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round)
+{
+ EmitConvertScalarToInt(Rd, Rn, round, false);
+}
+
+void ARM64FloatEmitter::FCVTU(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round)
+{
+ EmitConvertScalarToInt(Rd, Rn, round, true);
+}
+
+void ARM64FloatEmitter::EmitConversion2(bool sf, bool S, bool direction, u32 type, u32 rmode,
+ u32 opcode, int scale, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Rd = DecodeReg(Rd);
+ Rn = DecodeReg(Rn);
+
+ Write32((sf << 31) | (S << 29) | (0xF0 << 21) | (direction << 21) | (type << 22) | (rmode << 19) |
+ (opcode << 16) | (scale << 10) | (Rn << 5) | Rd);
+}
+
+void ARM64FloatEmitter::EmitCompare(bool M, bool S, u32 op, u32 opcode2, ARM64Reg Rn, ARM64Reg Rm)
+{
+ ASSERT_MSG(DYNA_REC, !IsQuad(Rn), "%s doesn't support vector!", __func__);
+ bool is_double = IsDouble(Rn);
+
+ Rn = DecodeReg(Rn);
+ Rm = DecodeReg(Rm);
+
+ Write32((M << 31) | (S << 29) | (0xF1 << 21) | (is_double << 22) | (Rm << 16) | (op << 14) |
+ (1 << 13) | (Rn << 5) | opcode2);
+}
+
+void ARM64FloatEmitter::EmitCondSelect(bool M, bool S, CCFlags cond, ARM64Reg Rd, ARM64Reg Rn,
+ ARM64Reg Rm)
+{
+ ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __func__);
+ bool is_double = IsDouble(Rd);
+
+ Rd = DecodeReg(Rd);
+ Rn = DecodeReg(Rn);
+ Rm = DecodeReg(Rm);
+
+ Write32((M << 31) | (S << 29) | (0xF1 << 21) | (is_double << 22) | (Rm << 16) | (cond << 12) |
+ (3 << 10) | (Rn << 5) | Rd);
+}
+
+void ARM64FloatEmitter::EmitPermute(u32 size, u32 op, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ ASSERT_MSG(DYNA_REC, !IsSingle(Rd), "%s doesn't support singles!", __func__);
+
+ bool quad = IsQuad(Rd);
+
+ u32 encoded_size = 0;
+ if (size == 16)
+ encoded_size = 1;
+ else if (size == 32)
+ encoded_size = 2;
+ else if (size == 64)
+ encoded_size = 3;
+
+ Rd = DecodeReg(Rd);
+ Rn = DecodeReg(Rn);
+ Rm = DecodeReg(Rm);
+
+ Write32((quad << 30) | (7 << 25) | (encoded_size << 22) | (Rm << 16) | (op << 12) | (1 << 11) |
+ (Rn << 5) | Rd);
+}
+
+void ARM64FloatEmitter::EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm8)
+{
+ ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __func__);
+
+ bool is_double = !IsSingle(Rd);
+
+ Rd = DecodeReg(Rd);
+
+ Write32((M << 31) | (S << 29) | (0xF1 << 21) | (is_double << 22) | (type << 22) | (imm8 << 13) |
+ (1 << 12) | (imm5 << 5) | Rd);
+}
+
+void ARM64FloatEmitter::EmitShiftImm(bool Q, bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd,
+ ARM64Reg Rn)
+{
+ ASSERT_MSG(DYNA_REC, immh, "%s bad encoding! Can't have zero immh", __func__);
+
+ Rd = DecodeReg(Rd);
+ Rn = DecodeReg(Rn);
+
+ Write32((Q << 30) | (U << 29) | (0xF << 24) | (immh << 19) | (immb << 16) | (opcode << 11) |
+ (1 << 10) | (Rn << 5) | Rd);
+}
+
+void ARM64FloatEmitter::EmitScalarShiftImm(bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd,
+ ARM64Reg Rn)
+{
+ Rd = DecodeReg(Rd);
+ Rn = DecodeReg(Rn);
+
+ Write32((2 << 30) | (U << 29) | (0x3E << 23) | (immh << 19) | (immb << 16) | (opcode << 11) |
+ (1 << 10) | (Rn << 5) | Rd);
+}
+
+void ARM64FloatEmitter::EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opcode, ARM64Reg Rt,
+ ARM64Reg Rn)
+{
+ bool quad = IsQuad(Rt);
+ u32 encoded_size = 0;
+
+ if (size == 16)
+ encoded_size = 1;
+ else if (size == 32)
+ encoded_size = 2;
+ else if (size == 64)
+ encoded_size = 3;
+
+ Rt = DecodeReg(Rt);
+ Rn = DecodeReg(Rn);
+
+ Write32((quad << 30) | (3 << 26) | (L << 22) | (opcode << 12) | (encoded_size << 10) | (Rn << 5) |
+ Rt);
+}
+
+void ARM64FloatEmitter::EmitLoadStoreMultipleStructurePost(u32 size, bool L, u32 opcode,
+ ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm)
+{
+ bool quad = IsQuad(Rt);
+ u32 encoded_size = 0;
+
+ if (size == 16)
+ encoded_size = 1;
+ else if (size == 32)
+ encoded_size = 2;
+ else if (size == 64)
+ encoded_size = 3;
+
+ Rt = DecodeReg(Rt);
+ Rn = DecodeReg(Rn);
+ Rm = DecodeReg(Rm);
+
+ Write32((quad << 30) | (0b11001 << 23) | (L << 22) | (Rm << 16) | (opcode << 12) |
+ (encoded_size << 10) | (Rn << 5) | Rt);
+}
+
+void ARM64FloatEmitter::EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd,
+ ARM64Reg Rn)
+{
+ ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __func__);
+
+ Rd = DecodeReg(Rd);
+ Rn = DecodeReg(Rn);
+
+ Write32((M << 31) | (S << 29) | (0xF1 << 21) | (type << 22) | (opcode << 15) | (1 << 14) |
+ (Rn << 5) | Rd);
+}
+
+void ARM64FloatEmitter::EmitVectorxElement(bool U, u32 size, bool L, u32 opcode, bool H,
+ ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ bool quad = IsQuad(Rd);
+
+ Rd = DecodeReg(Rd);
+ Rn = DecodeReg(Rn);
+ Rm = DecodeReg(Rm);
+
+ Write32((quad << 30) | (U << 29) | (0xF << 24) | (size << 22) | (L << 21) | (Rm << 16) |
+ (opcode << 12) | (H << 11) | (Rn << 5) | Rd);
+}
+
+void ARM64FloatEmitter::EmitLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+ ASSERT_MSG(DYNA_REC, !(imm < -256 || imm > 255), "%s received too large offset: %d", __func__,
+ imm);
+ Rt = DecodeReg(Rt);
+ Rn = DecodeReg(Rn);
+
+ Write32((size << 30) | (0xF << 26) | (op << 22) | ((imm & 0x1FF) << 12) | (Rn << 5) | Rt);
+}
+
+void ARM64FloatEmitter::EncodeLoadStorePair(u32 size, bool load, IndexType type, ARM64Reg Rt,
+ ARM64Reg Rt2, ARM64Reg Rn, s32 imm)
+{
+ u32 type_encode = 0;
+ u32 opc = 0;
+
+ switch (type)
+ {
+ case INDEX_SIGNED:
+ type_encode = 0b010;
+ break;
+ case INDEX_POST:
+ type_encode = 0b001;
+ break;
+ case INDEX_PRE:
+ type_encode = 0b011;
+ break;
+ case INDEX_UNSIGNED:
+ ASSERT_MSG(DYNA_REC, false, "%s doesn't support INDEX_UNSIGNED!", __func__);
+ break;
+ }
+
+ if (size == 128)
+ {
+ ASSERT_MSG(DYNA_REC, !(imm & 0xF), "%s received invalid offset 0x%x!", __func__, imm);
+ opc = 2;
+ imm >>= 4;
+ }
+ else if (size == 64)
+ {
+ ASSERT_MSG(DYNA_REC, !(imm & 0x7), "%s received invalid offset 0x%x!", __func__, imm);
+ opc = 1;
+ imm >>= 3;
+ }
+ else if (size == 32)
+ {
+ ASSERT_MSG(DYNA_REC, !(imm & 0x3), "%s received invalid offset 0x%x!", __func__, imm);
+ opc = 0;
+ imm >>= 2;
+ }
+
+ Rt = DecodeReg(Rt);
+ Rt2 = DecodeReg(Rt2);
+ Rn = DecodeReg(Rn);
+
+ Write32((opc << 30) | (0b1011 << 26) | (type_encode << 23) | (load << 22) | ((imm & 0x7F) << 15) |
+ (Rt2 << 10) | (Rn << 5) | Rt);
+}
+
+void ARM64FloatEmitter::EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn,
+ ArithOption Rm)
+{
+ ASSERT_MSG(DYNA_REC, Rm.GetType() == ArithOption::TYPE_EXTENDEDREG,
+ "%s must contain an extended reg as Rm!", __func__);
+
+ u32 encoded_size = 0;
+ u32 encoded_op = 0;
+
+ if (size == 8)
+ {
+ encoded_size = 0;
+ encoded_op = 0;
+ }
+ else if (size == 16)
+ {
+ encoded_size = 1;
+ encoded_op = 0;
+ }
+ else if (size == 32)
+ {
+ encoded_size = 2;
+ encoded_op = 0;
+ }
+ else if (size == 64)
+ {
+ encoded_size = 3;
+ encoded_op = 0;
+ }
+ else if (size == 128)
+ {
+ encoded_size = 0;
+ encoded_op = 2;
+ }
+
+ if (load)
+ encoded_op |= 1;
+
+ Rt = DecodeReg(Rt);
+ Rn = DecodeReg(Rn);
+ ARM64Reg decoded_Rm = DecodeReg(Rm.GetReg());
+
+ Write32((encoded_size << 30) | (encoded_op << 22) | (0b111100001 << 21) | (decoded_Rm << 16) |
+ Rm.GetData() | (1 << 11) | (Rn << 5) | Rt);
+}
+
+void ARM64FloatEmitter::EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh)
+{
+ union
+ {
+ u8 hex;
+ struct
+ {
+ unsigned defgh : 5;
+ unsigned abc : 3;
+ };
+ } v;
+ v.hex = abcdefgh;
+ Rd = DecodeReg(Rd);
+ Write32((Q << 30) | (op << 29) | (0xF << 24) | (v.abc << 16) | (cmode << 12) | (o2 << 11) |
+ (1 << 10) | (v.defgh << 5) | Rd);
+}
+
+void ARM64FloatEmitter::LDR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+ EmitLoadStoreImmediate(size, 1, type, Rt, Rn, imm);
+}
+void ARM64FloatEmitter::STR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+ EmitLoadStoreImmediate(size, 0, type, Rt, Rn, imm);
+}
+
+// Loadstore unscaled
+void ARM64FloatEmitter::LDUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+ u32 encoded_size = 0;
+ u32 encoded_op = 0;
+
+ if (size == 8)
+ {
+ encoded_size = 0;
+ encoded_op = 1;
+ }
+ else if (size == 16)
+ {
+ encoded_size = 1;
+ encoded_op = 1;
+ }
+ else if (size == 32)
+ {
+ encoded_size = 2;
+ encoded_op = 1;
+ }
+ else if (size == 64)
+ {
+ encoded_size = 3;
+ encoded_op = 1;
+ }
+ else if (size == 128)
+ {
+ encoded_size = 0;
+ encoded_op = 3;
+ }
+
+ EmitLoadStoreUnscaled(encoded_size, encoded_op, Rt, Rn, imm);
+}
+void ARM64FloatEmitter::STUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
+{
+ u32 encoded_size = 0;
+ u32 encoded_op = 0;
+
+ if (size == 8)
+ {
+ encoded_size = 0;
+ encoded_op = 0;
+ }
+ else if (size == 16)
+ {
+ encoded_size = 1;
+ encoded_op = 0;
+ }
+ else if (size == 32)
+ {
+ encoded_size = 2;
+ encoded_op = 0;
+ }
+ else if (size == 64)
+ {
+ encoded_size = 3;
+ encoded_op = 0;
+ }
+ else if (size == 128)
+ {
+ encoded_size = 0;
+ encoded_op = 2;
+ }
+
+ EmitLoadStoreUnscaled(encoded_size, encoded_op, Rt, Rn, imm);
+}
+
+// Loadstore single structure
+void ARM64FloatEmitter::LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn)
+{
+ bool S = 0;
+ u32 opcode = 0;
+ u32 encoded_size = 0;
+ ARM64Reg encoded_reg = INVALID_REG;
+
+ if (size == 8)
+ {
+ S = (index & 4) != 0;
+ opcode = 0;
+ encoded_size = index & 3;
+ if (index & 8)
+ encoded_reg = EncodeRegToQuad(Rt);
+ else
+ encoded_reg = EncodeRegToDouble(Rt);
+ }
+ else if (size == 16)
+ {
+ S = (index & 2) != 0;
+ opcode = 2;
+ encoded_size = (index & 1) << 1;
+ if (index & 4)
+ encoded_reg = EncodeRegToQuad(Rt);
+ else
+ encoded_reg = EncodeRegToDouble(Rt);
+ }
+ else if (size == 32)
+ {
+ S = (index & 1) != 0;
+ opcode = 4;
+ encoded_size = 0;
+ if (index & 2)
+ encoded_reg = EncodeRegToQuad(Rt);
+ else
+ encoded_reg = EncodeRegToDouble(Rt);
+ }
+ else if (size == 64)
+ {
+ S = 0;
+ opcode = 4;
+ encoded_size = 1;
+ if (index == 1)
+ encoded_reg = EncodeRegToQuad(Rt);
+ else
+ encoded_reg = EncodeRegToDouble(Rt);
+ }
+
+ EmitLoadStoreSingleStructure(1, 0, opcode, S, encoded_size, encoded_reg, Rn);
+}
+
+void ARM64FloatEmitter::LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm)
+{
+ bool S = 0;
+ u32 opcode = 0;
+ u32 encoded_size = 0;
+ ARM64Reg encoded_reg = INVALID_REG;
+
+ if (size == 8)
+ {
+ S = (index & 4) != 0;
+ opcode = 0;
+ encoded_size = index & 3;
+ if (index & 8)
+ encoded_reg = EncodeRegToQuad(Rt);
+ else
+ encoded_reg = EncodeRegToDouble(Rt);
+ }
+ else if (size == 16)
+ {
+ S = (index & 2) != 0;
+ opcode = 2;
+ encoded_size = (index & 1) << 1;
+ if (index & 4)
+ encoded_reg = EncodeRegToQuad(Rt);
+ else
+ encoded_reg = EncodeRegToDouble(Rt);
+ }
+ else if (size == 32)
+ {
+ S = (index & 1) != 0;
+ opcode = 4;
+ encoded_size = 0;
+ if (index & 2)
+ encoded_reg = EncodeRegToQuad(Rt);
+ else
+ encoded_reg = EncodeRegToDouble(Rt);
+ }
+ else if (size == 64)
+ {
+ S = 0;
+ opcode = 4;
+ encoded_size = 1;
+ if (index == 1)
+ encoded_reg = EncodeRegToQuad(Rt);
+ else
+ encoded_reg = EncodeRegToDouble(Rt);
+ }
+
+ EmitLoadStoreSingleStructure(1, 0, opcode, S, encoded_size, encoded_reg, Rn, Rm);
+}
+
+void ARM64FloatEmitter::LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn)
+{
+ EmitLoadStoreSingleStructure(1, 0, 6, 0, size >> 4, Rt, Rn);
+}
+void ARM64FloatEmitter::LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn)
+{
+ EmitLoadStoreSingleStructure(1, 1, 6, 0, size >> 4, Rt, Rn);
+}
+void ARM64FloatEmitter::LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitLoadStoreSingleStructure(1, 0, 6, 0, size >> 4, Rt, Rn, Rm);
+}
+void ARM64FloatEmitter::LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitLoadStoreSingleStructure(1, 1, 6, 0, size >> 4, Rt, Rn, Rm);
+}
+
+void ARM64FloatEmitter::ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn)
+{
+ bool S = 0;
+ u32 opcode = 0;
+ u32 encoded_size = 0;
+ ARM64Reg encoded_reg = INVALID_REG;
+
+ if (size == 8)
+ {
+ S = (index & 4) != 0;
+ opcode = 0;
+ encoded_size = index & 3;
+ if (index & 8)
+ encoded_reg = EncodeRegToQuad(Rt);
+ else
+ encoded_reg = EncodeRegToDouble(Rt);
+ }
+ else if (size == 16)
+ {
+ S = (index & 2) != 0;
+ opcode = 2;
+ encoded_size = (index & 1) << 1;
+ if (index & 4)
+ encoded_reg = EncodeRegToQuad(Rt);
+ else
+ encoded_reg = EncodeRegToDouble(Rt);
+ }
+ else if (size == 32)
+ {
+ S = (index & 1) != 0;
+ opcode = 4;
+ encoded_size = 0;
+ if (index & 2)
+ encoded_reg = EncodeRegToQuad(Rt);
+ else
+ encoded_reg = EncodeRegToDouble(Rt);
+ }
+ else if (size == 64)
+ {
+ S = 0;
+ opcode = 4;
+ encoded_size = 1;
+ if (index == 1)
+ encoded_reg = EncodeRegToQuad(Rt);
+ else
+ encoded_reg = EncodeRegToDouble(Rt);
+ }
+
+ EmitLoadStoreSingleStructure(0, 0, opcode, S, encoded_size, encoded_reg, Rn);
+}
+
+void ARM64FloatEmitter::ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm)
+{
+ bool S = 0;
+ u32 opcode = 0;
+ u32 encoded_size = 0;
+ ARM64Reg encoded_reg = INVALID_REG;
+
+ if (size == 8)
+ {
+ S = (index & 4) != 0;
+ opcode = 0;
+ encoded_size = index & 3;
+ if (index & 8)
+ encoded_reg = EncodeRegToQuad(Rt);
+ else
+ encoded_reg = EncodeRegToDouble(Rt);
+ }
+ else if (size == 16)
+ {
+ S = (index & 2) != 0;
+ opcode = 2;
+ encoded_size = (index & 1) << 1;
+ if (index & 4)
+ encoded_reg = EncodeRegToQuad(Rt);
+ else
+ encoded_reg = EncodeRegToDouble(Rt);
+ }
+ else if (size == 32)
+ {
+ S = (index & 1) != 0;
+ opcode = 4;
+ encoded_size = 0;
+ if (index & 2)
+ encoded_reg = EncodeRegToQuad(Rt);
+ else
+ encoded_reg = EncodeRegToDouble(Rt);
+ }
+ else if (size == 64)
+ {
+ S = 0;
+ opcode = 4;
+ encoded_size = 1;
+ if (index == 1)
+ encoded_reg = EncodeRegToQuad(Rt);
+ else
+ encoded_reg = EncodeRegToDouble(Rt);
+ }
+
+ EmitLoadStoreSingleStructure(0, 0, opcode, S, encoded_size, encoded_reg, Rn, Rm);
+}
+
+// Loadstore multiple structure
+void ARM64FloatEmitter::LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn)
+{
+ ASSERT_MSG(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!",
+ __func__);
+ u32 opcode = 0;
+ if (count == 1)
+ opcode = 0b111;
+ else if (count == 2)
+ opcode = 0b1010;
+ else if (count == 3)
+ opcode = 0b0110;
+ else if (count == 4)
+ opcode = 0b0010;
+ EmitLoadStoreMultipleStructure(size, 1, opcode, Rt, Rn);
+}
+void ARM64FloatEmitter::LD1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn,
+ ARM64Reg Rm)
+{
+ ASSERT_MSG(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!",
+ __func__);
+ ASSERT_MSG(DYNA_REC, type == INDEX_POST, "%s only supports post indexing!", __func__);
+
+ u32 opcode = 0;
+ if (count == 1)
+ opcode = 0b111;
+ else if (count == 2)
+ opcode = 0b1010;
+ else if (count == 3)
+ opcode = 0b0110;
+ else if (count == 4)
+ opcode = 0b0010;
+ EmitLoadStoreMultipleStructurePost(size, 1, opcode, Rt, Rn, Rm);
+}
+void ARM64FloatEmitter::ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn)
+{
+ ASSERT_MSG(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!",
+ __func__);
+ u32 opcode = 0;
+ if (count == 1)
+ opcode = 0b111;
+ else if (count == 2)
+ opcode = 0b1010;
+ else if (count == 3)
+ opcode = 0b0110;
+ else if (count == 4)
+ opcode = 0b0010;
+ EmitLoadStoreMultipleStructure(size, 0, opcode, Rt, Rn);
+}
+void ARM64FloatEmitter::ST1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn,
+ ARM64Reg Rm)
+{
+ ASSERT_MSG(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!",
+ __func__);
+ ASSERT_MSG(DYNA_REC, type == INDEX_POST, "%s only supports post indexing!", __func__);
+
+ u32 opcode = 0;
+ if (count == 1)
+ opcode = 0b111;
+ else if (count == 2)
+ opcode = 0b1010;
+ else if (count == 3)
+ opcode = 0b0110;
+ else if (count == 4)
+ opcode = 0b0010;
+ EmitLoadStoreMultipleStructurePost(size, 0, opcode, Rt, Rn, Rm);
+}
+
+// Scalar - 1 Source
+void ARM64FloatEmitter::FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top)
+{
+ if (IsScalar(Rd) && IsScalar(Rn))
+ {
+ EmitScalar1Source(0, 0, IsDouble(Rd), 0, Rd, Rn);
+ }
+ else
+ {
+ ASSERT_MSG(DYNA_REC, !IsQuad(Rd) && !IsQuad(Rn), "FMOV can't move to/from quads");
+ int rmode = 0;
+ int opcode = 6;
+ int sf = 0;
+ if (IsSingle(Rd) && !Is64Bit(Rn) && !top)
+ {
+ // GPR to scalar single
+ opcode |= 1;
+ }
+ else if (!Is64Bit(Rd) && IsSingle(Rn) && !top)
+ {
+ // Scalar single to GPR - defaults are correct
+ }
+ else
+ {
+ // TODO
+ ASSERT_MSG(DYNA_REC, 0, "FMOV: Unhandled case");
+ }
+ Rd = DecodeReg(Rd);
+ Rn = DecodeReg(Rn);
+ Write32((sf << 31) | (0x1e2 << 20) | (rmode << 19) | (opcode << 16) | (Rn << 5) | Rd);
+ }
+}
+
+// Loadstore paired
+void ARM64FloatEmitter::LDP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn,
+ s32 imm)
+{
+ EncodeLoadStorePair(size, true, type, Rt, Rt2, Rn, imm);
+}
+void ARM64FloatEmitter::STP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn,
+ s32 imm)
+{
+ EncodeLoadStorePair(size, false, type, Rt, Rt2, Rn, imm);
+}
+
+// Loadstore register offset
+void ARM64FloatEmitter::STR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
+{
+ EncodeLoadStoreRegisterOffset(size, false, Rt, Rn, Rm);
+}
+void ARM64FloatEmitter::LDR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
+{
+ EncodeLoadStoreRegisterOffset(size, true, Rt, Rn, Rm);
+}
+
+void ARM64FloatEmitter::FABS(ARM64Reg Rd, ARM64Reg Rn)
+{
+ EmitScalar1Source(0, 0, IsDouble(Rd), 1, Rd, Rn);
+}
+void ARM64FloatEmitter::FNEG(ARM64Reg Rd, ARM64Reg Rn)
+{
+ EmitScalar1Source(0, 0, IsDouble(Rd), 2, Rd, Rn);
+}
+void ARM64FloatEmitter::FSQRT(ARM64Reg Rd, ARM64Reg Rn)
+{
+ EmitScalar1Source(0, 0, IsDouble(Rd), 3, Rd, Rn);
+}
+
+// Scalar - 2 Source
+void ARM64FloatEmitter::FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitScalar2Source(0, 0, IsDouble(Rd), 2, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitScalar2Source(0, 0, IsDouble(Rd), 0, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitScalar2Source(0, 0, IsDouble(Rd), 3, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitScalar2Source(0, 0, IsDouble(Rd), 1, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FMAX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitScalar2Source(0, 0, IsDouble(Rd), 4, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FMIN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitScalar2Source(0, 0, IsDouble(Rd), 5, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FMAXNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitScalar2Source(0, 0, IsDouble(Rd), 6, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FMINNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitScalar2Source(0, 0, IsDouble(Rd), 7, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FNMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitScalar2Source(0, 0, IsDouble(Rd), 8, Rd, Rn, Rm);
+}
+
+void ARM64FloatEmitter::FMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra)
+{
+ EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 0);
+}
+void ARM64FloatEmitter::FMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra)
+{
+ EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 1);
+}
+void ARM64FloatEmitter::FNMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra)
+{
+ EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 2);
+}
+void ARM64FloatEmitter::FNMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra)
+{
+ EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 3);
+}
+
+void ARM64FloatEmitter::EmitScalar3Source(bool isDouble, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm,
+ ARM64Reg Ra, int opcode)
+{
+ int type = isDouble ? 1 : 0;
+ Rd = DecodeReg(Rd);
+ Rn = DecodeReg(Rn);
+ Rm = DecodeReg(Rm);
+ Ra = DecodeReg(Ra);
+ int o1 = opcode >> 1;
+ int o0 = opcode & 1;
+ m_emit->Write32((0x1F << 24) | (type << 22) | (o1 << 21) | (Rm << 16) | (o0 << 15) | (Ra << 10) |
+ (Rn << 5) | Rd);
+}
+
+// Scalar floating point immediate
+void ARM64FloatEmitter::FMOV(ARM64Reg Rd, uint8_t imm8)
+{
+ EmitScalarImm(0, 0, 0, 0, Rd, imm8);
+}
+
+// Vector
+void ARM64FloatEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitThreeSame(0, 0, 3, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitThreeSame(1, 1, 3, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index)
+{
+ u32 imm5 = 0;
+
+ if (size == 8)
+ {
+ imm5 = 1;
+ imm5 |= index << 1;
+ }
+ else if (size == 16)
+ {
+ imm5 = 2;
+ imm5 |= index << 2;
+ }
+ else if (size == 32)
+ {
+ imm5 = 4;
+ imm5 |= index << 3;
+ }
+ else if (size == 64)
+ {
+ imm5 = 8;
+ imm5 |= index << 4;
+ }
+
+ EmitCopy(IsQuad(Rd), 0, imm5, 0, Rd, Rn);
+}
+void ARM64FloatEmitter::FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xF, Rd, Rn);
+}
+void ARM64FloatEmitter::FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitThreeSame(0, size >> 6, 0x1A, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitThreeSame(0, size >> 6, 0b11110, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitThreeSame(0, size >> 6, 0x19, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FMIN(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitThreeSame(0, 2 | size >> 6, 0b11110, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(false, 0, size >> 6, 0x17, Rd, Rn);
+}
+void ARM64FloatEmitter::FCVTL2(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(true, 0, size >> 6, 0x17, Rd, Rn);
+}
+void ARM64FloatEmitter::FCVTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 0, dest_size >> 5, 0x16, Rd, Rn);
+}
+void ARM64FloatEmitter::FCVTZS(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0x1B, Rd, Rn);
+}
+void ARM64FloatEmitter::FCVTZU(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0x1B, Rd, Rn);
+}
+void ARM64FloatEmitter::FDIV(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitThreeSame(1, size >> 6, 0x1F, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitThreeSame(1, size >> 6, 0x1B, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FNEG(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0xF, Rd, Rn);
+}
+void ARM64FloatEmitter::FRECPE(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0x1D, Rd, Rn);
+}
+void ARM64FloatEmitter::FRSQRTE(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0x1D, Rd, Rn);
+}
+void ARM64FloatEmitter::FSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitThreeSame(0, 2 | (size >> 6), 0x1A, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FMLS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitThreeSame(0, 2 | (size >> 6), 0x19, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::NOT(ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 1, 0, 5, Rd, Rn);
+}
+void ARM64FloatEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitThreeSame(0, 2, 3, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 0, size >> 4, 1, Rd, Rn);
+}
+void ARM64FloatEmitter::REV32(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 1, size >> 4, 0, Rd, Rn);
+}
+void ARM64FloatEmitter::REV64(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 0, size >> 4, 0, Rd, Rn);
+}
+void ARM64FloatEmitter::SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 0, size >> 6, 0x1D, Rd, Rn);
+}
+void ARM64FloatEmitter::UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 1, size >> 6, 0x1D, Rd, Rn);
+}
+void ARM64FloatEmitter::SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale)
+{
+ int imm = size * 2 - scale;
+ EmitShiftImm(IsQuad(Rd), 0, imm >> 3, imm & 7, 0x1C, Rd, Rn);
+}
+void ARM64FloatEmitter::UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale)
+{
+ int imm = size * 2 - scale;
+ EmitShiftImm(IsQuad(Rd), 1, imm >> 3, imm & 7, 0x1C, Rd, Rn);
+}
+void ARM64FloatEmitter::SQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(false, 0, dest_size >> 4, 0b10100, Rd, Rn);
+}
+void ARM64FloatEmitter::SQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(true, 0, dest_size >> 4, 0b10100, Rd, Rn);
+}
+void ARM64FloatEmitter::UQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(false, 1, dest_size >> 4, 0b10100, Rd, Rn);
+}
+void ARM64FloatEmitter::UQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(true, 1, dest_size >> 4, 0b10100, Rd, Rn);
+}
+void ARM64FloatEmitter::XTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(false, 0, dest_size >> 4, 0b10010, Rd, Rn);
+}
+void ARM64FloatEmitter::XTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(true, 0, dest_size >> 4, 0b10010, Rd, Rn);
+}
+
+// Move
+void ARM64FloatEmitter::DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ u32 imm5 = 0;
+
+ if (size == 8)
+ imm5 = 1;
+ else if (size == 16)
+ imm5 = 2;
+ else if (size == 32)
+ imm5 = 4;
+ else if (size == 64)
+ imm5 = 8;
+
+ EmitCopy(IsQuad(Rd), 0, imm5, 1, Rd, Rn);
+}
+void ARM64FloatEmitter::INS(u8 size, ARM64Reg Rd, u8 index, ARM64Reg Rn)
+{
+ u32 imm5 = 0;
+
+ if (size == 8)
+ {
+ imm5 = 1;
+ imm5 |= index << 1;
+ }
+ else if (size == 16)
+ {
+ imm5 = 2;
+ imm5 |= index << 2;
+ }
+ else if (size == 32)
+ {
+ imm5 = 4;
+ imm5 |= index << 3;
+ }
+ else if (size == 64)
+ {
+ imm5 = 8;
+ imm5 |= index << 4;
+ }
+
+ EmitCopy(1, 0, imm5, 3, Rd, Rn);
+}
+void ARM64FloatEmitter::INS(u8 size, ARM64Reg Rd, u8 index1, ARM64Reg Rn, u8 index2)
+{
+ u32 imm5 = 0, imm4 = 0;
+
+ if (size == 8)
+ {
+ imm5 = 1;
+ imm5 |= index1 << 1;
+ imm4 = index2;
+ }
+ else if (size == 16)
+ {
+ imm5 = 2;
+ imm5 |= index1 << 2;
+ imm4 = index2 << 1;
+ }
+ else if (size == 32)
+ {
+ imm5 = 4;
+ imm5 |= index1 << 3;
+ imm4 = index2 << 2;
+ }
+ else if (size == 64)
+ {
+ imm5 = 8;
+ imm5 |= index1 << 4;
+ imm4 = index2 << 3;
+ }
+
+ EmitCopy(1, 1, imm5, imm4, Rd, Rn);
+}
+
+void ARM64FloatEmitter::UMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index)
+{
+ bool b64Bit = Is64Bit(Rd);
+ ASSERT_MSG(DYNA_REC, Rd < SP, "%s destination must be a GPR!", __func__);
+ ASSERT_MSG(DYNA_REC, !(b64Bit && size != 64),
+ "%s must have a size of 64 when destination is 64bit!", __func__);
+ u32 imm5 = 0;
+
+ if (size == 8)
+ {
+ imm5 = 1;
+ imm5 |= index << 1;
+ }
+ else if (size == 16)
+ {
+ imm5 = 2;
+ imm5 |= index << 2;
+ }
+ else if (size == 32)
+ {
+ imm5 = 4;
+ imm5 |= index << 3;
+ }
+ else if (size == 64)
+ {
+ imm5 = 8;
+ imm5 |= index << 4;
+ }
+
+ EmitCopy(b64Bit, 0, imm5, 7, Rd, Rn);
+}
+void ARM64FloatEmitter::SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index)
+{
+ bool b64Bit = Is64Bit(Rd);
+ ASSERT_MSG(DYNA_REC, Rd < SP, "%s destination must be a GPR!", __func__);
+ ASSERT_MSG(DYNA_REC, size != 64, "%s doesn't support 64bit destination. Use UMOV!", __func__);
+ u32 imm5 = 0;
+
+ if (size == 8)
+ {
+ imm5 = 1;
+ imm5 |= index << 1;
+ }
+ else if (size == 16)
+ {
+ imm5 = 2;
+ imm5 |= index << 2;
+ }
+ else if (size == 32)
+ {
+ imm5 = 4;
+ imm5 |= index << 3;
+ }
+
+ EmitCopy(b64Bit, 0, imm5, 5, Rd, Rn);
+}
+
+// One source
+void ARM64FloatEmitter::FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn)
+{
+ u32 dst_encoding = 0;
+ u32 src_encoding = 0;
+
+ if (size_to == 16)
+ dst_encoding = 3;
+ else if (size_to == 32)
+ dst_encoding = 0;
+ else if (size_to == 64)
+ dst_encoding = 1;
+
+ if (size_from == 16)
+ src_encoding = 3;
+ else if (size_from == 32)
+ src_encoding = 0;
+ else if (size_from == 64)
+ src_encoding = 1;
+
+ Emit1Source(0, 0, src_encoding, 4 | dst_encoding, Rd, Rn);
+}
+
+void ARM64FloatEmitter::SCVTF(ARM64Reg Rd, ARM64Reg Rn)
+{
+ if (IsScalar(Rn))
+ {
+ // Source is in FP register (like destination!). We must use a vector encoding.
+ bool sign = false;
+ Rd = DecodeReg(Rd);
+ Rn = DecodeReg(Rn);
+ int sz = IsDouble(Rn);
+ Write32((0x5e << 24) | (sign << 29) | (sz << 22) | (0x876 << 10) | (Rn << 5) | Rd);
+ }
+ else
+ {
+ bool sf = Is64Bit(Rn);
+ u32 type = 0;
+ if (IsDouble(Rd))
+ type = 1;
+ EmitConversion(sf, 0, type, 0, 2, Rd, Rn);
+ }
+}
+
+void ARM64FloatEmitter::UCVTF(ARM64Reg Rd, ARM64Reg Rn)
+{
+ if (IsScalar(Rn))
+ {
+ // Source is in FP register (like destination!). We must use a vector encoding.
+ bool sign = true;
+ Rd = DecodeReg(Rd);
+ Rn = DecodeReg(Rn);
+ int sz = IsDouble(Rn);
+ Write32((0x5e << 24) | (sign << 29) | (sz << 22) | (0x876 << 10) | (Rn << 5) | Rd);
+ }
+ else
+ {
+ bool sf = Is64Bit(Rn);
+ u32 type = 0;
+ if (IsDouble(Rd))
+ type = 1;
+
+ EmitConversion(sf, 0, type, 0, 3, Rd, Rn);
+ }
+}
+
+void ARM64FloatEmitter::SCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale)
+{
+ bool sf = Is64Bit(Rn);
+ u32 type = 0;
+ if (IsDouble(Rd))
+ type = 1;
+
+ EmitConversion2(sf, 0, false, type, 0, 2, 64 - scale, Rd, Rn);
+}
+
+void ARM64FloatEmitter::UCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale)
+{
+ bool sf = Is64Bit(Rn);
+ u32 type = 0;
+ if (IsDouble(Rd))
+ type = 1;
+
+ EmitConversion2(sf, 0, false, type, 0, 3, 64 - scale, Rd, Rn);
+}
+
+void ARM64FloatEmitter::FCMP(ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitCompare(0, 0, 0, 0, Rn, Rm);
+}
+void ARM64FloatEmitter::FCMP(ARM64Reg Rn)
+{
+ EmitCompare(0, 0, 0, 8, Rn, (ARM64Reg)0);
+}
+void ARM64FloatEmitter::FCMPE(ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitCompare(0, 0, 0, 0x10, Rn, Rm);
+}
+void ARM64FloatEmitter::FCMPE(ARM64Reg Rn)
+{
+ EmitCompare(0, 0, 0, 0x18, Rn, (ARM64Reg)0);
+}
+void ARM64FloatEmitter::FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitThreeSame(0, size >> 6, 0x1C, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xD, Rd, Rn);
+}
+void ARM64FloatEmitter::FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitThreeSame(1, size >> 6, 0x1C, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0x0C, Rd, Rn);
+}
+void ARM64FloatEmitter::FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitThreeSame(1, 2 | (size >> 6), 0x1C, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0x0C, Rd, Rn);
+}
+void ARM64FloatEmitter::FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0xD, Rd, Rn);
+}
+void ARM64FloatEmitter::FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xE, Rd, Rn);
+}
+
+void ARM64FloatEmitter::FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond)
+{
+ EmitCondSelect(0, 0, cond, Rd, Rn, Rm);
+}
+
+// Permute
+void ARM64FloatEmitter::UZP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitPermute(size, 0b001, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::TRN1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitPermute(size, 0b010, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::ZIP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitPermute(size, 0b011, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::UZP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitPermute(size, 0b101, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::TRN2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitPermute(size, 0b110, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+ EmitPermute(size, 0b111, Rd, Rn, Rm);
+}
+
+// Shift by immediate
+void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
+{
+ SSHLL(src_size, Rd, Rn, shift, false);
+}
+void ARM64FloatEmitter::SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
+{
+ SSHLL(src_size, Rd, Rn, shift, true);
+}
+void ARM64FloatEmitter::SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
+{
+ SHRN(dest_size, Rd, Rn, shift, false);
+}
+void ARM64FloatEmitter::SHRN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
+{
+ SHRN(dest_size, Rd, Rn, shift, true);
+}
+void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
+{
+ USHLL(src_size, Rd, Rn, shift, false);
+}
+void ARM64FloatEmitter::USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
+{
+ USHLL(src_size, Rd, Rn, shift, true);
+}
+void ARM64FloatEmitter::SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ SXTL(src_size, Rd, Rn, false);
+}
+void ARM64FloatEmitter::SXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ SXTL(src_size, Rd, Rn, true);
+}
+void ARM64FloatEmitter::UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ UXTL(src_size, Rd, Rn, false);
+}
+void ARM64FloatEmitter::UXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn)
+{
+ UXTL(src_size, Rd, Rn, true);
+}
+
+void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper)
+{
+ ASSERT_MSG(DYNA_REC, shift < src_size, "%s shift amount must less than the element size!",
+ __func__);
+ u32 immh = 0;
+ u32 immb = shift & 0xFFF;
+
+ if (src_size == 8)
+ {
+ immh = 1;
+ }
+ else if (src_size == 16)
+ {
+ immh = 2 | ((shift >> 3) & 1);
+ }
+ else if (src_size == 32)
+ {
+ immh = 4 | ((shift >> 3) & 3);
+ ;
+ }
+ EmitShiftImm(upper, 0, immh, immb, 0b10100, Rd, Rn);
+}
+
+void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper)
+{
+ ASSERT_MSG(DYNA_REC, shift < src_size, "%s shift amount must less than the element size!",
+ __func__);
+ u32 immh = 0;
+ u32 immb = shift & 0xFFF;
+
+ if (src_size == 8)
+ {
+ immh = 1;
+ }
+ else if (src_size == 16)
+ {
+ immh = 2 | ((shift >> 3) & 1);
+ }
+ else if (src_size == 32)
+ {
+ immh = 4 | ((shift >> 3) & 3);
+ ;
+ }
+ EmitShiftImm(upper, 1, immh, immb, 0b10100, Rd, Rn);
+}
+
+void ARM64FloatEmitter::SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper)
+{
+ ASSERT_MSG(DYNA_REC, shift < dest_size, "%s shift amount must less than the element size!",
+ __func__);
+ u32 immh = 0;
+ u32 immb = shift & 0xFFF;
+
+ if (dest_size == 8)
+ {
+ immh = 1;
+ }
+ else if (dest_size == 16)
+ {
+ immh = 2 | ((shift >> 3) & 1);
+ }
+ else if (dest_size == 32)
+ {
+ immh = 4 | ((shift >> 3) & 3);
+ ;
+ }
+ EmitShiftImm(upper, 1, immh, immb, 0b10000, Rd, Rn);
+}
+
+void ARM64FloatEmitter::SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper)
+{
+ SSHLL(src_size, Rd, Rn, 0, upper);
+}
+
+void ARM64FloatEmitter::UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper)
+{
+ USHLL(src_size, Rd, Rn, 0, upper);
+}
+
+// vector x indexed element
+void ARM64FloatEmitter::FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index)
+{
+ ASSERT_MSG(DYNA_REC, size == 32 || size == 64, "%s only supports 32bit or 64bit size!", __func__);
+
+ bool L = false;
+ bool H = false;
+ if (size == 32)
+ {
+ L = index & 1;
+ H = (index >> 1) & 1;
+ }
+ else if (size == 64)
+ {
+ H = index == 1;
+ }
+
+ EmitVectorxElement(0, 2 | (size >> 6), L, 0x9, H, Rd, Rn, Rm);
+}
+
+void ARM64FloatEmitter::FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index)
+{
+ ASSERT_MSG(DYNA_REC, size == 32 || size == 64, "%s only supports 32bit or 64bit size!", __func__);
+
+ bool L = false;
+ bool H = false;
+ if (size == 32)
+ {
+ L = index & 1;
+ H = (index >> 1) & 1;
+ }
+ else if (size == 64)
+ {
+ H = index == 1;
+ }
+
+ EmitVectorxElement(0, 2 | (size >> 6), L, 1, H, Rd, Rn, Rm);
+}
+
+// Modified Immediate
+void ARM64FloatEmitter::MOVI(u8 size, ARM64Reg Rd, u64 imm, u8 shift)
+{
+ bool Q = IsQuad(Rd);
+ u8 cmode = 0;
+ u8 op = 0;
+ u8 abcdefgh = imm & 0xFF;
+ if (size == 8)
+ {
+ ASSERT_MSG(DYNA_REC, shift == 0, "%s(size8) doesn't support shift!", __func__);
+ ASSERT_MSG(DYNA_REC, !(imm & ~0xFFULL), "%s(size8) only supports 8bit values!", __func__);
+ }
+ else if (size == 16)
+ {
+ ASSERT_MSG(DYNA_REC, shift == 0 || shift == 8, "%s(size16) only supports shift of {0, 8}!",
+ __func__);
+ ASSERT_MSG(DYNA_REC, !(imm & ~0xFFULL), "%s(size16) only supports 8bit values!", __func__);
+
+ if (shift == 8)
+ cmode |= 2;
+ }
+ else if (size == 32)
+ {
+ ASSERT_MSG(DYNA_REC, shift == 0 || shift == 8 || shift == 16 || shift == 24,
+ "%s(size32) only supports shift of {0, 8, 16, 24}!", __func__);
+ // XXX: Implement support for MOVI - shifting ones variant
+ ASSERT_MSG(DYNA_REC, !(imm & ~0xFFULL), "%s(size32) only supports 8bit values!", __func__);
+ switch (shift)
+ {
+ case 8:
+ cmode |= 2;
+ break;
+ case 16:
+ cmode |= 4;
+ break;
+ case 24:
+ cmode |= 6;
+ break;
+ default:
+ break;
+ }
+ }
+ else // 64
+ {
+ ASSERT_MSG(DYNA_REC, shift == 0, "%s(size64) doesn't support shift!", __func__);
+
+ op = 1;
+ cmode = 0xE;
+ abcdefgh = 0;
+ for (int i = 0; i < 8; ++i)
+ {
+ u8 tmp = (imm >> (i << 3)) & 0xFF;
+ ASSERT_MSG(DYNA_REC, tmp == 0xFF || tmp == 0, "%s(size64) Invalid immediate!", __func__);
+ if (tmp == 0xFF)
+ abcdefgh |= (1 << i);
+ }
+ }
+ EncodeModImm(Q, op, cmode, 0, Rd, abcdefgh);
+}
+
+void ARM64FloatEmitter::BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift)
+{
+ bool Q = IsQuad(Rd);
+ u8 cmode = 1;
+ u8 op = 1;
+ if (size == 16)
+ {
+ ASSERT_MSG(DYNA_REC, shift == 0 || shift == 8, "%s(size16) only supports shift of {0, 8}!",
+ __func__);
+
+ if (shift == 8)
+ cmode |= 2;
+ }
+ else if (size == 32)
+ {
+ ASSERT_MSG(DYNA_REC, shift == 0 || shift == 8 || shift == 16 || shift == 24,
+ "%s(size32) only supports shift of {0, 8, 16, 24}!", __func__);
+ // XXX: Implement support for MOVI - shifting ones variant
+ switch (shift)
+ {
+ case 8:
+ cmode |= 2;
+ break;
+ case 16:
+ cmode |= 4;
+ break;
+ case 24:
+ cmode |= 6;
+ break;
+ default:
+ break;
+ }
+ }
+ else
+ {
+ ASSERT_MSG(DYNA_REC, false, "%s only supports size of {16, 32}!", __func__);
+ }
+ EncodeModImm(Q, op, cmode, 0, Rd, imm);
+}
+
+void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp)
+{
+ bool bundled_loadstore = false;
+
+ for (int i = 0; i < 32; ++i)
+ {
+ if (!registers[i])
+ continue;
+
+ int count = 0;
+ while (++count < 4 && (i + count) < 32 && registers[i + count])
+ {
+ }
+ if (count > 1)
+ {
+ bundled_loadstore = true;
+ break;
+ }
+ }
+
+ if (bundled_loadstore && tmp != INVALID_REG)
+ {
+ int num_regs = registers.Count();
+ m_emit->SUB(SP, SP, num_regs * 16);
+ m_emit->ADD(tmp, SP, 0);
+ std::vector<ARM64Reg> island_regs;
+ for (int i = 0; i < 32; ++i)
+ {
+ if (!registers[i])
+ continue;
+
+ int count = 0;
+
+ // 0 = true
+ // 1 < 4 && registers[i + 1] true!
+ // 2 < 4 && registers[i + 2] true!
+ // 3 < 4 && registers[i + 3] true!
+ // 4 < 4 && registers[i + 4] false!
+ while (++count < 4 && (i + count) < 32 && registers[i + count])
+ {
+ }
+
+ if (count == 1)
+ island_regs.push_back((ARM64Reg)(Q0 + i));
+ else
+ ST1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), tmp);
+
+ i += count - 1;
+ }
+
+ // Handle island registers
+ std::vector<ARM64Reg> pair_regs;
+ for (auto& it : island_regs)
+ {
+ pair_regs.push_back(it);
+ if (pair_regs.size() == 2)
+ {
+ STP(128, INDEX_POST, pair_regs[0], pair_regs[1], tmp, 32);
+ pair_regs.clear();
+ }
+ }
+ if (pair_regs.size())
+ STR(128, INDEX_POST, pair_regs[0], tmp, 16);
+ }
+ else
+ {
+ std::vector<ARM64Reg> pair_regs;
+ for (auto it : registers)
+ {
+ pair_regs.push_back((ARM64Reg)(Q0 + it));
+ if (pair_regs.size() == 2)
+ {
+ STP(128, INDEX_PRE, pair_regs[0], pair_regs[1], SP, -32);
+ pair_regs.clear();
+ }
+ }
+ if (pair_regs.size())
+ STR(128, INDEX_PRE, pair_regs[0], SP, -16);
+ }
+}
+void ARM64FloatEmitter::ABI_PopRegisters(BitSet32 registers, ARM64Reg tmp)
+{
+ bool bundled_loadstore = false;
+ int num_regs = registers.Count();
+
+ for (int i = 0; i < 32; ++i)
+ {
+ if (!registers[i])
+ continue;
+
+ int count = 0;
+ while (++count < 4 && (i + count) < 32 && registers[i + count])
+ {
+ }
+ if (count > 1)
+ {
+ bundled_loadstore = true;
+ break;
+ }
+ }
+
+ if (bundled_loadstore && tmp != INVALID_REG)
+ {
+ // The temporary register is only used to indicate that we can use this code path
+ std::vector<ARM64Reg> island_regs;
+ for (int i = 0; i < 32; ++i)
+ {
+ if (!registers[i])
+ continue;
+
+ int count = 0;
+ while (++count < 4 && (i + count) < 32 && registers[i + count])
+ {
+ }
+
+ if (count == 1)
+ island_regs.push_back((ARM64Reg)(Q0 + i));
+ else
+ LD1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), SP);
+
+ i += count - 1;
+ }
+
+ // Handle island registers
+ std::vector<ARM64Reg> pair_regs;
+ for (auto& it : island_regs)
+ {
+ pair_regs.push_back(it);
+ if (pair_regs.size() == 2)
+ {
+ LDP(128, INDEX_POST, pair_regs[0], pair_regs[1], SP, 32);
+ pair_regs.clear();
+ }
+ }
+ if (pair_regs.size())
+ LDR(128, INDEX_POST, pair_regs[0], SP, 16);
+ }
+ else
+ {
+ bool odd = num_regs % 2;
+ std::vector<ARM64Reg> pair_regs;
+ for (int i = 31; i >= 0; --i)
+ {
+ if (!registers[i])
+ continue;
+
+ if (odd)
+ {
+ // First load must be a regular LDR if odd
+ odd = false;
+ LDR(128, INDEX_POST, (ARM64Reg)(Q0 + i), SP, 16);
+ }
+ else
+ {
+ pair_regs.push_back((ARM64Reg)(Q0 + i));
+ if (pair_regs.size() == 2)
+ {
+ LDP(128, INDEX_POST, pair_regs[1], pair_regs[0], SP, 32);
+ pair_regs.clear();
+ }
+ }
+ }
+ }
+}
+
+void ARM64XEmitter::ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch)
+{
+ unsigned int n, imm_s, imm_r;
+ if (!Is64Bit(Rn))
+ imm &= 0xFFFFFFFF;
+ if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r))
+ {
+ AND(Rd, Rn, imm_r, imm_s, n != 0);
+ }
+ else
+ {
+ ASSERT_MSG(DYNA_REC, scratch != INVALID_REG,
+ "ANDI2R - failed to construct logical immediate value from %08x, need scratch",
+ (u32)imm);
+ MOVI2R(scratch, imm);
+ AND(Rd, Rn, scratch);
+ }
+}
+
+void ARM64XEmitter::ORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch)
+{
+ unsigned int n, imm_s, imm_r;
+ if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r))
+ {
+ ORR(Rd, Rn, imm_r, imm_s, n != 0);
+ }
+ else
+ {
+ ASSERT_MSG(DYNA_REC, scratch != INVALID_REG,
+ "ORRI2R - failed to construct logical immediate value from %08x, need scratch",
+ (u32)imm);
+ MOVI2R(scratch, imm);
+ ORR(Rd, Rn, scratch);
+ }
+}
+
+void ARM64XEmitter::EORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch)
+{
+ unsigned int n, imm_s, imm_r;
+ if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r))
+ {
+ EOR(Rd, Rn, imm_r, imm_s, n != 0);
+ }
+ else
+ {
+ ASSERT_MSG(DYNA_REC, scratch != INVALID_REG,
+ "EORI2R - failed to construct logical immediate value from %08x, need scratch",
+ (u32)imm);
+ MOVI2R(scratch, imm);
+ EOR(Rd, Rn, scratch);
+ }
+}
+
+void ARM64XEmitter::ANDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch)
+{
+ unsigned int n, imm_s, imm_r;
+ if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r))
+ {
+ ANDS(Rd, Rn, imm_r, imm_s, n != 0);
+ }
+ else
+ {
+ ASSERT_MSG(DYNA_REC, scratch != INVALID_REG,
+ "ANDSI2R - failed to construct logical immediate value from %08x, need scratch",
+ (u32)imm);
+ MOVI2R(scratch, imm);
+ ANDS(Rd, Rn, scratch);
+ }
+}
+
+void ARM64XEmitter::AddImmediate(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool shift, bool negative,
+ bool flags)
+{
+ switch ((negative << 1) | flags)
+ {
+ case 0:
+ ADD(Rd, Rn, imm, shift);
+ break;
+ case 1:
+ ADDS(Rd, Rn, imm, shift);
+ break;
+ case 2:
+ SUB(Rd, Rn, imm, shift);
+ break;
+ case 3:
+ SUBS(Rd, Rn, imm, shift);
+ break;
+ }
+}
+
+void ARM64XEmitter::ADDI2R_internal(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool negative, bool flags,
+ ARM64Reg scratch)
+{
+ bool has_scratch = scratch != INVALID_REG;
+ u64 imm_neg = Is64Bit(Rd) ? -imm : -imm & 0xFFFFFFFFuLL;
+ bool neg_neg = negative ? false : true;
+
+ // Fast paths, aarch64 immediate instructions
+ // Try them all first
+ if (imm <= 0xFFF)
+ {
+ AddImmediate(Rd, Rn, imm, false, negative, flags);
+ return;
+ }
+ if (imm <= 0xFFFFFF && (imm & 0xFFF) == 0)
+ {
+ AddImmediate(Rd, Rn, imm >> 12, true, negative, flags);
+ return;
+ }
+ if (imm_neg <= 0xFFF)
+ {
+ AddImmediate(Rd, Rn, imm_neg, false, neg_neg, flags);
+ return;
+ }
+ if (imm_neg <= 0xFFFFFF && (imm_neg & 0xFFF) == 0)
+ {
+ AddImmediate(Rd, Rn, imm_neg >> 12, true, neg_neg, flags);
+ return;
+ }
+
+ // ADD+ADD is slower than MOVK+ADD, but inplace.
+ // But it supports a few more bits, so use it to avoid MOVK+MOVK+ADD.
+ // As this splits the addition in two parts, this must not be done on setting flags.
+ if (!flags && (imm >= 0x10000u || !has_scratch) && imm < 0x1000000u)
+ {
+ AddImmediate(Rd, Rn, imm & 0xFFF, false, negative, false);
+ AddImmediate(Rd, Rd, imm >> 12, true, negative, false);
+ return;
+ }
+ if (!flags && (imm_neg >= 0x10000u || !has_scratch) && imm_neg < 0x1000000u)
+ {
+ AddImmediate(Rd, Rn, imm_neg & 0xFFF, false, neg_neg, false);
+ AddImmediate(Rd, Rd, imm_neg >> 12, true, neg_neg, false);
+ return;
+ }
+
+ ASSERT_MSG(DYNA_REC, has_scratch,
+ "ADDI2R - failed to construct arithmetic immediate value from %08x, need scratch",
+ (u32)imm);
+
+ negative ^= MOVI2R2(scratch, imm, imm_neg);
+ switch ((negative << 1) | flags)
+ {
+ case 0:
+ ADD(Rd, Rn, scratch);
+ break;
+ case 1:
+ ADDS(Rd, Rn, scratch);
+ break;
+ case 2:
+ SUB(Rd, Rn, scratch);
+ break;
+ case 3:
+ SUBS(Rd, Rn, scratch);
+ break;
+ }
+}
+
+void ARM64XEmitter::ADDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch)
+{
+ ADDI2R_internal(Rd, Rn, imm, false, false, scratch);
+}
+
+void ARM64XEmitter::ADDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch)
+{
+ ADDI2R_internal(Rd, Rn, imm, false, true, scratch);
+}
+
+void ARM64XEmitter::SUBI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch)
+{
+ ADDI2R_internal(Rd, Rn, imm, true, false, scratch);
+}
+
+void ARM64XEmitter::SUBSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch)
+{
+ ADDI2R_internal(Rd, Rn, imm, true, true, scratch);
+}
+
+void ARM64XEmitter::CMPI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch)
+{
+ ADDI2R_internal(Is64Bit(Rn) ? ZR : WZR, Rn, imm, true, true, scratch);
+}
+
+bool ARM64XEmitter::TryADDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm)
+{
+ u32 val;
+ bool shift;
+ if (IsImmArithmetic(imm, &val, &shift))
+ ADD(Rd, Rn, val, shift);
+ else
+ return false;
+
+ return true;
+}
+
+bool ARM64XEmitter::TrySUBI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm)
+{
+ u32 val;
+ bool shift;
+ if (IsImmArithmetic(imm, &val, &shift))
+ SUB(Rd, Rn, val, shift);
+ else
+ return false;
+
+ return true;
+}
+
+bool ARM64XEmitter::TryCMPI2R(ARM64Reg Rn, u32 imm)
+{
+ u32 val;
+ bool shift;
+ if (IsImmArithmetic(imm, &val, &shift))
+ CMP(Rn, val, shift);
+ else
+ return false;
+
+ return true;
+}
+
+bool ARM64XEmitter::TryANDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm)
+{
+ u32 n, imm_r, imm_s;
+ if (IsImmLogical(imm, 32, &n, &imm_s, &imm_r))
+ AND(Rd, Rn, imm_r, imm_s, n != 0);
+ else
+ return false;
+
+ return true;
+}
+bool ARM64XEmitter::TryORRI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm)
+{
+ u32 n, imm_r, imm_s;
+ if (IsImmLogical(imm, 32, &n, &imm_s, &imm_r))
+ ORR(Rd, Rn, imm_r, imm_s, n != 0);
+ else
+ return false;
+
+ return true;
+}
+bool ARM64XEmitter::TryEORI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm)
+{
+ u32 n, imm_r, imm_s;
+ if (IsImmLogical(imm, 32, &n, &imm_s, &imm_r))
+ EOR(Rd, Rn, imm_r, imm_s, n != 0);
+ else
+ return false;
+
+ return true;
+}
+
+void ARM64FloatEmitter::MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch, bool negate)
+{
+ ASSERT_MSG(DYNA_REC, !IsDouble(Rd), "MOVI2F does not yet support double precision");
+ uint8_t imm8;
+ if (value == 0.0)
+ {
+ FMOV(Rd, IsDouble(Rd) ? ZR : WZR);
+ if (negate)
+ FNEG(Rd, Rd);
+ // TODO: There are some other values we could generate with the float-imm instruction, like
+ // 1.0...
+ }
+ else if (FPImm8FromFloat(value, &imm8))
+ {
+ FMOV(Rd, imm8);
+ }
+ else
+ {
+ ASSERT_MSG(DYNA_REC, scratch != INVALID_REG,
+ "Failed to find a way to generate FP immediate %f without scratch", value);
+ if (negate)
+ value = -value;
+
+ const u32 ival = Common::BitCast<u32>(value);
+ m_emit->MOVI2R(scratch, ival);
+ FMOV(Rd, scratch);
+ }
+}
+
+// TODO: Quite a few values could be generated easily using the MOVI instruction and friends.
+void ARM64FloatEmitter::MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch)
+{
+ // TODO: Make it work with more element sizes
+ // TODO: Optimize - there are shorter solution for many values
+ ARM64Reg s = (ARM64Reg)(S0 + DecodeReg(Rd));
+ MOVI2F(s, value, scratch);
+ DUP(32, Rd, Rd, 0);
+}
+
+} // namespace Arm64Gen
diff --git a/src/dolphin/Arm64Emitter.h b/src/dolphin/Arm64Emitter.h
new file mode 100644
index 0000000..3d9d4ba
--- /dev/null
+++ b/src/dolphin/Arm64Emitter.h
@@ -0,0 +1,1151 @@
+// Copyright 2015 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <cstring>
+#include <functional>
+
+#include "ArmCommon.h"
+#include "BitSet.h"
+#include "Compat.h"
+
+namespace Arm64Gen
+{
+// X30 serves a dual purpose as a link register
+// Encoded as <u3:type><u5:reg>
+// Types:
+// 000 - 32bit GPR
+// 001 - 64bit GPR
+// 010 - VFP single precision
+// 100 - VFP double precision
+// 110 - VFP quad precision
+enum ARM64Reg
+{
+ // 32bit registers
+ W0 = 0,
+ W1,
+ W2,
+ W3,
+ W4,
+ W5,
+ W6,
+ W7,
+ W8,
+ W9,
+ W10,
+ W11,
+ W12,
+ W13,
+ W14,
+ W15,
+ W16,
+ W17,
+ W18,
+ W19,
+ W20,
+ W21,
+ W22,
+ W23,
+ W24,
+ W25,
+ W26,
+ W27,
+ W28,
+ W29,
+ W30,
+
+ WSP, // 32bit stack pointer
+
+ // 64bit registers
+ X0 = 0x20,
+ X1,
+ X2,
+ X3,
+ X4,
+ X5,
+ X6,
+ X7,
+ X8,
+ X9,
+ X10,
+ X11,
+ X12,
+ X13,
+ X14,
+ X15,
+ X16,
+ X17,
+ X18,
+ X19,
+ X20,
+ X21,
+ X22,
+ X23,
+ X24,
+ X25,
+ X26,
+ X27,
+ X28,
+ X29,
+ X30,
+
+ SP, // 64bit stack pointer
+
+ // VFP single precision registers
+ S0 = 0x40,
+ S1,
+ S2,
+ S3,
+ S4,
+ S5,
+ S6,
+ S7,
+ S8,
+ S9,
+ S10,
+ S11,
+ S12,
+ S13,
+ S14,
+ S15,
+ S16,
+ S17,
+ S18,
+ S19,
+ S20,
+ S21,
+ S22,
+ S23,
+ S24,
+ S25,
+ S26,
+ S27,
+ S28,
+ S29,
+ S30,
+ S31,
+
+ // VFP Double Precision registers
+ D0 = 0x80,
+ D1,
+ D2,
+ D3,
+ D4,
+ D5,
+ D6,
+ D7,
+ D8,
+ D9,
+ D10,
+ D11,
+ D12,
+ D13,
+ D14,
+ D15,
+ D16,
+ D17,
+ D18,
+ D19,
+ D20,
+ D21,
+ D22,
+ D23,
+ D24,
+ D25,
+ D26,
+ D27,
+ D28,
+ D29,
+ D30,
+ D31,
+
+ // ASIMD Quad-Word registers
+ Q0 = 0xC0,
+ Q1,
+ Q2,
+ Q3,
+ Q4,
+ Q5,
+ Q6,
+ Q7,
+ Q8,
+ Q9,
+ Q10,
+ Q11,
+ Q12,
+ Q13,
+ Q14,
+ Q15,
+ Q16,
+ Q17,
+ Q18,
+ Q19,
+ Q20,
+ Q21,
+ Q22,
+ Q23,
+ Q24,
+ Q25,
+ Q26,
+ Q27,
+ Q28,
+ Q29,
+ Q30,
+ Q31,
+
+ // For PRFM(prefetch memory) encoding
+ // This is encoded in the Rt register
+ // Data preload
+ PLDL1KEEP = 0,
+ PLDL1STRM,
+ PLDL2KEEP,
+ PLDL2STRM,
+ PLDL3KEEP,
+ PLDL3STRM,
+ // Instruction preload
+ PLIL1KEEP = 8,
+ PLIL1STRM,
+ PLIL2KEEP,
+ PLIL2STRM,
+ PLIL3KEEP,
+ PLIL3STRM,
+ // Prepare for store
+ PLTL1KEEP = 16,
+ PLTL1STRM,
+ PLTL2KEEP,
+ PLTL2STRM,
+ PLTL3KEEP,
+ PLTL3STRM,
+
+ WZR = WSP,
+ ZR = SP,
+
+ INVALID_REG = 0xFFFFFFFF
+};
+
+constexpr bool Is64Bit(ARM64Reg reg)
+{
+ return (reg & 0x20) != 0;
+}
+constexpr bool IsSingle(ARM64Reg reg)
+{
+ return (reg & 0xC0) == 0x40;
+}
+constexpr bool IsDouble(ARM64Reg reg)
+{
+ return (reg & 0xC0) == 0x80;
+}
+constexpr bool IsScalar(ARM64Reg reg)
+{
+ return IsSingle(reg) || IsDouble(reg);
+}
+constexpr bool IsQuad(ARM64Reg reg)
+{
+ return (reg & 0xC0) == 0xC0;
+}
+constexpr bool IsVector(ARM64Reg reg)
+{
+ return (reg & 0xC0) != 0;
+}
+constexpr bool IsGPR(ARM64Reg reg)
+{
+ return static_cast<int>(reg) < 0x40;
+}
+
+constexpr ARM64Reg DecodeReg(ARM64Reg reg)
+{
+ return static_cast<ARM64Reg>(reg & 0x1F);
+}
+constexpr ARM64Reg EncodeRegTo64(ARM64Reg reg)
+{
+ return static_cast<ARM64Reg>(reg | 0x20);
+}
+constexpr ARM64Reg EncodeRegToSingle(ARM64Reg reg)
+{
+ return static_cast<ARM64Reg>(DecodeReg(reg) + S0);
+}
+constexpr ARM64Reg EncodeRegToDouble(ARM64Reg reg)
+{
+ return static_cast<ARM64Reg>((reg & ~0xC0) | 0x80);
+}
+constexpr ARM64Reg EncodeRegToQuad(ARM64Reg reg)
+{
+ return static_cast<ARM64Reg>(reg | 0xC0);
+}
+
+enum OpType
+{
+ TYPE_IMM = 0,
+ TYPE_REG,
+ TYPE_IMMSREG,
+ TYPE_RSR,
+ TYPE_MEM
+};
+
+enum ShiftType
+{
+ ST_LSL = 0,
+ ST_LSR = 1,
+ ST_ASR = 2,
+ ST_ROR = 3,
+};
+
+enum IndexType
+{
+ INDEX_UNSIGNED,
+ INDEX_POST,
+ INDEX_PRE,
+ INDEX_SIGNED, // used in LDP/STP
+};
+
+enum ShiftAmount
+{
+ SHIFT_0 = 0,
+ SHIFT_16 = 1,
+ SHIFT_32 = 2,
+ SHIFT_48 = 3,
+};
+
+enum RoundingMode
+{
+ ROUND_A, // round to nearest, ties to away
+ ROUND_M, // round towards -inf
+ ROUND_N, // round to nearest, ties to even
+ ROUND_P, // round towards +inf
+ ROUND_Z, // round towards zero
+};
+
+struct FixupBranch
+{
+ ptrdiff_t ptr;
+ // Type defines
+ // 0 = CBZ (32bit)
+ // 1 = CBNZ (32bit)
+ // 2 = B (conditional)
+ // 3 = TBZ
+ // 4 = TBNZ
+ // 5 = B (unconditional)
+ // 6 = BL (unconditional)
+ u32 type;
+
+ // Used with B.cond
+ CCFlags cond;
+
+ // Used with TBZ/TBNZ
+ u8 bit;
+
+ // Used with Test/Compare and Branch
+ ARM64Reg reg;
+};
+
+enum PStateField
+{
+ FIELD_SPSel = 0,
+ FIELD_DAIFSet,
+ FIELD_DAIFClr,
+ FIELD_NZCV, // The only system registers accessible from EL0 (user space)
+ FIELD_PMCR_EL0,
+ FIELD_PMCCNTR_EL0,
+ FIELD_FPCR = 0x340,
+ FIELD_FPSR = 0x341,
+};
+
+enum SystemHint
+{
+ HINT_NOP = 0,
+ HINT_YIELD,
+ HINT_WFE,
+ HINT_WFI,
+ HINT_SEV,
+ HINT_SEVL,
+};
+
+enum BarrierType
+{
+ OSHLD = 1,
+ OSHST = 2,
+ OSH = 3,
+ NSHLD = 5,
+ NSHST = 6,
+ NSH = 7,
+ ISHLD = 9,
+ ISHST = 10,
+ ISH = 11,
+ LD = 13,
+ ST = 14,
+ SY = 15,
+};
+
+class ArithOption
+{
+public:
+ enum WidthSpecifier
+ {
+ WIDTH_DEFAULT,
+ WIDTH_32BIT,
+ WIDTH_64BIT,
+ };
+
+ enum ExtendSpecifier
+ {
+ EXTEND_UXTB = 0x0,
+ EXTEND_UXTH = 0x1,
+ EXTEND_UXTW = 0x2, /* Also LSL on 32bit width */
+ EXTEND_UXTX = 0x3, /* Also LSL on 64bit width */
+ EXTEND_SXTB = 0x4,
+ EXTEND_SXTH = 0x5,
+ EXTEND_SXTW = 0x6,
+ EXTEND_SXTX = 0x7,
+ };
+
+ enum TypeSpecifier
+ {
+ TYPE_EXTENDEDREG,
+ TYPE_IMM,
+ TYPE_SHIFTEDREG,
+ };
+
+private:
+ ARM64Reg m_destReg;
+ WidthSpecifier m_width;
+ ExtendSpecifier m_extend;
+ TypeSpecifier m_type;
+ ShiftType m_shifttype;
+ u32 m_shift;
+
+public:
+ ArithOption(ARM64Reg Rd, bool index = false)
+ {
+ // Indexed registers are a certain feature of AARch64
+ // On Loadstore instructions that use a register offset
+ // We can have the register as an index
+ // If we are indexing then the offset register will
+ // be shifted to the left so we are indexing at intervals
+ // of the size of what we are loading
+ // 8-bit: Index does nothing
+ // 16-bit: Index LSL 1
+ // 32-bit: Index LSL 2
+ // 64-bit: Index LSL 3
+ if (index)
+ m_shift = 4;
+ else
+ m_shift = 0;
+
+ m_destReg = Rd;
+ m_type = TYPE_EXTENDEDREG;
+ if (Is64Bit(Rd))
+ {
+ m_width = WIDTH_64BIT;
+ m_extend = EXTEND_UXTX;
+ }
+ else
+ {
+ m_width = WIDTH_32BIT;
+ m_extend = EXTEND_UXTW;
+ }
+ m_shifttype = ST_LSL;
+ }
+ ArithOption(ARM64Reg Rd, ShiftType shift_type, u32 shift)
+ {
+ m_destReg = Rd;
+ m_shift = shift;
+ m_shifttype = shift_type;
+ m_type = TYPE_SHIFTEDREG;
+ if (Is64Bit(Rd))
+ {
+ m_width = WIDTH_64BIT;
+ if (shift == 64)
+ m_shift = 0;
+ }
+ else
+ {
+ m_width = WIDTH_32BIT;
+ if (shift == 32)
+ m_shift = 0;
+ }
+ }
+ TypeSpecifier GetType() const { return m_type; }
+ ARM64Reg GetReg() const { return m_destReg; }
+ u32 GetData() const
+ {
+ switch (m_type)
+ {
+ case TYPE_EXTENDEDREG:
+ return (m_extend << 13) | (m_shift << 10);
+ break;
+ case TYPE_SHIFTEDREG:
+ return (m_shifttype << 22) | (m_shift << 10);
+ break;
+ default:
+ DEBUG_ASSERT_MSG(DYNA_REC, false, "Invalid type in GetData");
+ break;
+ }
+ return 0;
+ }
+};
+
+class ARM64XEmitter
+{
+ friend class ARM64FloatEmitter;
+
+private:
+ ptrdiff_t m_code;
+ ptrdiff_t m_lastCacheFlushEnd;
+ u8* m_rwbase;
+ u8* m_rxbase;
+
+ void AddImmediate(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool shift, bool negative, bool flags);
+ void EncodeCompareBranchInst(u32 op, ARM64Reg Rt, const void* ptr);
+ void EncodeTestBranchInst(u32 op, ARM64Reg Rt, u8 bits, const void* ptr);
+ void EncodeUnconditionalBranchInst(u32 op, const void* ptr);
+ void EncodeUnconditionalBranchInst(u32 opc, u32 op2, u32 op3, u32 op4, ARM64Reg Rn);
+ void EncodeExceptionInst(u32 instenc, u32 imm);
+ void EncodeSystemInst(u32 op0, u32 op1, u32 CRn, u32 CRm, u32 op2, ARM64Reg Rt);
+ void EncodeArithmeticInst(u32 instenc, bool flags, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm,
+ ArithOption Option);
+ void EncodeArithmeticCarryInst(u32 op, bool flags, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void EncodeCondCompareImmInst(u32 op, ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond);
+ void EncodeCondCompareRegInst(u32 op, ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond);
+ void EncodeCondSelectInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);
+ void EncodeData1SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn);
+ void EncodeData2SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void EncodeData3SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
+ void EncodeLogicalInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
+ void EncodeLoadRegisterInst(u32 bitop, ARM64Reg Rt, u32 imm);
+ void EncodeLoadStoreExcInst(u32 instenc, ARM64Reg Rs, ARM64Reg Rt2, ARM64Reg Rn, ARM64Reg Rt);
+ void EncodeLoadStorePairedInst(u32 op, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm);
+ void EncodeLoadStoreIndexedInst(u32 op, u32 op2, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void EncodeLoadStoreIndexedInst(u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm, u8 size);
+ void EncodeMOVWideInst(u32 op, ARM64Reg Rd, u32 imm, ShiftAmount pos);
+ void EncodeBitfieldMOVInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms);
+ void EncodeLoadStoreRegisterOffset(u32 size, u32 opc, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void EncodeAddSubImmInst(u32 op, bool flags, u32 shift, u32 imm, ARM64Reg Rn, ARM64Reg Rd);
+ void EncodeLogicalImmInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, int n);
+ void EncodeLoadStorePair(u32 op, u32 load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn,
+ s32 imm);
+ void EncodeAddressInst(u32 op, ARM64Reg Rd, s32 imm);
+ void EncodeLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+
+protected:
+ // TODO: make this less ugly
+ // used for Switch where memory is executable and writeable and different addresses
+ // we need to take this for relative addressing in account
+
+ void Write32(u32 value);
+
+public:
+ ARM64XEmitter() : m_code(0), m_lastCacheFlushEnd(0), m_rwbase(nullptr), m_rxbase(nullptr) {}
+ ARM64XEmitter(u8* rwbase, u8* rxbase, ptrdiff_t offset)
+ {
+ m_rwbase = rwbase;
+ m_rxbase = rxbase;
+ m_code = offset;
+ m_lastCacheFlushEnd = offset;
+ }
+
+ virtual ~ARM64XEmitter() {}
+ void SetCodePtr(ptrdiff_t ptr);
+ void SetCodePtrUnsafe(ptrdiff_t ptr);
+ void SetCodeBase(u8* rwbase, u8* rxbase);
+ void ReserveCodeSpace(u32 bytes);
+ ptrdiff_t AlignCode16();
+ ptrdiff_t AlignCodePage();
+ ptrdiff_t GetCodeOffset();
+ const u8* GetRWPtr();
+ u8* GetWriteableRWPtr();
+ void* GetRXPtr();
+ void FlushIcache();
+ void FlushIcacheSection(u8* start, u8* end);
+
+ // FixupBranch branching
+ void SetJumpTarget(FixupBranch const& branch);
+ FixupBranch CBZ(ARM64Reg Rt);
+ FixupBranch CBNZ(ARM64Reg Rt);
+ FixupBranch B(CCFlags cond);
+ FixupBranch TBZ(ARM64Reg Rt, u8 bit);
+ FixupBranch TBNZ(ARM64Reg Rt, u8 bit);
+ FixupBranch B();
+ FixupBranch BL();
+
+ // Compare and Branch
+ void CBZ(ARM64Reg Rt, const void* ptr);
+ void CBNZ(ARM64Reg Rt, const void* ptr);
+
+ // Conditional Branch
+ void B(CCFlags cond, const void* ptr);
+
+ // Test and Branch
+ void TBZ(ARM64Reg Rt, u8 bits, const void* ptr);
+ void TBNZ(ARM64Reg Rt, u8 bits, const void* ptr);
+
+ // Unconditional Branch
+ void B(const void* ptr);
+ void BL(const void* ptr);
+
+ // Unconditional Branch (register)
+ void BR(ARM64Reg Rn);
+ void BLR(ARM64Reg Rn);
+ void RET(ARM64Reg Rn = X30);
+ void ERET();
+ void DRPS();
+
+ // Exception generation
+ void SVC(u32 imm);
+ void HVC(u32 imm);
+ void SMC(u32 imm);
+ void BRK(u32 imm);
+ void HLT(u32 imm);
+ void DCPS1(u32 imm);
+ void DCPS2(u32 imm);
+ void DCPS3(u32 imm);
+
+ // System
+ void _MSR(PStateField field, u8 imm);
+ void _MSR(PStateField field, ARM64Reg Rt);
+ void MRS(ARM64Reg Rt, PStateField field);
+ void CNTVCT(ARM64Reg Rt);
+
+ void HINT(SystemHint op);
+ void CLREX();
+ void DSB(BarrierType type);
+ void DMB(BarrierType type);
+ void ISB(BarrierType type);
+
+ // Add/Subtract (Extended/Shifted register)
+ void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option);
+ void ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option);
+ void SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option);
+ void SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option);
+ void CMN(ARM64Reg Rn, ARM64Reg Rm);
+ void CMN(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option);
+ void CMP(ARM64Reg Rn, ARM64Reg Rm);
+ void CMP(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option);
+
+ // Add/Subtract (with carry)
+ void ADC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void ADCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void SBC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void SBCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+
+ // Conditional Compare (immediate)
+ void CCMN(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond);
+ void CCMP(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond);
+
+ // Conditional Compare (register)
+ void CCMN(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond);
+ void CCMP(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond);
+
+ // Conditional Select
+ void CSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);
+ void CSINC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);
+ void CSINV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);
+ void CSNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);
+
+ // Aliases
+ void CSET(ARM64Reg Rd, CCFlags cond)
+ {
+ ARM64Reg zr = Is64Bit(Rd) ? ZR : WZR;
+ CSINC(Rd, zr, zr, (CCFlags)((u32)cond ^ 1));
+ }
+ void CSETM(ARM64Reg Rd, CCFlags cond)
+ {
+ ARM64Reg zr = Is64Bit(Rd) ? ZR : WZR;
+ CSINV(Rd, zr, zr, (CCFlags)((u32)cond ^ 1));
+ }
+ void NEG(ARM64Reg Rd, ARM64Reg Rs) { SUB(Rd, Is64Bit(Rd) ? ZR : WZR, Rs); }
+ // Data-Processing 1 source
+ void RBIT(ARM64Reg Rd, ARM64Reg Rn);
+ void REV16(ARM64Reg Rd, ARM64Reg Rn);
+ void REV32(ARM64Reg Rd, ARM64Reg Rn);
+ void REV64(ARM64Reg Rd, ARM64Reg Rn);
+ void CLZ(ARM64Reg Rd, ARM64Reg Rn);
+ void CLS(ARM64Reg Rd, ARM64Reg Rn);
+
+ // Data-Processing 2 source
+ void UDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void SDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void LSLV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void LSRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void ASRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void RORV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void CRC32B(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void CRC32H(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void CRC32W(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void CRC32CB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void CRC32CH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void CRC32CW(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void CRC32X(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void CRC32CX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+
+ // Data-Processing 3 source
+ void MADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
+ void MSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
+ void SMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
+ void SMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void SMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
+ void SMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void UMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
+ void UMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void UMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
+ void UMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void MUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void MNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+
+ // Logical (shifted register)
+ void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
+ void BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
+ void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
+ void ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
+ void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
+ void EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
+ void ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
+ void BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
+
+ // Wrap the above for saner syntax
+ void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { AND(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); }
+ void BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { BIC(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); }
+ void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { ORR(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); }
+ void ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { ORN(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); }
+ void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { EOR(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); }
+ void EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { EON(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); }
+ void ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { ANDS(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); }
+ void BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { BICS(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); }
+ // Convenience wrappers around ORR. These match the official convenience syntax.
+ void MOV(ARM64Reg Rd, ARM64Reg Rm, ArithOption Shift);
+ void MOV(ARM64Reg Rd, ARM64Reg Rm);
+ void MVN(ARM64Reg Rd, ARM64Reg Rm);
+
+ // Convenience wrappers around UBFM/EXTR.
+ void LSR(ARM64Reg Rd, ARM64Reg Rm, int shift);
+ void LSL(ARM64Reg Rd, ARM64Reg Rm, int shift);
+ void ASR(ARM64Reg Rd, ARM64Reg Rm, int shift);
+ void ROR_(ARM64Reg Rd, ARM64Reg Rm, int shift);
+
+ // Logical (immediate)
+ void AND(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false);
+ void ANDS(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false);
+ void EOR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false);
+ void ORR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false);
+ void TST(ARM64Reg Rn, u32 immr, u32 imms, bool invert = false);
+ void TST(ARM64Reg Rn, ARM64Reg Rm) { ANDS(Is64Bit(Rn) ? ZR : WZR, Rn, Rm); }
+ // Add/subtract (immediate)
+ void ADD(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false);
+ void ADDS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false);
+ void SUB(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false);
+ void SUBS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false);
+ void CMP(ARM64Reg Rn, u32 imm, bool shift = false);
+
+ // Data Processing (Immediate)
+ void MOVZ(ARM64Reg Rd, u32 imm, ShiftAmount pos = SHIFT_0);
+ void MOVN(ARM64Reg Rd, u32 imm, ShiftAmount pos = SHIFT_0);
+ void MOVK(ARM64Reg Rd, u32 imm, ShiftAmount pos = SHIFT_0);
+
+ // Bitfield move
+ void BFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms);
+ void SBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms);
+ void UBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms);
+ void BFI(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width);
+ void UBFIZ(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width);
+
+ // Extract register (ROR with two inputs, if same then faster on A67)
+ void EXTR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u32 shift);
+
+ // Aliases
+ void SXTB(ARM64Reg Rd, ARM64Reg Rn);
+ void SXTH(ARM64Reg Rd, ARM64Reg Rn);
+ void SXTW(ARM64Reg Rd, ARM64Reg Rn);
+ void UXTB(ARM64Reg Rd, ARM64Reg Rn);
+ void UXTH(ARM64Reg Rd, ARM64Reg Rn);
+
+ void UBFX(ARM64Reg Rd, ARM64Reg Rn, int lsb, int width) { UBFM(Rd, Rn, lsb, lsb + width - 1); }
+ // Load Register (Literal)
+ void LDR(ARM64Reg Rt, u32 imm);
+ void LDRSW(ARM64Reg Rt, u32 imm);
+ void PRFM(ARM64Reg Rt, u32 imm);
+
+ // Load/Store Exclusive
+ void STXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn);
+ void STLXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn);
+ void LDXRB(ARM64Reg Rt, ARM64Reg Rn);
+ void LDAXRB(ARM64Reg Rt, ARM64Reg Rn);
+ void STLRB(ARM64Reg Rt, ARM64Reg Rn);
+ void LDARB(ARM64Reg Rt, ARM64Reg Rn);
+ void STXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn);
+ void STLXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn);
+ void LDXRH(ARM64Reg Rt, ARM64Reg Rn);
+ void LDAXRH(ARM64Reg Rt, ARM64Reg Rn);
+ void STLRH(ARM64Reg Rt, ARM64Reg Rn);
+ void LDARH(ARM64Reg Rt, ARM64Reg Rn);
+ void STXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn);
+ void STLXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn);
+ void STXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn);
+ void STLXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn);
+ void LDXR(ARM64Reg Rt, ARM64Reg Rn);
+ void LDAXR(ARM64Reg Rt, ARM64Reg Rn);
+ void LDXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn);
+ void LDAXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn);
+ void STLR(ARM64Reg Rt, ARM64Reg Rn);
+ void LDAR(ARM64Reg Rt, ARM64Reg Rn);
+
+ // Load/Store no-allocate pair (offset)
+ void STNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm);
+ void LDNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm);
+
+ // Load/Store register (immediate indexed)
+ void STRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDRSB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void STRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDRSH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void STR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDRSW(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+
+ // Load/Store register (register offset)
+ void STRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void LDRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void LDRSB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void STRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void LDRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void LDRSH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void STR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void LDR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void LDRSW(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void PRFM(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+
+ // Load/Store register (unscaled offset)
+ void STURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDURSB(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void STURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDURSH(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void STUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void LDURSW(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+
+ // Load/Store pair
+ void LDP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
+ void LDPSW(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
+ void STP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
+
+ void LDRGeneric(int size, bool signExtend, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void STRGeneric(int size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+
+ void LDRGeneric(int size, bool signExtend, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void STRGeneric(int size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+
+ // Address of label/page PC-relative
+ void ADR(ARM64Reg Rd, s32 imm);
+ void ADRP(ARM64Reg Rd, s32 imm);
+
+ // Wrapper around MOVZ+MOVK
+ void MOVI2R(ARM64Reg Rd, u64 imm, bool optimize = true);
+ bool MOVI2R2(ARM64Reg Rd, u64 imm1, u64 imm2);
+ template <class P>
+ void MOVP2R(ARM64Reg Rd, P* ptr)
+ {
+ ASSERT_MSG(DYNA_REC, Is64Bit(Rd), "Can't store pointers in 32-bit registers");
+ MOVI2R(Rd, (uintptr_t)ptr);
+ }
+
+ // Wrapper around AND x, y, imm etc. If you are sure the imm will work, no need to pass a scratch
+ // register.
+ void ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+ void ANDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+ void TSTI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG)
+ {
+ ANDSI2R(Is64Bit(Rn) ? ZR : WZR, Rn, imm, scratch);
+ }
+ void ORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+ void EORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+ void CMPI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+
+ void ADDI2R_internal(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool negative, bool flags,
+ ARM64Reg scratch);
+ void ADDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+ void ADDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+ void SUBI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+ void SUBSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
+
+ bool TryADDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
+ bool TrySUBI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
+ bool TryCMPI2R(ARM64Reg Rn, u32 imm);
+
+ bool TryANDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
+ bool TryORRI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
+ bool TryEORI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
+
+ // ABI related
+ void ABI_PushRegisters(BitSet32 registers);
+ void ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask = BitSet32(0));
+
+ // Utility to generate a call to a std::function object.
+ //
+ // Unfortunately, calling operator() directly is undefined behavior in C++
+ // (this method might be a thunk in the case of multi-inheritance) so we
+ // have to go through a trampoline function.
+ template <typename T, typename... Args>
+ static T CallLambdaTrampoline(const std::function<T(Args...)>* f, Args... args)
+ {
+ return (*f)(args...);
+ }
+
+ // This function expects you to have set up the state.
+ // Overwrites X0 and X30
+ template <typename T, typename... Args>
+ ARM64Reg ABI_SetupLambda(const std::function<T(Args...)>* f)
+ {
+ auto trampoline = &ARM64XEmitter::CallLambdaTrampoline<T, Args...>;
+ MOVI2R(X30, (uintptr_t)trampoline);
+ MOVI2R(X0, (uintptr_t) const_cast<void*>((const void*)f));
+ return X30;
+ }
+
+ void QuickTailCall(ARM64Reg scratchreg, const void* func);
+ template <typename T>
+ void QuickTailCall(ARM64Reg scratchreg, T func)
+ {
+ QuickTailCall(scratchreg, (const void*)func);
+ }
+
+ // Plain function call
+ void QuickCallFunction(ARM64Reg scratchreg, const void* func);
+ template <typename T>
+ void QuickCallFunction(ARM64Reg scratchreg, T func)
+ {
+ QuickCallFunction(scratchreg, (const void*)func);
+ }
+};
+
+class ARM64FloatEmitter
+{
+public:
+ ARM64FloatEmitter(ARM64XEmitter* emit) : m_emit(emit) {}
+ void LDR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void STR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+
+ // Loadstore unscaled
+ void LDUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void STUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+
+ // Loadstore single structure
+ void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn);
+ void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm);
+ void LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn);
+ void LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn);
+ void LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm);
+ void LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm);
+ void ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn);
+ void ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm);
+
+ // Loadstore multiple structure
+ void LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn);
+ void LD1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm = SP);
+ void ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn);
+ void ST1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm = SP);
+
+ // Loadstore paired
+ void LDP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
+ void STP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
+
+ // Loadstore register offset
+ void STR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void LDR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+
+ // Scalar - 1 Source
+ void FABS(ARM64Reg Rd, ARM64Reg Rn);
+ void FNEG(ARM64Reg Rd, ARM64Reg Rn);
+ void FSQRT(ARM64Reg Rd, ARM64Reg Rn);
+ void FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top = false); // Also generalized move between GPR/FP
+
+ // Scalar - 2 Source
+ void FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FMAX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FMIN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FMAXNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FMINNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FNMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+
+ // Scalar - 3 Source. Note - the accumulator is last on ARM!
+ void FMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
+ void FMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
+ void FNMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
+ void FNMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
+
+ // Scalar floating point immediate
+ void FMOV(ARM64Reg Rd, uint8_t imm8);
+
+ // Vector
+ void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
+ void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FMLS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FMIN(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void FCVTL2(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void FCVTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
+ void FCVTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
+ void FCVTZS(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void FCVTZU(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void FDIV(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FNEG(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void FRECPE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void FRSQRTE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void FSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void NOT(ARM64Reg Rd, ARM64Reg Rn);
+ void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void MOV(ARM64Reg Rd, ARM64Reg Rn) { ORR(Rd, Rn, Rn); }
+ void REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void REV32(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void REV64(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale);
+ void UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale);
+ void SQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
+ void SQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
+ void UQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
+ void UQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
+ void XTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
+ void XTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
+
+ // Move
+ void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void INS(u8 size, ARM64Reg Rd, u8 index, ARM64Reg Rn);
+ void INS(u8 size, ARM64Reg Rd, u8 index1, ARM64Reg Rn, u8 index2);
+ void UMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
+ void SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
+
+ // One source
+ void FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn);
+
+ // Scalar convert float to int, in a lot of variants.
+ // Note that the scalar version of this operation has two encodings, one that goes to an integer
+ // register
+ // and one that outputs to a scalar fp register.
+ void FCVTS(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round);
+ void FCVTU(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round);
+
+ // Scalar convert int to float. No rounding mode specifier necessary.
+ void SCVTF(ARM64Reg Rd, ARM64Reg Rn);
+ void UCVTF(ARM64Reg Rd, ARM64Reg Rn);
+
+ // Scalar fixed point to float. scale is the number of fractional bits.
+ void SCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale);
+ void UCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale);
+
+ // Float comparison
+ void FCMP(ARM64Reg Rn, ARM64Reg Rm);
+ void FCMP(ARM64Reg Rn);
+ void FCMPE(ARM64Reg Rn, ARM64Reg Rm);
+ void FCMPE(ARM64Reg Rn);
+ void FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+ void FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+
+ // Conditional select
+ void FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);
+
+ // Permute
+ void UZP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void TRN1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void ZIP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void UZP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void TRN2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+
+ // Shift by immediate
+ void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
+ void SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
+ void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
+ void USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
+ void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
+ void SHRN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
+ void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn);
+ void SXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn);
+ void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn);
+ void UXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn);
+
+ // vector x indexed element
+ void FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index);
+ void FMLA(u8 esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index);
+
+ // Modified Immediate
+ void MOVI(u8 size, ARM64Reg Rd, u64 imm, u8 shift = 0);
+ void BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift = 0);
+
+ void MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG, bool negate = false);
+ void MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG);
+
+ // ABI related
+ void ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp = INVALID_REG);
+ void ABI_PopRegisters(BitSet32 registers, ARM64Reg tmp = INVALID_REG);
+
+private:
+ ARM64XEmitter* m_emit;
+ inline void Write32(u32 value) { m_emit->Write32(value); }
+ // Emitting functions
+ void EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void EmitScalar2Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn,
+ ARM64Reg Rm);
+ void EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn);
+ void Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
+ void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt,
+ ARM64Reg Rn);
+ void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt,
+ ARM64Reg Rn, ARM64Reg Rm);
+ void Emit1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
+ void EmitConversion(bool sf, bool S, u32 type, u32 rmode, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
+ void EmitConversion2(bool sf, bool S, bool direction, u32 type, u32 rmode, u32 opcode, int scale,
+ ARM64Reg Rd, ARM64Reg Rn);
+ void EmitCompare(bool M, bool S, u32 op, u32 opcode2, ARM64Reg Rn, ARM64Reg Rm);
+ void EmitCondSelect(bool M, bool S, CCFlags cond, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void EmitPermute(u32 size, u32 op, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+ void EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm8);
+ void EmitShiftImm(bool Q, bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
+ void EmitScalarShiftImm(bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
+ void EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn);
+ void EmitLoadStoreMultipleStructurePost(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn,
+ ARM64Reg Rm);
+ void EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
+ void EmitVectorxElement(bool U, u32 size, bool L, u32 opcode, bool H, ARM64Reg Rd, ARM64Reg Rn,
+ ARM64Reg Rm);
+ void EmitLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+ void EmitConvertScalarToInt(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round, bool sign);
+ void EmitScalar3Source(bool isDouble, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra,
+ int opcode);
+ void EncodeLoadStorePair(u32 size, bool load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2,
+ ARM64Reg Rn, s32 imm);
+ void EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+ void EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh);
+
+ void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
+ void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
+ void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
+ void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper);
+ void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper);
+};
+
+} \ No newline at end of file
diff --git a/src/dolphin/ArmCommon.h b/src/dolphin/ArmCommon.h
new file mode 100644
index 0000000..6d82e9d
--- /dev/null
+++ b/src/dolphin/ArmCommon.h
@@ -0,0 +1,27 @@
+// Copyright 2014 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license.txt file included.
+
+#include "../types.h"
+
+enum CCFlags
+{
+ CC_EQ = 0, // Equal
+ CC_NEQ, // Not equal
+ CC_CS, // Carry Set
+ CC_CC, // Carry Clear
+ CC_MI, // Minus (Negative)
+ CC_PL, // Plus
+ CC_VS, // Overflow
+ CC_VC, // No Overflow
+ CC_HI, // Unsigned higher
+ CC_LS, // Unsigned lower or same
+ CC_GE, // Signed greater than or equal
+ CC_LT, // Signed less than
+ CC_GT, // Signed greater than
+ CC_LE, // Signed less than or equal
+ CC_AL, // Always (unconditional) 14
+ CC_HS = CC_CS, // Alias of CC_CS Unsigned higher or same
+ CC_LO = CC_CC, // Alias of CC_CC Unsigned lower
+};
+const u32 NO_COND = 0xE0000000;
diff --git a/src/dolphin/BitSet.h b/src/dolphin/BitSet.h
new file mode 100644
index 0000000..d32b020
--- /dev/null
+++ b/src/dolphin/BitSet.h
@@ -0,0 +1,218 @@
+// This file is under the public domain.
+
+#pragma once
+
+#include <cstddef>
+#include <initializer_list>
+#include <type_traits>
+#include "../types.h"
+
+#ifdef _WIN32
+
+#include <intrin.h>
+
+namespace Common
+{
+template <typename T>
+constexpr int CountSetBits(T v)
+{
+ // from https://graphics.stanford.edu/~seander/bithacks.html
+ // GCC has this built in, but MSVC's intrinsic will only emit the actual
+ // POPCNT instruction, which we're not depending on
+ v = v - ((v >> 1) & (T) ~(T)0 / 3);
+ v = (v & (T) ~(T)0 / 15 * 3) + ((v >> 2) & (T) ~(T)0 / 15 * 3);
+ v = (v + (v >> 4)) & (T) ~(T)0 / 255 * 15;
+ return (T)(v * ((T) ~(T)0 / 255)) >> (sizeof(T) - 1) * 8;
+}
+inline int LeastSignificantSetBit(u8 val)
+{
+ unsigned long index;
+ _BitScanForward(&index, val);
+ return (int)index;
+}
+inline int LeastSignificantSetBit(u16 val)
+{
+ unsigned long index;
+ _BitScanForward(&index, val);
+ return (int)index;
+}
+inline int LeastSignificantSetBit(u32 val)
+{
+ unsigned long index;
+ _BitScanForward(&index, val);
+ return (int)index;
+}
+inline int LeastSignificantSetBit(u64 val)
+{
+ unsigned long index;
+ _BitScanForward64(&index, val);
+ return (int)index;
+}
+#else
+namespace Common
+{
+constexpr int CountSetBits(u8 val)
+{
+ return __builtin_popcount(val);
+}
+constexpr int CountSetBits(u16 val)
+{
+ return __builtin_popcount(val);
+}
+constexpr int CountSetBits(u32 val)
+{
+ return __builtin_popcount(val);
+}
+constexpr int CountSetBits(u64 val)
+{
+ return __builtin_popcountll(val);
+}
+inline int LeastSignificantSetBit(u8 val)
+{
+ return __builtin_ctz(val);
+}
+inline int LeastSignificantSetBit(u16 val)
+{
+ return __builtin_ctz(val);
+}
+inline int LeastSignificantSetBit(u32 val)
+{
+ return __builtin_ctz(val);
+}
+inline int LeastSignificantSetBit(u64 val)
+{
+ return __builtin_ctzll(val);
+}
+#endif
+
+// Similar to std::bitset, this is a class which encapsulates a bitset, i.e.
+// using the set bits of an integer to represent a set of integers. Like that
+// class, it acts like an array of bools:
+// BitSet32 bs;
+// bs[1] = true;
+// but also like the underlying integer ([0] = least significant bit):
+// BitSet32 bs2 = ...;
+// bs = (bs ^ bs2) & BitSet32(0xffff);
+// The following additional functionality is provided:
+// - Construction using an initializer list.
+// BitSet bs { 1, 2, 4, 8 };
+// - Efficiently iterating through the set bits:
+// for (int i : bs)
+// [i is the *index* of a set bit]
+// (This uses the appropriate CPU instruction to find the next set bit in one
+// operation.)
+// - Counting set bits using .Count() - see comment on that method.
+
+// TODO: use constexpr when MSVC gets out of the Dark Ages
+
+template <typename IntTy>
+class BitSet
+{
+ static_assert(!std::is_signed<IntTy>::value, "BitSet should not be used with signed types");
+
+public:
+ // A reference to a particular bit, returned from operator[].
+ class Ref
+ {
+ public:
+ constexpr Ref(Ref&& other) : m_bs(other.m_bs), m_mask(other.m_mask) {}
+ constexpr Ref(BitSet* bs, IntTy mask) : m_bs(bs), m_mask(mask) {}
+ constexpr operator bool() const { return (m_bs->m_val & m_mask) != 0; }
+ bool operator=(bool set)
+ {
+ m_bs->m_val = (m_bs->m_val & ~m_mask) | (set ? m_mask : 0);
+ return set;
+ }
+
+ private:
+ BitSet* m_bs;
+ IntTy m_mask;
+ };
+
+ // A STL-like iterator is required to be able to use range-based for loops.
+ class Iterator
+ {
+ public:
+ constexpr Iterator(const Iterator& other) : m_val(other.m_val), m_bit(other.m_bit) {}
+ constexpr Iterator(IntTy val, int bit) : m_val(val), m_bit(bit) {}
+ Iterator& operator=(Iterator other)
+ {
+ new (this) Iterator(other);
+ return *this;
+ }
+ Iterator& operator++()
+ {
+ if (m_val == 0)
+ {
+ m_bit = -1;
+ }
+ else
+ {
+ int bit = LeastSignificantSetBit(m_val);
+ m_val &= ~(1 << bit);
+ m_bit = bit;
+ }
+ return *this;
+ }
+ Iterator operator++(int)
+ {
+ Iterator other(*this);
+ ++*this;
+ return other;
+ }
+ constexpr int operator*() const { return m_bit; }
+ constexpr bool operator==(Iterator other) const { return m_bit == other.m_bit; }
+ constexpr bool operator!=(Iterator other) const { return m_bit != other.m_bit; }
+
+ private:
+ IntTy m_val;
+ int m_bit;
+ };
+
+ constexpr BitSet() : m_val(0) {}
+ constexpr explicit BitSet(IntTy val) : m_val(val) {}
+ BitSet(std::initializer_list<int> init)
+ {
+ m_val = 0;
+ for (int bit : init)
+ m_val |= (IntTy)1 << bit;
+ }
+
+ constexpr static BitSet AllTrue(size_t count)
+ {
+ return BitSet(count == sizeof(IntTy) * 8 ? ~(IntTy)0 : (((IntTy)1 << count) - 1));
+ }
+
+ Ref operator[](size_t bit) { return Ref(this, (IntTy)1 << bit); }
+ constexpr const Ref operator[](size_t bit) const { return (*const_cast<BitSet*>(this))[bit]; }
+ constexpr bool operator==(BitSet other) const { return m_val == other.m_val; }
+ constexpr bool operator!=(BitSet other) const { return m_val != other.m_val; }
+ constexpr bool operator<(BitSet other) const { return m_val < other.m_val; }
+ constexpr bool operator>(BitSet other) const { return m_val > other.m_val; }
+ constexpr BitSet operator|(BitSet other) const { return BitSet(m_val | other.m_val); }
+ constexpr BitSet operator&(BitSet other) const { return BitSet(m_val & other.m_val); }
+ constexpr BitSet operator^(BitSet other) const { return BitSet(m_val ^ other.m_val); }
+ constexpr BitSet operator~() const { return BitSet(~m_val); }
+ constexpr BitSet operator<<(IntTy shift) const { return BitSet(m_val << shift); }
+ constexpr BitSet operator>>(IntTy shift) const { return BitSet(m_val >> shift); }
+ constexpr explicit operator bool() const { return m_val != 0; }
+ BitSet& operator|=(BitSet other) { return *this = *this | other; }
+ BitSet& operator&=(BitSet other) { return *this = *this & other; }
+ BitSet& operator^=(BitSet other) { return *this = *this ^ other; }
+ BitSet& operator<<=(IntTy shift) { return *this = *this << shift; }
+ BitSet& operator>>=(IntTy shift) { return *this = *this >> shift; }
+ // Warning: Even though on modern CPUs this is a single fast instruction,
+ // Dolphin's official builds do not currently assume POPCNT support on x86,
+ // so slower explicit bit twiddling is generated. Still should generally
+ // be faster than a loop.
+ constexpr unsigned int Count() const { return CountSetBits(m_val); }
+ constexpr Iterator begin() const { return ++Iterator(m_val, 0); }
+ constexpr Iterator end() const { return Iterator(m_val, -1); }
+ IntTy m_val;
+};
+} // namespace Common
+
+using BitSet8 = Common::BitSet<u8>;
+using BitSet16 = Common::BitSet<u16>;
+using BitSet32 = Common::BitSet<u32>;
+using BitSet64 = Common::BitSet<u64>;
diff --git a/src/dolphin/BitUtils.h b/src/dolphin/BitUtils.h
new file mode 100644
index 0000000..8b64a92
--- /dev/null
+++ b/src/dolphin/BitUtils.h
@@ -0,0 +1,254 @@
+// Copyright 2017 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <climits>
+#include <cstddef>
+#include <cstring>
+#include <type_traits>
+
+namespace Common
+{
+///
+/// Retrieves the size of a type in bits.
+///
+/// @tparam T Type to get the size of.
+///
+/// @return the size of the type in bits.
+///
+template <typename T>
+constexpr size_t BitSize() noexcept
+{
+ return sizeof(T) * CHAR_BIT;
+}
+
+///
+/// Extracts a bit from a value.
+///
+/// @param src The value to extract a bit from.
+/// @param bit The bit to extract.
+///
+/// @tparam T The type of the value.
+///
+/// @return The extracted bit.
+///
+template <typename T>
+constexpr T ExtractBit(const T src, const size_t bit) noexcept
+{
+ return (src >> bit) & static_cast<T>(1);
+}
+
+///
+/// Extracts a bit from a value.
+///
+/// @param src The value to extract a bit from.
+///
+/// @tparam bit The bit to extract.
+/// @tparam T The type of the value.
+///
+/// @return The extracted bit.
+///
+template <size_t bit, typename T>
+constexpr T ExtractBit(const T src) noexcept
+{
+ static_assert(bit < BitSize<T>(), "Specified bit must be within T's bit width.");
+
+ return ExtractBit(src, bit);
+}
+
+///
+/// Extracts a range of bits from a value.
+///
+/// @param src The value to extract the bits from.
+/// @param begin The beginning of the bit range. This is inclusive.
+/// @param end The ending of the bit range. This is inclusive.
+///
+/// @tparam T The type of the value.
+/// @tparam Result The returned result type. This is the unsigned analog
+/// of a signed type if a signed type is passed as T.
+///
+/// @return The extracted bits.
+///
+template <typename T, typename Result = std::make_unsigned_t<T>>
+constexpr Result ExtractBits(const T src, const size_t begin, const size_t end) noexcept
+{
+ return static_cast<Result>(((static_cast<Result>(src) << ((BitSize<T>() - 1) - end)) >>
+ (BitSize<T>() - end + begin - 1)));
+}
+
+///
+/// Extracts a range of bits from a value.
+///
+/// @param src The value to extract the bits from.
+///
+/// @tparam begin The beginning of the bit range. This is inclusive.
+/// @tparam end The ending of the bit range. This is inclusive.
+/// @tparam T The type of the value.
+/// @tparam Result The returned result type. This is the unsigned analog
+/// of a signed type if a signed type is passed as T.
+///
+/// @return The extracted bits.
+///
+template <size_t begin, size_t end, typename T, typename Result = std::make_unsigned_t<T>>
+constexpr Result ExtractBits(const T src) noexcept
+{
+ static_assert(begin < end, "Beginning bit must be less than the ending bit.");
+ static_assert(begin < BitSize<T>(), "Beginning bit is larger than T's bit width.");
+ static_assert(end < BitSize<T>(), "Ending bit is larger than T's bit width.");
+
+ return ExtractBits<T, Result>(src, begin, end);
+}
+
+///
+/// Rotates a value left (ROL).
+///
+/// @param value The value to rotate.
+/// @param amount The number of bits to rotate the value.
+/// @tparam T An unsigned type.
+///
+/// @return The rotated value.
+///
+template <typename T>
+constexpr T RotateLeft(const T value, size_t amount) noexcept
+{
+ static_assert(std::is_unsigned<T>(), "Can only rotate unsigned types left.");
+
+ amount %= BitSize<T>();
+
+ if (amount == 0)
+ return value;
+
+ return static_cast<T>((value << amount) | (value >> (BitSize<T>() - amount)));
+}
+
+///
+/// Rotates a value right (ROR).
+///
+/// @param value The value to rotate.
+/// @param amount The number of bits to rotate the value.
+/// @tparam T An unsigned type.
+///
+/// @return The rotated value.
+///
+template <typename T>
+constexpr T RotateRight(const T value, size_t amount) noexcept
+{
+ static_assert(std::is_unsigned<T>(), "Can only rotate unsigned types right.");
+
+ amount %= BitSize<T>();
+
+ if (amount == 0)
+ return value;
+
+ return static_cast<T>((value >> amount) | (value << (BitSize<T>() - amount)));
+}
+
+///
+/// Verifies whether the supplied value is a valid bit mask of the form 0b00...0011...11.
+/// Both edge cases of all zeros and all ones are considered valid masks, too.
+///
+/// @param mask The mask value to test for validity.
+///
+/// @tparam T The type of the value.
+///
+/// @return A bool indicating whether the mask is valid.
+///
+template <typename T>
+constexpr bool IsValidLowMask(const T mask) noexcept
+{
+ static_assert(std::is_integral<T>::value, "Mask must be an integral type.");
+ static_assert(std::is_unsigned<T>::value, "Signed masks can introduce hard to find bugs.");
+
+ // Can be efficiently determined without looping or bit counting. It's the counterpart
+ // to https://graphics.stanford.edu/~seander/bithacks.html#DetermineIfPowerOf2
+ // and doesn't require special casing either edge case.
+ return (mask & (mask + 1)) == 0;
+}
+
+///
+/// Reinterpret objects of one type as another by bit-casting between object representations.
+///
+/// @remark This is the example implementation of std::bit_cast which is to be included
+/// in C++2a. See http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0476r2.html
+/// for more details. The only difference is this variant is not constexpr,
+/// as the mechanism for bit_cast requires a compiler built-in to have that quality.
+///
+/// @param source The source object to convert to another representation.
+///
+/// @tparam To The type to reinterpret source as.
+/// @tparam From The initial type representation of source.
+///
+/// @return The representation of type From as type To.
+///
+/// @pre Both To and From types must be the same size
+/// @pre Both To and From types must satisfy the TriviallyCopyable concept.
+///
+template <typename To, typename From>
+inline To BitCast(const From& source) noexcept
+{
+ static_assert(sizeof(From) == sizeof(To),
+ "BitCast source and destination types must be equal in size.");
+ static_assert(std::is_trivially_copyable<From>(),
+ "BitCast source type must be trivially copyable.");
+ static_assert(std::is_trivially_copyable<To>(),
+ "BitCast destination type must be trivially copyable.");
+
+ std::aligned_storage_t<sizeof(To), alignof(To)> storage;
+ std::memcpy(&storage, &source, sizeof(storage));
+ return reinterpret_cast<To&>(storage);
+}
+
+template <typename T, typename PtrType>
+class BitCastPtrType
+{
+public:
+ static_assert(std::is_trivially_copyable<PtrType>(),
+ "BitCastPtr source type must be trivially copyable.");
+ static_assert(std::is_trivially_copyable<T>(),
+ "BitCastPtr destination type must be trivially copyable.");
+
+ explicit BitCastPtrType(PtrType* ptr) : m_ptr(ptr) {}
+
+ // Enable operator= only for pointers to non-const data
+ template <typename S>
+ inline typename std::enable_if<std::is_same<S, T>() && !std::is_const<PtrType>()>::type
+ operator=(const S& source)
+ {
+ std::memcpy(m_ptr, &source, sizeof(source));
+ }
+
+ inline operator T() const
+ {
+ T result;
+ std::memcpy(&result, m_ptr, sizeof(result));
+ return result;
+ }
+
+private:
+ PtrType* m_ptr;
+};
+
+// Provides an aliasing-safe alternative to reinterpret_cast'ing pointers to structs
+// Conversion constructor and operator= provided for a convenient syntax.
+// Usage: MyStruct s = BitCastPtr<MyStruct>(some_ptr);
+// BitCastPtr<MyStruct>(some_ptr) = s;
+template <typename T, typename PtrType>
+inline auto BitCastPtr(PtrType* ptr) noexcept -> BitCastPtrType<T, PtrType>
+{
+ return BitCastPtrType<T, PtrType>{ptr};
+}
+
+template <typename T>
+void SetBit(T& value, size_t bit_number, bool bit_value)
+{
+ static_assert(std::is_unsigned<T>(), "SetBit is only sane on unsigned types.");
+
+ if (bit_value)
+ value |= (T{1} << bit_number);
+ else
+ value &= ~(T{1} << bit_number);
+}
+
+} // namespace Common
diff --git a/src/dolphin/CPUDetect.h b/src/dolphin/CPUDetect.h
new file mode 100644
index 0000000..bd4fd8d
--- /dev/null
+++ b/src/dolphin/CPUDetect.h
@@ -0,0 +1,76 @@
+// Copyright 2008 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+// Detect the CPU, so we'll know which optimizations to use
+#pragma once
+
+#include <string>
+
+enum class CPUVendor
+{
+ Intel,
+ AMD,
+ ARM,
+ Other,
+};
+
+struct CPUInfo
+{
+ CPUVendor vendor = CPUVendor::Intel;
+
+ char cpu_string[0x41] = {};
+ char brand_string[0x21] = {};
+ bool OS64bit = false;
+ bool CPU64bit = false;
+ bool Mode64bit = false;
+
+ bool HTT = false;
+ int num_cores = 0;
+ int logical_cpu_count = 0;
+
+ bool bSSE = false;
+ bool bSSE2 = false;
+ bool bSSE3 = false;
+ bool bSSSE3 = false;
+ bool bPOPCNT = false;
+ bool bSSE4_1 = false;
+ bool bSSE4_2 = false;
+ bool bLZCNT = false;
+ bool bSSE4A = false;
+ bool bAVX = false;
+ bool bAVX2 = false;
+ bool bBMI1 = false;
+ bool bBMI2 = false;
+ bool bFMA = false;
+ bool bFMA4 = false;
+ bool bAES = false;
+ // FXSAVE/FXRSTOR
+ bool bFXSR = false;
+ bool bMOVBE = false;
+ // This flag indicates that the hardware supports some mode
+ // in which denormal inputs _and_ outputs are automatically set to (signed) zero.
+ bool bFlushToZero = false;
+ bool bLAHFSAHF64 = false;
+ bool bLongMode = false;
+ bool bAtom = false;
+
+ // ARMv8 specific
+ bool bFP = false;
+ bool bASIMD = false;
+ bool bCRC32 = false;
+ bool bSHA1 = false;
+ bool bSHA2 = false;
+
+ // Call Detect()
+ explicit CPUInfo();
+
+ // Turn the CPU info into a string we can show
+ std::string Summarize();
+
+private:
+ // Detects the various CPU features
+ void Detect();
+};
+
+extern CPUInfo cpu_info;
diff --git a/src/dolphin/CommonFuncs.cpp b/src/dolphin/CommonFuncs.cpp
new file mode 100644
index 0000000..f85051d
--- /dev/null
+++ b/src/dolphin/CommonFuncs.cpp
@@ -0,0 +1,52 @@
+// Copyright 2009 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+#include <cstddef>
+#include <cstring>
+#include <errno.h>
+#include <type_traits>
+
+#include "CommonFuncs.h"
+
+#ifdef _WIN32
+#include <windows.h>
+#define strerror_r(err, buf, len) strerror_s(buf, len, err)
+#endif
+
+constexpr size_t BUFFER_SIZE = 256;
+
+// Wrapper function to get last strerror(errno) string.
+// This function might change the error code.
+std::string LastStrerrorString()
+{
+ char error_message[BUFFER_SIZE];
+
+ // There are two variants of strerror_r. The XSI version stores the message to the passed-in
+ // buffer and returns an int (0 on success). The GNU version returns a pointer to the message,
+ // which might have been stored in the passed-in buffer or might be a static string.
+
+ // We check defines in order to figure out variant is in use, and we store the returned value
+ // to a variable so that we'll get a compile-time check that our assumption was correct.
+
+#if defined(__GLIBC__) && (_GNU_SOURCE || (_POSIX_C_SOURCE < 200112L && _XOPEN_SOURCE < 600))
+ const char* str = strerror_r(errno, error_message, BUFFER_SIZE);
+ return std::string(str);
+#else
+ int error_code = strerror_r(errno, error_message, BUFFER_SIZE);
+ return error_code == 0 ? std::string(error_message) : "";
+#endif
+}
+
+#ifdef _WIN32
+// Wrapper function to get GetLastError() string.
+// This function might change the error code.
+std::string GetLastErrorString()
+{
+ char error_message[BUFFER_SIZE];
+
+ FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM, nullptr, GetLastError(),
+ MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), error_message, BUFFER_SIZE, nullptr);
+ return std::string(error_message);
+}
+#endif
diff --git a/src/dolphin/CommonFuncs.h b/src/dolphin/CommonFuncs.h
new file mode 100644
index 0000000..708fbc3
--- /dev/null
+++ b/src/dolphin/CommonFuncs.h
@@ -0,0 +1,58 @@
+// Copyright 2009 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+#pragma once
+
+#include <cstddef>
+#include <string>
+#include "../types.h"
+
+// Will fail to compile on a non-array:
+template <typename T, size_t N>
+constexpr size_t ArraySize(T (&arr)[N])
+{
+ return N;
+}
+
+#ifndef _WIN32
+
+// go to debugger mode
+#define Crash() \
+ { \
+ __builtin_trap(); \
+ }
+
+#else // WIN32
+// Function Cross-Compatibility
+#define strcasecmp _stricmp
+#define strncasecmp _strnicmp
+#define unlink _unlink
+#define vscprintf _vscprintf
+
+// 64 bit offsets for Windows
+#define fseeko _fseeki64
+#define ftello _ftelli64
+#define atoll _atoi64
+#define stat _stat64
+#define fstat _fstat64
+#define fileno _fileno
+
+extern "C" {
+__declspec(dllimport) void __stdcall DebugBreak(void);
+}
+#define Crash() \
+ { \
+ DebugBreak(); \
+ }
+#endif // WIN32 ndef
+
+// Wrapper function to get last strerror(errno) string.
+// This function might change the error code.
+std::string LastStrerrorString();
+
+#ifdef _WIN32
+// Wrapper function to get GetLastError() string.
+// This function might change the error code.
+std::string GetLastErrorString();
+#endif
diff --git a/src/dolphin/Compat.h b/src/dolphin/Compat.h
new file mode 100644
index 0000000..787d505
--- /dev/null
+++ b/src/dolphin/Compat.h
@@ -0,0 +1,75 @@
+// Stubs for Assert.h and Log.h
+#pragma once
+
+#include <assert.h>
+
+// Assert stub
+#define ASSERT_MSG(_t_, _a_, _fmt_, ...) \
+ assert(_a_) \
+ /*do \
+ { \
+ if (!(_a_)) \
+ { \
+ if (!PanicYesNo(_fmt_, ##__VA_ARGS__)) \
+ Crash(); \
+ } \
+ } while (0)*/
+
+#define DEBUG_ASSERT_MSG(_t_, _a_, _msg_, ...) \
+ assert(_a_); \
+ /*do \
+ { \
+ if (MAX_LOGLEVEL >= LogTypes::LOG_LEVELS::LDEBUG && !(_a_)) \
+ { \
+ ERROR_LOG(_t_, _msg_, ##__VA_ARGS__); \
+ if (!PanicYesNo(_msg_, ##__VA_ARGS__)) \
+ Crash(); \
+ } \
+ } while (0)*/
+
+#define ASSERT(_a_) \
+ assert(_a_) \
+ /*do \
+ { \
+ ASSERT_MSG(MASTER_LOG, _a_, \
+ _trans("An error occurred.\n\n Line: %d\n File: %s\n\nIgnore and continue?"), \
+ __LINE__, __FILE__); \
+ } while (0)*/
+
+#define DEBUG_ASSERT(_a_) \
+ assert(_a_) \
+ /*do \
+ { \
+ if (MAX_LOGLEVEL >= LogTypes::LOG_LEVELS::LDEBUG) \
+ ASSERT(_a_); \
+ } while (0)*/
+
+// Log Stub
+#include <cstdio>
+
+#define PanicAlert(fmt, ...) \
+ do \
+ { \
+ printf(fmt "\n", ## __VA_ARGS__); \
+ abort(); \
+ } while (false)
+
+#define DYNA_REC 0
+
+#define ERROR_LOG(which, fmt, ...) \
+ do \
+ { \
+ printf(fmt "\n", ## __VA_ARGS__); \
+ } while (false)
+
+#if __cplusplus < 201703L
+// cheat
+namespace std
+{
+template <typename T>
+T clamp(const T& v, const T& lo, const T& hi)
+{
+ return v < lo ? lo : (v > hi ? hi : v);
+}
+}
+#endif \ No newline at end of file
diff --git a/src/dolphin/MathUtil.cpp b/src/dolphin/MathUtil.cpp
new file mode 100644
index 0000000..70f2ede
--- /dev/null
+++ b/src/dolphin/MathUtil.cpp
@@ -0,0 +1,13 @@
+// Copyright 2008 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license.txt file included.
+
+#include "MathUtil.h"
+
+#include <numeric>
+
+// Calculate sum of a float list
+float MathFloatVectorSum(const std::vector<float>& Vec)
+{
+ return std::accumulate(Vec.begin(), Vec.end(), 0.0f);
+}
diff --git a/src/dolphin/MathUtil.h b/src/dolphin/MathUtil.h
new file mode 100644
index 0000000..b1dbbae
--- /dev/null
+++ b/src/dolphin/MathUtil.h
@@ -0,0 +1,121 @@
+// Copyright 2008 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <algorithm>
+#include <vector>
+
+#include "Compat.h"
+
+#include "../types.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+namespace MathUtil
+{
+constexpr double TAU = 6.2831853071795865;
+constexpr double PI = TAU / 2;
+
+template <typename T>
+constexpr auto Sign(const T& val) -> decltype((T{} < val) - (val < T{}))
+{
+ return (T{} < val) - (val < T{});
+}
+
+template <typename T, typename F>
+constexpr auto Lerp(const T& x, const T& y, const F& a) -> decltype(x + (y - x) * a)
+{
+ return x + (y - x) * a;
+}
+
+template <typename T>
+constexpr bool IsPow2(T imm)
+{
+ return imm > 0 && (imm & (imm - 1)) == 0;
+}
+
+constexpr u32 NextPowerOf2(u32 value)
+{
+ --value;
+ value |= value >> 1;
+ value |= value >> 2;
+ value |= value >> 4;
+ value |= value >> 8;
+ value |= value >> 16;
+ ++value;
+
+ return value;
+}
+
+template <class T>
+struct Rectangle
+{
+ T left{};
+ T top{};
+ T right{};
+ T bottom{};
+
+ constexpr Rectangle() = default;
+
+ constexpr Rectangle(T theLeft, T theTop, T theRight, T theBottom)
+ : left(theLeft), top(theTop), right(theRight), bottom(theBottom)
+ {
+ }
+
+ constexpr bool operator==(const Rectangle& r) const
+ {
+ return left == r.left && top == r.top && right == r.right && bottom == r.bottom;
+ }
+
+ T GetWidth() const { return abs(right - left); }
+ T GetHeight() const { return abs(bottom - top); }
+ // If the rectangle is in a coordinate system with a lower-left origin, use
+ // this Clamp.
+ void ClampLL(T x1, T y1, T x2, T y2)
+ {
+ left = std::clamp(left, x1, x2);
+ right = std::clamp(right, x1, x2);
+ top = std::clamp(top, y2, y1);
+ bottom = std::clamp(bottom, y2, y1);
+ }
+
+ // If the rectangle is in a coordinate system with an upper-left origin,
+ // use this Clamp.
+ void ClampUL(T x1, T y1, T x2, T y2)
+ {
+ left = std::clamp(left, x1, x2);
+ right = std::clamp(right, x1, x2);
+ top = std::clamp(top, y1, y2);
+ bottom = std::clamp(bottom, y1, y2);
+ }
+};
+
+} // namespace MathUtil
+
+float MathFloatVectorSum(const std::vector<float>&);
+
+// Rounds down. 0 -> undefined
+inline int IntLog2(u64 val)
+{
+#if defined(__GNUC__)
+ return 63 - __builtin_clzll(val);
+
+#elif defined(_MSC_VER)
+ unsigned long result = ULONG_MAX;
+ _BitScanReverse64(&result, val);
+ return result;
+
+#else
+ int result = -1;
+ while (val != 0)
+ {
+ val >>= 1;
+ ++result;
+ }
+ return result;
+#endif
+}
diff --git a/src/dolphin/license_dolphin.txt b/src/dolphin/license_dolphin.txt
new file mode 100644
index 0000000..d511905
--- /dev/null
+++ b/src/dolphin/license_dolphin.txt
@@ -0,0 +1,339 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/src/dolphin/x64ABI.cpp b/src/dolphin/x64ABI.cpp
new file mode 100644
index 0000000..d86a158
--- /dev/null
+++ b/src/dolphin/x64ABI.cpp
@@ -0,0 +1,119 @@
+// Copyright 2008 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+#include "../types.h"
+#include "x64ABI.h"
+#include "x64Emitter.h"
+
+using namespace Gen;
+
+// Shared code between Win64 and Unix64
+
+void XEmitter::ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size,
+ size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp)
+{
+ size_t shadow = 0;
+#if defined(_WIN32)
+ shadow = 0x20;
+#endif
+
+ int count = (mask & ABI_ALL_GPRS).Count();
+ rsp_alignment -= count * 8;
+ size_t subtraction = 0;
+ int fpr_count = (mask & ABI_ALL_FPRS).Count();
+ if (fpr_count)
+ {
+ // If we have any XMMs to save, we must align the stack here.
+ subtraction = rsp_alignment & 0xf;
+ }
+ subtraction += 16 * fpr_count;
+ size_t xmm_base_subtraction = subtraction;
+ subtraction += needed_frame_size;
+ subtraction += shadow;
+ // Final alignment.
+ rsp_alignment -= subtraction;
+ subtraction += rsp_alignment & 0xf;
+
+ *shadowp = shadow;
+ *subtractionp = subtraction;
+ *xmm_offsetp = subtraction - xmm_base_subtraction;
+}
+
+size_t XEmitter::ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment,
+ size_t needed_frame_size)
+{
+ size_t shadow, subtraction, xmm_offset;
+ ABI_CalculateFrameSize(mask, rsp_alignment, needed_frame_size, &shadow, &subtraction,
+ &xmm_offset);
+
+ for (int r : mask& ABI_ALL_GPRS)
+ PUSH((X64Reg)r);
+
+ if (subtraction)
+ SUB(64, R(RSP), subtraction >= 0x80 ? Imm32((u32)subtraction) : Imm8((u8)subtraction));
+
+ for (int x : mask& ABI_ALL_FPRS)
+ {
+ MOVAPD(MDisp(RSP, (int)xmm_offset), (X64Reg)(x - 16));
+ xmm_offset += 16;
+ }
+
+ return shadow;
+}
+
+void XEmitter::ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment,
+ size_t needed_frame_size)
+{
+ size_t shadow, subtraction, xmm_offset;
+ ABI_CalculateFrameSize(mask, rsp_alignment, needed_frame_size, &shadow, &subtraction,
+ &xmm_offset);
+
+ for (int x : mask& ABI_ALL_FPRS)
+ {
+ MOVAPD((X64Reg)(x - 16), MDisp(RSP, (int)xmm_offset));
+ xmm_offset += 16;
+ }
+
+ if (subtraction)
+ ADD(64, R(RSP), subtraction >= 0x80 ? Imm32((u32)subtraction) : Imm8((u8)subtraction));
+
+ for (int r = 15; r >= 0; r--)
+ {
+ if (mask[r])
+ POP((X64Reg)r);
+ }
+}
+
+void XEmitter::MOVTwo(int bits, Gen::X64Reg dst1, Gen::X64Reg src1, s32 offset1, Gen::X64Reg dst2,
+ Gen::X64Reg src2)
+{
+ if (dst1 == src2 && dst2 == src1)
+ {
+ XCHG(bits, R(src1), R(src2));
+ if (offset1)
+ ADD(bits, R(dst1), Imm32(offset1));
+ }
+ else if (src2 != dst1)
+ {
+ if (dst1 != src1 && offset1)
+ LEA(bits, dst1, MDisp(src1, offset1));
+ else if (dst1 != src1)
+ MOV(bits, R(dst1), R(src1));
+ else if (offset1)
+ ADD(bits, R(dst1), Imm32(offset1));
+ if (dst2 != src2)
+ MOV(bits, R(dst2), R(src2));
+ }
+ else
+ {
+ if (dst2 != src2)
+ MOV(bits, R(dst2), R(src2));
+ if (dst1 != src1 && offset1)
+ LEA(bits, dst1, MDisp(src1, offset1));
+ else if (dst1 != src1)
+ MOV(bits, R(dst1), R(src1));
+ else if (offset1)
+ ADD(bits, R(dst1), Imm32(offset1));
+ }
+}
diff --git a/src/dolphin/x64ABI.h b/src/dolphin/x64ABI.h
new file mode 100644
index 0000000..94336d0
--- /dev/null
+++ b/src/dolphin/x64ABI.h
@@ -0,0 +1,58 @@
+// Copyright 2008 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+#pragma once
+
+#include "BitSet.h"
+#include "x64Reg.h"
+
+// x64 ABI:s, and helpers to help follow them when JIT-ing code.
+// All convensions return values in EAX (+ possibly EDX).
+
+// Windows 64-bit
+// * 4-reg "fastcall" variant, very new-skool stack handling
+// * Callee moves stack pointer, to make room for shadow regs for the biggest function _it itself
+// calls_
+// * Parameters passed in RCX, RDX, ... further parameters are MOVed into the allocated stack space.
+// Scratch: RAX RCX RDX R8 R9 R10 R11
+// Callee-save: RBX RSI RDI RBP R12 R13 R14 R15
+// Parameters: RCX RDX R8 R9, further MOV-ed
+
+// Linux 64-bit
+// * 6-reg "fastcall" variant, old skool stack handling (parameters are pushed)
+// Scratch: RAX RCX RDX RSI RDI R8 R9 R10 R11
+// Callee-save: RBX RBP R12 R13 R14 R15
+// Parameters: RDI RSI RDX RCX R8 R9
+
+#define ABI_ALL_FPRS BitSet32(0xffff0000)
+#define ABI_ALL_GPRS BitSet32(0x0000ffff)
+
+#ifdef _WIN32 // 64-bit Windows - the really exotic calling convention
+
+#define ABI_PARAM1 RCX
+#define ABI_PARAM2 RDX
+#define ABI_PARAM3 R8
+#define ABI_PARAM4 R9
+
+// xmm0-xmm15 use the upper 16 bits in the functions that push/pop registers.
+#define ABI_ALL_CALLER_SAVED \
+ (BitSet32{RAX, RCX, RDX, R8, R9, R10, R11, XMM0 + 16, XMM1 + 16, XMM2 + 16, XMM3 + 16, \
+ XMM4 + 16, XMM5 + 16})
+#else // 64-bit Unix / OS X
+
+#define ABI_PARAM1 RDI
+#define ABI_PARAM2 RSI
+#define ABI_PARAM3 RDX
+#define ABI_PARAM4 RCX
+#define ABI_PARAM5 R8
+#define ABI_PARAM6 R9
+
+// FIXME: avoid pushing all 16 XMM registers when possible? most functions we call probably
+// don't actually clobber them.
+#define ABI_ALL_CALLER_SAVED (BitSet32{RAX, RCX, RDX, RDI, RSI, R8, R9, R10, R11} | ABI_ALL_FPRS)
+#endif // WIN32
+
+#define ABI_ALL_CALLEE_SAVED (~ABI_ALL_CALLER_SAVED)
+
+#define ABI_RETURN RAX
diff --git a/src/dolphin/x64CPUDetect.cpp b/src/dolphin/x64CPUDetect.cpp
new file mode 100644
index 0000000..49b51c9
--- /dev/null
+++ b/src/dolphin/x64CPUDetect.cpp
@@ -0,0 +1,273 @@
+// Copyright 2008 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+#include <cstring>
+#include <string>
+
+#include "CPUDetect.h"
+#include "../types.h"
+
+#ifndef _MSVC_VER
+
+#ifdef __FreeBSD__
+#include <unistd.h>
+
+#include <machine/cpufunc.h>
+#include <sys/types.h>
+#endif
+
+static inline void __cpuidex(int info[4], int function_id, int subfunction_id)
+{
+#ifdef __FreeBSD__
+ // Despite the name, this is just do_cpuid() with ECX as second input.
+ cpuid_count((u_int)function_id, (u_int)subfunction_id, (u_int*)info);
+#else
+ info[0] = function_id; // eax
+ info[2] = subfunction_id; // ecx
+ __asm__("cpuid"
+ : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3])
+ : "a"(function_id), "c"(subfunction_id));
+#endif
+}
+
+static inline void __cpuid(int info[4], int function_id)
+{
+ return __cpuidex(info, function_id, 0);
+}
+
+#endif // ifndef _WIN32
+
+#ifdef _MSVC_VER
+
+static u64 xgetbv(u32 index)
+{
+ return _xgetbv(index);
+}
+constexpr u32 XCR_XFEATURE_ENABLED_MASK = _XCR_XFEATURE_ENABLED_MASK;
+
+#else
+
+static u64 xgetbv(u32 index)
+{
+ u32 eax, edx;
+ __asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index));
+ return ((u64)edx << 32) | eax;
+}
+constexpr u32 XCR_XFEATURE_ENABLED_MASK = 0;
+#endif // ifdef _WIN32
+
+CPUInfo cpu_info;
+
+CPUInfo::CPUInfo()
+{
+ Detect();
+}
+
+// Detects the various CPU features
+void CPUInfo::Detect()
+{
+#ifdef _M_X86_64
+ Mode64bit = true;
+ OS64bit = true;
+#endif
+ num_cores = 1;
+
+ // Set obvious defaults, for extra safety
+ if (Mode64bit)
+ {
+ bSSE = true;
+ bSSE2 = true;
+ bLongMode = true;
+ }
+
+ // Assume CPU supports the CPUID instruction. Those that don't can barely
+ // boot modern OS:es anyway.
+ int cpu_id[4];
+
+ // Detect CPU's CPUID capabilities, and grab CPU string
+ __cpuid(cpu_id, 0x00000000);
+ u32 max_std_fn = cpu_id[0]; // EAX
+ std::memcpy(&brand_string[0], &cpu_id[1], sizeof(int));
+ std::memcpy(&brand_string[4], &cpu_id[3], sizeof(int));
+ std::memcpy(&brand_string[8], &cpu_id[2], sizeof(int));
+ __cpuid(cpu_id, 0x80000000);
+ u32 max_ex_fn = cpu_id[0];
+ if (!strcmp(brand_string, "GenuineIntel"))
+ vendor = CPUVendor::Intel;
+ else if (!strcmp(brand_string, "AuthenticAMD"))
+ vendor = CPUVendor::AMD;
+ else
+ vendor = CPUVendor::Other;
+
+ // Set reasonable default brand string even if brand string not available.
+ strcpy(cpu_string, brand_string);
+
+ // Detect family and other misc stuff.
+ bool ht = false;
+ HTT = ht;
+ logical_cpu_count = 1;
+ if (max_std_fn >= 1)
+ {
+ __cpuid(cpu_id, 0x00000001);
+ int family = ((cpu_id[0] >> 8) & 0xf) + ((cpu_id[0] >> 20) & 0xff);
+ int model = ((cpu_id[0] >> 4) & 0xf) + ((cpu_id[0] >> 12) & 0xf0);
+ // Detect people unfortunate enough to be running Dolphin on an Atom
+ if (family == 6 &&
+ (model == 0x1C || model == 0x26 || model == 0x27 || model == 0x35 || model == 0x36 ||
+ model == 0x37 || model == 0x4A || model == 0x4D || model == 0x5A || model == 0x5D))
+ bAtom = true;
+ logical_cpu_count = (cpu_id[1] >> 16) & 0xFF;
+ ht = (cpu_id[3] >> 28) & 1;
+
+ if ((cpu_id[3] >> 25) & 1)
+ bSSE = true;
+ if ((cpu_id[3] >> 26) & 1)
+ bSSE2 = true;
+ if ((cpu_id[2]) & 1)
+ bSSE3 = true;
+ if ((cpu_id[2] >> 9) & 1)
+ bSSSE3 = true;
+ if ((cpu_id[2] >> 19) & 1)
+ bSSE4_1 = true;
+ if ((cpu_id[2] >> 20) & 1)
+ bSSE4_2 = true;
+ if ((cpu_id[2] >> 22) & 1)
+ bMOVBE = true;
+ if ((cpu_id[2] >> 25) & 1)
+ bAES = true;
+
+ if ((cpu_id[3] >> 24) & 1)
+ {
+ // We can use FXSAVE.
+ bFXSR = true;
+ }
+
+ // AVX support requires 3 separate checks:
+ // - Is the AVX bit set in CPUID?
+ // - Is the XSAVE bit set in CPUID?
+ // - XGETBV result has the XCR bit set.
+ if (((cpu_id[2] >> 28) & 1) && ((cpu_id[2] >> 27) & 1))
+ {
+ if ((xgetbv(XCR_XFEATURE_ENABLED_MASK) & 0x6) == 0x6)
+ {
+ bAVX = true;
+ if ((cpu_id[2] >> 12) & 1)
+ bFMA = true;
+ }
+ }
+
+ if (max_std_fn >= 7)
+ {
+ __cpuidex(cpu_id, 0x00000007, 0x00000000);
+ // careful; we can't enable AVX2 unless the XSAVE/XGETBV checks above passed
+ if ((cpu_id[1] >> 5) & 1)
+ bAVX2 = bAVX;
+ if ((cpu_id[1] >> 3) & 1)
+ bBMI1 = true;
+ if ((cpu_id[1] >> 8) & 1)
+ bBMI2 = true;
+ }
+ }
+
+ bFlushToZero = bSSE;
+
+ if (max_ex_fn >= 0x80000004)
+ {
+ // Extract CPU model string
+ __cpuid(cpu_id, 0x80000002);
+ memcpy(cpu_string, cpu_id, sizeof(cpu_id));
+ __cpuid(cpu_id, 0x80000003);
+ memcpy(cpu_string + 16, cpu_id, sizeof(cpu_id));
+ __cpuid(cpu_id, 0x80000004);
+ memcpy(cpu_string + 32, cpu_id, sizeof(cpu_id));
+ }
+ if (max_ex_fn >= 0x80000001)
+ {
+ // Check for more features.
+ __cpuid(cpu_id, 0x80000001);
+ if (cpu_id[2] & 1)
+ bLAHFSAHF64 = true;
+ if ((cpu_id[2] >> 5) & 1)
+ bLZCNT = true;
+ if ((cpu_id[2] >> 16) & 1)
+ bFMA4 = true;
+ if ((cpu_id[3] >> 29) & 1)
+ bLongMode = true;
+ }
+
+ num_cores = (logical_cpu_count == 0) ? 1 : logical_cpu_count;
+
+ if (max_ex_fn >= 0x80000008)
+ {
+ // Get number of cores. This is a bit complicated. Following AMD manual here.
+ __cpuid(cpu_id, 0x80000008);
+ int apic_id_core_id_size = (cpu_id[2] >> 12) & 0xF;
+ if (apic_id_core_id_size == 0)
+ {
+ if (ht)
+ {
+ // New mechanism for modern Intel CPUs.
+ if (vendor == CPUVendor::Intel)
+ {
+ __cpuidex(cpu_id, 0x00000004, 0x00000000);
+ int cores_x_package = ((cpu_id[0] >> 26) & 0x3F) + 1;
+ HTT = (cores_x_package < logical_cpu_count);
+ cores_x_package = ((logical_cpu_count % cores_x_package) == 0) ? cores_x_package : 1;
+ num_cores = (cores_x_package > 1) ? cores_x_package : num_cores;
+ logical_cpu_count /= cores_x_package;
+ }
+ }
+ }
+ else
+ {
+ // Use AMD's new method.
+ num_cores = (cpu_id[2] & 0xFF) + 1;
+ }
+ }
+}
+
+// Turn the CPU info into a string we can show
+std::string CPUInfo::Summarize()
+{
+ std::string sum(cpu_string);
+ sum += " (";
+ sum += brand_string;
+ sum += ")";
+
+ if (bSSE)
+ sum += ", SSE";
+ if (bSSE2)
+ {
+ sum += ", SSE2";
+ if (!bFlushToZero)
+ sum += " (but not DAZ!)";
+ }
+ if (bSSE3)
+ sum += ", SSE3";
+ if (bSSSE3)
+ sum += ", SSSE3";
+ if (bSSE4_1)
+ sum += ", SSE4.1";
+ if (bSSE4_2)
+ sum += ", SSE4.2";
+ if (HTT)
+ sum += ", HTT";
+ if (bAVX)
+ sum += ", AVX";
+ if (bAVX2)
+ sum += ", AVX2";
+ if (bBMI1)
+ sum += ", BMI1";
+ if (bBMI2)
+ sum += ", BMI2";
+ if (bFMA)
+ sum += ", FMA";
+ if (bAES)
+ sum += ", AES";
+ if (bMOVBE)
+ sum += ", MOVBE";
+ if (bLongMode)
+ sum += ", 64-bit support";
+ return sum;
+}
diff --git a/src/dolphin/x64Emitter.cpp b/src/dolphin/x64Emitter.cpp
new file mode 100644
index 0000000..343f314
--- /dev/null
+++ b/src/dolphin/x64Emitter.cpp
@@ -0,0 +1,3399 @@
+// Copyright 2008 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+#include <cinttypes>
+#include <cstring>
+
+#include "CPUDetect.h"
+#include "../types.h"
+#include "x64Emitter.h"
+#include "x64Reg.h"
+#include "Compat.h"
+#include "CommonFuncs.h"
+
+namespace Gen
+{
+// TODO(ector): Add EAX special casing, for ever so slightly smaller code.
+struct NormalOpDef
+{
+ u8 toRm8, toRm32, fromRm8, fromRm32, imm8, imm32, simm8, eaximm8, eaximm32, ext;
+};
+
+// 0xCC is code for invalid combination of immediates
+static const NormalOpDef normalops[11] = {
+ {0x00, 0x01, 0x02, 0x03, 0x80, 0x81, 0x83, 0x04, 0x05, 0}, // ADD
+ {0x10, 0x11, 0x12, 0x13, 0x80, 0x81, 0x83, 0x14, 0x15, 2}, // ADC
+
+ {0x28, 0x29, 0x2A, 0x2B, 0x80, 0x81, 0x83, 0x2C, 0x2D, 5}, // SUB
+ {0x18, 0x19, 0x1A, 0x1B, 0x80, 0x81, 0x83, 0x1C, 0x1D, 3}, // SBB
+
+ {0x20, 0x21, 0x22, 0x23, 0x80, 0x81, 0x83, 0x24, 0x25, 4}, // AND
+ {0x08, 0x09, 0x0A, 0x0B, 0x80, 0x81, 0x83, 0x0C, 0x0D, 1}, // OR
+
+ {0x30, 0x31, 0x32, 0x33, 0x80, 0x81, 0x83, 0x34, 0x35, 6}, // XOR
+ {0x88, 0x89, 0x8A, 0x8B, 0xC6, 0xC7, 0xCC, 0xCC, 0xCC, 0}, // MOV
+
+ {0x84, 0x85, 0x84, 0x85, 0xF6, 0xF7, 0xCC, 0xA8, 0xA9, 0}, // TEST (to == from)
+ {0x38, 0x39, 0x3A, 0x3B, 0x80, 0x81, 0x83, 0x3C, 0x3D, 7}, // CMP
+
+ {0x86, 0x87, 0x86, 0x87, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 7}, // XCHG
+};
+
+enum NormalSSEOps
+{
+ sseCMP = 0xC2,
+ sseADD = 0x58, // ADD
+ sseSUB = 0x5C, // SUB
+ sseAND = 0x54, // AND
+ sseANDN = 0x55, // ANDN
+ sseOR = 0x56,
+ sseXOR = 0x57,
+ sseMUL = 0x59, // MUL
+ sseDIV = 0x5E, // DIV
+ sseMIN = 0x5D, // MIN
+ sseMAX = 0x5F, // MAX
+ sseCOMIS = 0x2F, // COMIS
+ sseUCOMIS = 0x2E, // UCOMIS
+ sseSQRT = 0x51, // SQRT
+ sseRCP = 0x53, // RCP
+ sseRSQRT = 0x52, // RSQRT (NO DOUBLE PRECISION!!!)
+ sseMOVAPfromRM = 0x28, // MOVAP from RM
+ sseMOVAPtoRM = 0x29, // MOVAP to RM
+ sseMOVUPfromRM = 0x10, // MOVUP from RM
+ sseMOVUPtoRM = 0x11, // MOVUP to RM
+ sseMOVLPfromRM = 0x12,
+ sseMOVLPtoRM = 0x13,
+ sseMOVHPfromRM = 0x16,
+ sseMOVHPtoRM = 0x17,
+ sseMOVHLPS = 0x12,
+ sseMOVLHPS = 0x16,
+ sseMOVDQfromRM = 0x6F,
+ sseMOVDQtoRM = 0x7F,
+ sseMASKMOVDQU = 0xF7,
+ sseLDDQU = 0xF0,
+ sseSHUF = 0xC6,
+ sseMOVNTDQ = 0xE7,
+ sseMOVNTP = 0x2B,
+};
+
+enum class NormalOp
+{
+ ADD,
+ ADC,
+ SUB,
+ SBB,
+ AND,
+ OR,
+ XOR,
+ MOV,
+ TEST,
+ CMP,
+ XCHG,
+};
+
+enum class FloatOp
+{
+ LD = 0,
+ ST = 2,
+ STP = 3,
+ LD80 = 5,
+ STP80 = 7,
+
+ Invalid = -1,
+};
+
+void XEmitter::SetCodePtr(u8* ptr)
+{
+ code = ptr;
+}
+
+const u8* XEmitter::GetCodePtr() const
+{
+ return code;
+}
+
+u8* XEmitter::GetWritableCodePtr()
+{
+ return code;
+}
+
+void XEmitter::Write8(u8 value)
+{
+ *code++ = value;
+}
+
+void XEmitter::Write16(u16 value)
+{
+ std::memcpy(code, &value, sizeof(u16));
+ code += sizeof(u16);
+}
+
+void XEmitter::Write32(u32 value)
+{
+ std::memcpy(code, &value, sizeof(u32));
+ code += sizeof(u32);
+}
+
+void XEmitter::Write64(u64 value)
+{
+ std::memcpy(code, &value, sizeof(u64));
+ code += sizeof(u64);
+}
+
+void XEmitter::ReserveCodeSpace(int bytes)
+{
+ for (int i = 0; i < bytes; i++)
+ *code++ = 0xCC;
+}
+
+u8* XEmitter::AlignCodeTo(size_t alignment)
+{
+ ASSERT_MSG(DYNA_REC, alignment != 0 && (alignment & (alignment - 1)) == 0,
+ "Alignment must be power of two");
+ u64 c = reinterpret_cast<u64>(code) & (alignment - 1);
+ if (c)
+ ReserveCodeSpace(static_cast<int>(alignment - c));
+ return code;
+}
+
+u8* XEmitter::AlignCode4()
+{
+ return AlignCodeTo(4);
+}
+
+u8* XEmitter::AlignCode16()
+{
+ return AlignCodeTo(16);
+}
+
+u8* XEmitter::AlignCodePage()
+{
+ return AlignCodeTo(4096);
+}
+
+// This operation modifies flags; check to see the flags are locked.
+// If the flags are locked, we should immediately and loudly fail before
+// causing a subtle JIT bug.
+void XEmitter::CheckFlags()
+{
+ ASSERT_MSG(DYNA_REC, !flags_locked, "Attempt to modify flags while flags locked!");
+}
+
+void XEmitter::WriteModRM(int mod, int reg, int rm)
+{
+ Write8((u8)((mod << 6) | ((reg & 7) << 3) | (rm & 7)));
+}
+
+void XEmitter::WriteSIB(int scale, int index, int base)
+{
+ Write8((u8)((scale << 6) | ((index & 7) << 3) | (base & 7)));
+}
+
+void OpArg::WriteREX(XEmitter* emit, int opBits, int bits, int customOp) const
+{
+ if (customOp == -1)
+ customOp = operandReg;
+ u8 op = 0x40;
+ // REX.W (whether operation is a 64-bit operation)
+ if (opBits == 64)
+ op |= 8;
+ // REX.R (whether ModR/M reg field refers to R8-R15.
+ if (customOp & 8)
+ op |= 4;
+ // REX.X (whether ModR/M SIB index field refers to R8-R15)
+ if (indexReg & 8)
+ op |= 2;
+ // REX.B (whether ModR/M rm or SIB base or opcode reg field refers to R8-R15)
+ if (offsetOrBaseReg & 8)
+ op |= 1;
+ // Write REX if wr have REX bits to write, or if the operation accesses
+ // SIL, DIL, BPL, or SPL.
+ if (op != 0x40 || (scale == SCALE_NONE && bits == 8 && (offsetOrBaseReg & 0x10c) == 4) ||
+ (opBits == 8 && (customOp & 0x10c) == 4))
+ {
+ emit->Write8(op);
+ // Check the operation doesn't access AH, BH, CH, or DH.
+ DEBUG_ASSERT((offsetOrBaseReg & 0x100) == 0);
+ DEBUG_ASSERT((customOp & 0x100) == 0);
+ }
+}
+
+void OpArg::WriteVEX(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm,
+ int W) const
+{
+ int R = !(regOp1 & 8);
+ int X = !(indexReg & 8);
+ int B = !(offsetOrBaseReg & 8);
+
+ int vvvv = (regOp2 == X64Reg::INVALID_REG) ? 0xf : (regOp2 ^ 0xf);
+
+ // do we need any VEX fields that only appear in the three-byte form?
+ if (X == 1 && B == 1 && W == 0 && mmmmm == 1)
+ {
+ u8 RvvvvLpp = (R << 7) | (vvvv << 3) | (L << 2) | pp;
+ emit->Write8(0xC5);
+ emit->Write8(RvvvvLpp);
+ }
+ else
+ {
+ u8 RXBmmmmm = (R << 7) | (X << 6) | (B << 5) | mmmmm;
+ u8 WvvvvLpp = (W << 7) | (vvvv << 3) | (L << 2) | pp;
+ emit->Write8(0xC4);
+ emit->Write8(RXBmmmmm);
+ emit->Write8(WvvvvLpp);
+ }
+}
+
+void OpArg::WriteRest(XEmitter* emit, int extraBytes, X64Reg _operandReg,
+ bool warn_64bit_offset) const
+{
+ if (_operandReg == INVALID_REG)
+ _operandReg = (X64Reg)this->operandReg;
+ int mod = 0;
+ int ireg = indexReg;
+ bool SIB = false;
+ int _offsetOrBaseReg = this->offsetOrBaseReg;
+
+ if (scale == SCALE_RIP) // Also, on 32-bit, just an immediate address
+ {
+ // Oh, RIP addressing.
+ _offsetOrBaseReg = 5;
+ emit->WriteModRM(0, _operandReg, _offsetOrBaseReg);
+ // TODO : add some checks
+ u64 ripAddr = (u64)emit->GetCodePtr() + 4 + extraBytes;
+ s64 distance = (s64)offset - (s64)ripAddr;
+ ASSERT_MSG(DYNA_REC,
+ (distance < 0x80000000LL && distance >= -0x80000000LL) || !warn_64bit_offset,
+ "WriteRest: op out of range (0x%" PRIx64 " uses 0x%" PRIx64 ")", ripAddr, offset);
+ s32 offs = (s32)distance;
+ emit->Write32((u32)offs);
+ return;
+ }
+
+ if (scale == 0)
+ {
+ // Oh, no memory, Just a reg.
+ mod = 3; // 11
+ }
+ else
+ {
+ // Ah good, no scaling.
+ if (scale == SCALE_ATREG && !((_offsetOrBaseReg & 7) == 4 || (_offsetOrBaseReg & 7) == 5))
+ {
+ // Okay, we're good. No SIB necessary.
+ int ioff = (int)offset;
+ if (ioff == 0)
+ {
+ mod = 0;
+ }
+ else if (ioff < -128 || ioff > 127)
+ {
+ mod = 2; // 32-bit displacement
+ }
+ else
+ {
+ mod = 1; // 8-bit displacement
+ }
+ }
+ else if (scale >= SCALE_NOBASE_2 && scale <= SCALE_NOBASE_8)
+ {
+ SIB = true;
+ mod = 0;
+ _offsetOrBaseReg = 5;
+ }
+ else
+ {
+ if ((_offsetOrBaseReg & 7) == 4) // this would occupy the SIB encoding :(
+ {
+ // So we have to fake it with SIB encoding :(
+ SIB = true;
+ }
+
+ if (scale >= SCALE_1 && scale < SCALE_ATREG)
+ {
+ SIB = true;
+ }
+
+ if (scale == SCALE_ATREG && ((_offsetOrBaseReg & 7) == 4))
+ {
+ SIB = true;
+ ireg = _offsetOrBaseReg;
+ }
+
+ // Okay, we're fine. Just disp encoding.
+ // We need displacement. Which size?
+ int ioff = (int)(s64)offset;
+ if (ioff < -128 || ioff > 127)
+ {
+ mod = 2; // 32-bit displacement
+ }
+ else
+ {
+ mod = 1; // 8-bit displacement
+ }
+ }
+ }
+
+ // Okay. Time to do the actual writing
+ // ModRM byte:
+ int oreg = _offsetOrBaseReg;
+ if (SIB)
+ oreg = 4;
+
+ emit->WriteModRM(mod, _operandReg & 7, oreg & 7);
+
+ if (SIB)
+ {
+ // SIB byte
+ int ss;
+ switch (scale)
+ {
+ case SCALE_NONE:
+ _offsetOrBaseReg = 4;
+ ss = 0;
+ break; // RSP
+ case SCALE_1:
+ ss = 0;
+ break;
+ case SCALE_2:
+ ss = 1;
+ break;
+ case SCALE_4:
+ ss = 2;
+ break;
+ case SCALE_8:
+ ss = 3;
+ break;
+ case SCALE_NOBASE_2:
+ ss = 1;
+ break;
+ case SCALE_NOBASE_4:
+ ss = 2;
+ break;
+ case SCALE_NOBASE_8:
+ ss = 3;
+ break;
+ case SCALE_ATREG:
+ ss = 0;
+ break;
+ default:
+ ASSERT_MSG(DYNA_REC, 0, "Invalid scale for SIB byte");
+ ss = 0;
+ break;
+ }
+ emit->Write8((u8)((ss << 6) | ((ireg & 7) << 3) | (_offsetOrBaseReg & 7)));
+ }
+
+ if (mod == 1) // 8-bit disp
+ {
+ emit->Write8((u8)(s8)(s32)offset);
+ }
+ else if (mod == 2 || (scale >= SCALE_NOBASE_2 && scale <= SCALE_NOBASE_8)) // 32-bit disp
+ {
+ emit->Write32((u32)offset);
+ }
+}
+
+// W = operand extended width (1 if 64-bit)
+// R = register# upper bit
+// X = scale amnt upper bit
+// B = base register# upper bit
+void XEmitter::Rex(int w, int r, int x, int b)
+{
+ w = w ? 1 : 0;
+ r = r ? 1 : 0;
+ x = x ? 1 : 0;
+ b = b ? 1 : 0;
+ u8 rx = (u8)(0x40 | (w << 3) | (r << 2) | (x << 1) | (b));
+ if (rx != 0x40)
+ Write8(rx);
+}
+
+void XEmitter::JMP(const u8* addr, bool force5Bytes)
+{
+ u64 fn = (u64)addr;
+ if (!force5Bytes)
+ {
+ s64 distance = (s64)(fn - ((u64)code + 2));
+ ASSERT_MSG(DYNA_REC, distance >= -0x80 && distance < 0x80,
+ "Jump target too far away, needs force5Bytes = true");
+ // 8 bits will do
+ Write8(0xEB);
+ Write8((u8)(s8)distance);
+ }
+ else
+ {
+ s64 distance = (s64)(fn - ((u64)code + 5));
+
+ ASSERT_MSG(DYNA_REC, distance >= -0x80000000LL && distance < 0x80000000LL,
+ "Jump target too far away, needs indirect register");
+ Write8(0xE9);
+ Write32((u32)(s32)distance);
+ }
+}
+
+void XEmitter::JMPptr(const OpArg& arg2)
+{
+ OpArg arg = arg2;
+ if (arg.IsImm())
+ ASSERT_MSG(DYNA_REC, 0, "JMPptr - Imm argument");
+ arg.operandReg = 4;
+ arg.WriteREX(this, 0, 0);
+ Write8(0xFF);
+ arg.WriteRest(this);
+}
+
+// Can be used to trap other processors, before overwriting their code
+// not used in Dolphin
+void XEmitter::JMPself()
+{
+ Write8(0xEB);
+ Write8(0xFE);
+}
+
+void XEmitter::CALLptr(OpArg arg)
+{
+ if (arg.IsImm())
+ ASSERT_MSG(DYNA_REC, 0, "CALLptr - Imm argument");
+ arg.operandReg = 2;
+ arg.WriteREX(this, 0, 0);
+ Write8(0xFF);
+ arg.WriteRest(this);
+}
+
+void XEmitter::CALL(const void* fnptr)
+{
+ u64 distance = u64(fnptr) - (u64(code) + 5);
+ ASSERT_MSG(DYNA_REC, distance < 0x0000000080000000ULL || distance >= 0xFFFFFFFF80000000ULL,
+ "CALL out of range (%p calls %p)", code, fnptr);
+ Write8(0xE8);
+ Write32(u32(distance));
+}
+
+FixupBranch XEmitter::CALL()
+{
+ FixupBranch branch;
+ branch.type = FixupBranch::Type::Branch32Bit;
+ branch.ptr = code + 5;
+ Write8(0xE8);
+ Write32(0);
+ return branch;
+}
+
+FixupBranch XEmitter::J(bool force5bytes)
+{
+ FixupBranch branch;
+ branch.type = force5bytes ? FixupBranch::Type::Branch32Bit : FixupBranch::Type::Branch8Bit;
+ branch.ptr = code + (force5bytes ? 5 : 2);
+ if (!force5bytes)
+ {
+ // 8 bits will do
+ Write8(0xEB);
+ Write8(0);
+ }
+ else
+ {
+ Write8(0xE9);
+ Write32(0);
+ }
+ return branch;
+}
+
+FixupBranch XEmitter::J_CC(CCFlags conditionCode, bool force5bytes)
+{
+ FixupBranch branch;
+ branch.type = force5bytes ? FixupBranch::Type::Branch32Bit : FixupBranch::Type::Branch8Bit;
+ branch.ptr = code + (force5bytes ? 6 : 2);
+ if (!force5bytes)
+ {
+ // 8 bits will do
+ Write8(0x70 + conditionCode);
+ Write8(0);
+ }
+ else
+ {
+ Write8(0x0F);
+ Write8(0x80 + conditionCode);
+ Write32(0);
+ }
+ return branch;
+}
+
+void XEmitter::J_CC(CCFlags conditionCode, const u8* addr)
+{
+ u64 fn = (u64)addr;
+ s64 distance = (s64)(fn - ((u64)code + 2));
+ if (distance < -0x80 || distance >= 0x80)
+ {
+ distance = (s64)(fn - ((u64)code + 6));
+ ASSERT_MSG(DYNA_REC, distance >= -0x80000000LL && distance < 0x80000000LL,
+ "Jump target too far away, needs indirect register");
+ Write8(0x0F);
+ Write8(0x80 + conditionCode);
+ Write32((u32)(s32)distance);
+ }
+ else
+ {
+ Write8(0x70 + conditionCode);
+ Write8((u8)(s8)distance);
+ }
+}
+
+void XEmitter::SetJumpTarget(const FixupBranch& branch)
+{
+ if (branch.type == FixupBranch::Type::Branch8Bit)
+ {
+ s64 distance = (s64)(code - branch.ptr);
+ if (!(distance >= -0x80 && distance < 0x80))
+ {
+ printf("miauz\n");
+ }
+ ASSERT_MSG(DYNA_REC, distance >= -0x80 && distance < 0x80,
+ "Jump target too far away, needs force5Bytes = true");
+ branch.ptr[-1] = (u8)(s8)distance;
+ }
+ else if (branch.type == FixupBranch::Type::Branch32Bit)
+ {
+ s64 distance = (s64)(code - branch.ptr);
+ ASSERT_MSG(DYNA_REC, distance >= -0x80000000LL && distance < 0x80000000LL,
+ "Jump target too far away, needs indirect register");
+
+ s32 valid_distance = static_cast<s32>(distance);
+ std::memcpy(&branch.ptr[-4], &valid_distance, sizeof(s32));
+ }
+}
+
+// Single byte opcodes
+// There is no PUSHAD/POPAD in 64-bit mode.
+void XEmitter::INT3()
+{
+ Write8(0xCC);
+}
+void XEmitter::RET()
+{
+ Write8(0xC3);
+}
+void XEmitter::RET_FAST()
+{
+ Write8(0xF3);
+ Write8(0xC3);
+} // two-byte return (rep ret) - recommended by AMD optimization manual for the case of jumping to
+ // a ret
+
+// The first sign of decadence: optimized NOPs.
+void XEmitter::NOP(size_t size)
+{
+ DEBUG_ASSERT((int)size > 0);
+ while (true)
+ {
+ switch (size)
+ {
+ case 0:
+ return;
+ case 1:
+ Write8(0x90);
+ return;
+ case 2:
+ Write8(0x66);
+ Write8(0x90);
+ return;
+ case 3:
+ Write8(0x0F);
+ Write8(0x1F);
+ Write8(0x00);
+ return;
+ case 4:
+ Write8(0x0F);
+ Write8(0x1F);
+ Write8(0x40);
+ Write8(0x00);
+ return;
+ case 5:
+ Write8(0x0F);
+ Write8(0x1F);
+ Write8(0x44);
+ Write8(0x00);
+ Write8(0x00);
+ return;
+ case 6:
+ Write8(0x66);
+ Write8(0x0F);
+ Write8(0x1F);
+ Write8(0x44);
+ Write8(0x00);
+ Write8(0x00);
+ return;
+ case 7:
+ Write8(0x0F);
+ Write8(0x1F);
+ Write8(0x80);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ return;
+ case 8:
+ Write8(0x0F);
+ Write8(0x1F);
+ Write8(0x84);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ return;
+ case 9:
+ Write8(0x66);
+ Write8(0x0F);
+ Write8(0x1F);
+ Write8(0x84);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ return;
+ case 10:
+ Write8(0x66);
+ Write8(0x66);
+ Write8(0x0F);
+ Write8(0x1F);
+ Write8(0x84);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ return;
+ default:
+ // Even though x86 instructions are allowed to be up to 15 bytes long,
+ // AMD advises against using NOPs longer than 11 bytes because they
+ // carry a performance penalty on CPUs older than AMD family 16h.
+ Write8(0x66);
+ Write8(0x66);
+ Write8(0x66);
+ Write8(0x0F);
+ Write8(0x1F);
+ Write8(0x84);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ Write8(0x00);
+ size -= 11;
+ continue;
+ }
+ }
+}
+
+void XEmitter::PAUSE()
+{
+ Write8(0xF3);
+ NOP();
+} // use in tight spinloops for energy saving on some CPU
+void XEmitter::CLC()
+{
+ CheckFlags();
+ Write8(0xF8);
+} // clear carry
+void XEmitter::CMC()
+{
+ CheckFlags();
+ Write8(0xF5);
+} // flip carry
+void XEmitter::STC()
+{
+ CheckFlags();
+ Write8(0xF9);
+} // set carry
+
+// TODO: xchg ah, al ???
+void XEmitter::XCHG_AHAL()
+{
+ Write8(0x86);
+ Write8(0xe0);
+ // alt. 86 c4
+}
+
+// These two can not be executed on early Intel 64-bit CPU:s, only on AMD!
+void XEmitter::LAHF()
+{
+ Write8(0x9F);
+}
+void XEmitter::SAHF()
+{
+ CheckFlags();
+ Write8(0x9E);
+}
+
+void XEmitter::PUSHF()
+{
+ Write8(0x9C);
+}
+void XEmitter::POPF()
+{
+ CheckFlags();
+ Write8(0x9D);
+}
+
+void XEmitter::LFENCE()
+{
+ Write8(0x0F);
+ Write8(0xAE);
+ Write8(0xE8);
+}
+void XEmitter::MFENCE()
+{
+ Write8(0x0F);
+ Write8(0xAE);
+ Write8(0xF0);
+}
+void XEmitter::SFENCE()
+{
+ Write8(0x0F);
+ Write8(0xAE);
+ Write8(0xF8);
+}
+
+void XEmitter::WriteSimple1Byte(int bits, u8 byte, X64Reg reg)
+{
+ if (bits == 16)
+ Write8(0x66);
+ Rex(bits == 64, 0, 0, (int)reg >> 3);
+ Write8(byte + ((int)reg & 7));
+}
+
+void XEmitter::WriteSimple2Byte(int bits, u8 byte1, u8 byte2, X64Reg reg)
+{
+ if (bits == 16)
+ Write8(0x66);
+ Rex(bits == 64, 0, 0, (int)reg >> 3);
+ Write8(byte1);
+ Write8(byte2 + ((int)reg & 7));
+}
+
+void XEmitter::CWD(int bits)
+{
+ if (bits == 16)
+ Write8(0x66);
+ Rex(bits == 64, 0, 0, 0);
+ Write8(0x99);
+}
+
+void XEmitter::CBW(int bits)
+{
+ if (bits == 8)
+ Write8(0x66);
+ Rex(bits == 32, 0, 0, 0);
+ Write8(0x98);
+}
+
+// Simple opcodes
+
+// push/pop do not need wide to be 64-bit
+void XEmitter::PUSH(X64Reg reg)
+{
+ WriteSimple1Byte(32, 0x50, reg);
+}
+void XEmitter::POP(X64Reg reg)
+{
+ WriteSimple1Byte(32, 0x58, reg);
+}
+
+void XEmitter::PUSH(int bits, const OpArg& reg)
+{
+ if (reg.IsSimpleReg())
+ PUSH(reg.GetSimpleReg());
+ else if (reg.IsImm())
+ {
+ switch (reg.GetImmBits())
+ {
+ case 8:
+ Write8(0x6A);
+ Write8((u8)(s8)reg.offset);
+ break;
+ case 16:
+ Write8(0x66);
+ Write8(0x68);
+ Write16((u16)(s16)(s32)reg.offset);
+ break;
+ case 32:
+ Write8(0x68);
+ Write32((u32)reg.offset);
+ break;
+ default:
+ ASSERT_MSG(DYNA_REC, 0, "PUSH - Bad imm bits");
+ break;
+ }
+ }
+ else
+ {
+ if (bits == 16)
+ Write8(0x66);
+ reg.WriteREX(this, bits, bits);
+ Write8(0xFF);
+ reg.WriteRest(this, 0, (X64Reg)6);
+ }
+}
+
+void XEmitter::POP(int /*bits*/, const OpArg& reg)
+{
+ if (reg.IsSimpleReg())
+ POP(reg.GetSimpleReg());
+ else
+ ASSERT_MSG(DYNA_REC, 0, "POP - Unsupported encoding");
+}
+
+void XEmitter::BSWAP(int bits, X64Reg reg)
+{
+ if (bits >= 32)
+ {
+ WriteSimple2Byte(bits, 0x0F, 0xC8, reg);
+ }
+ else if (bits == 16)
+ {
+ ROL(16, R(reg), Imm8(8));
+ }
+ else if (bits == 8)
+ {
+ // Do nothing - can't bswap a single byte...
+ }
+ else
+ {
+ ASSERT_MSG(DYNA_REC, 0, "BSWAP - Wrong number of bits");
+ }
+}
+
+// Undefined opcode - reserved
+// If we ever need a way to always cause a non-breakpoint hard exception...
+void XEmitter::UD2()
+{
+ Write8(0x0F);
+ Write8(0x0B);
+}
+
+void XEmitter::PREFETCH(PrefetchLevel level, OpArg arg)
+{
+ ASSERT_MSG(DYNA_REC, !arg.IsImm(), "PREFETCH - Imm argument");
+ arg.operandReg = (u8)level;
+ arg.WriteREX(this, 0, 0);
+ Write8(0x0F);
+ Write8(0x18);
+ arg.WriteRest(this);
+}
+
+void XEmitter::SETcc(CCFlags flag, OpArg dest)
+{
+ ASSERT_MSG(DYNA_REC, !dest.IsImm(), "SETcc - Imm argument");
+ dest.operandReg = 0;
+ dest.WriteREX(this, 0, 8);
+ Write8(0x0F);
+ Write8(0x90 + (u8)flag);
+ dest.WriteRest(this);
+}
+
+void XEmitter::CMOVcc(int bits, X64Reg dest, OpArg src, CCFlags flag)
+{
+ ASSERT_MSG(DYNA_REC, !src.IsImm(), "CMOVcc - Imm argument");
+ ASSERT_MSG(DYNA_REC, bits != 8, "CMOVcc - 8 bits unsupported");
+ if (bits == 16)
+ Write8(0x66);
+ src.operandReg = dest;
+ src.WriteREX(this, bits, bits);
+ Write8(0x0F);
+ Write8(0x40 + (u8)flag);
+ src.WriteRest(this);
+}
+
+void XEmitter::WriteMulDivType(int bits, OpArg src, int ext)
+{
+ ASSERT_MSG(DYNA_REC, !src.IsImm(), "WriteMulDivType - Imm argument");
+ CheckFlags();
+ src.operandReg = ext;
+ if (bits == 16)
+ Write8(0x66);
+ src.WriteREX(this, bits, bits, 0);
+ if (bits == 8)
+ {
+ Write8(0xF6);
+ }
+ else
+ {
+ Write8(0xF7);
+ }
+ src.WriteRest(this);
+}
+
+void XEmitter::MUL(int bits, const OpArg& src)
+{
+ WriteMulDivType(bits, src, 4);
+}
+void XEmitter::DIV(int bits, const OpArg& src)
+{
+ WriteMulDivType(bits, src, 6);
+}
+void XEmitter::IMUL(int bits, const OpArg& src)
+{
+ WriteMulDivType(bits, src, 5);
+}
+void XEmitter::IDIV(int bits, const OpArg& src)
+{
+ WriteMulDivType(bits, src, 7);
+}
+void XEmitter::NEG(int bits, const OpArg& src)
+{
+ WriteMulDivType(bits, src, 3);
+}
+void XEmitter::NOT(int bits, const OpArg& src)
+{
+ WriteMulDivType(bits, src, 2);
+}
+
+void XEmitter::WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2, bool rep)
+{
+ ASSERT_MSG(DYNA_REC, !src.IsImm(), "WriteBitSearchType - Imm argument");
+ CheckFlags();
+ src.operandReg = (u8)dest;
+ if (bits == 16)
+ Write8(0x66);
+ if (rep)
+ Write8(0xF3);
+ src.WriteREX(this, bits, bits);
+ Write8(0x0F);
+ Write8(byte2);
+ src.WriteRest(this);
+}
+
+void XEmitter::MOVNTI(int bits, const OpArg& dest, X64Reg src)
+{
+ if (bits <= 16)
+ ASSERT_MSG(DYNA_REC, 0, "MOVNTI - bits<=16");
+ WriteBitSearchType(bits, src, dest, 0xC3);
+}
+
+void XEmitter::BSF(int bits, X64Reg dest, const OpArg& src)
+{
+ WriteBitSearchType(bits, dest, src, 0xBC);
+} // Bottom bit to top bit
+void XEmitter::BSR(int bits, X64Reg dest, const OpArg& src)
+{
+ WriteBitSearchType(bits, dest, src, 0xBD);
+} // Top bit to bottom bit
+
+void XEmitter::TZCNT(int bits, X64Reg dest, const OpArg& src)
+{
+ CheckFlags();
+ if (!cpu_info.bBMI1)
+ PanicAlert("Trying to use BMI1 on a system that doesn't support it. Bad programmer.");
+ WriteBitSearchType(bits, dest, src, 0xBC, true);
+}
+void XEmitter::LZCNT(int bits, X64Reg dest, const OpArg& src)
+{
+ CheckFlags();
+ if (!cpu_info.bLZCNT)
+ PanicAlert("Trying to use LZCNT on a system that doesn't support it. Bad programmer.");
+ WriteBitSearchType(bits, dest, src, 0xBD, true);
+}
+
+void XEmitter::MOVSX(int dbits, int sbits, X64Reg dest, OpArg src)
+{
+ ASSERT_MSG(DYNA_REC, !src.IsImm(), "MOVSX - Imm argument");
+ if (dbits == sbits)
+ {
+ MOV(dbits, R(dest), src);
+ return;
+ }
+ src.operandReg = (u8)dest;
+ if (dbits == 16)
+ Write8(0x66);
+ src.WriteREX(this, dbits, sbits);
+ if (sbits == 8)
+ {
+ Write8(0x0F);
+ Write8(0xBE);
+ }
+ else if (sbits == 16)
+ {
+ Write8(0x0F);
+ Write8(0xBF);
+ }
+ else if (sbits == 32 && dbits == 64)
+ {
+ Write8(0x63);
+ }
+ else
+ {
+ Crash();
+ }
+ src.WriteRest(this);
+}
+
+void XEmitter::MOVZX(int dbits, int sbits, X64Reg dest, OpArg src)
+{
+ ASSERT_MSG(DYNA_REC, !src.IsImm(), "MOVZX - Imm argument");
+ if (dbits == sbits)
+ {
+ MOV(dbits, R(dest), src);
+ return;
+ }
+ src.operandReg = (u8)dest;
+ if (dbits == 16)
+ Write8(0x66);
+ // the 32bit result is automatically zero extended to 64bit
+ src.WriteREX(this, dbits == 64 ? 32 : dbits, sbits);
+ if (sbits == 8)
+ {
+ Write8(0x0F);
+ Write8(0xB6);
+ }
+ else if (sbits == 16)
+ {
+ Write8(0x0F);
+ Write8(0xB7);
+ }
+ else if (sbits == 32 && dbits == 64)
+ {
+ Write8(0x8B);
+ }
+ else
+ {
+ ASSERT_MSG(DYNA_REC, 0, "MOVZX - Invalid size");
+ }
+ src.WriteRest(this);
+}
+
+void XEmitter::WriteMOVBE(int bits, u8 op, X64Reg reg, const OpArg& arg)
+{
+ ASSERT_MSG(DYNA_REC, cpu_info.bMOVBE, "Generating MOVBE on a system that does not support it.");
+ if (bits == 8)
+ {
+ MOV(8, op & 1 ? arg : R(reg), op & 1 ? R(reg) : arg);
+ return;
+ }
+ if (bits == 16)
+ Write8(0x66);
+ ASSERT_MSG(DYNA_REC, !arg.IsSimpleReg() && !arg.IsImm(), "MOVBE: need r<-m or m<-r!");
+ arg.WriteREX(this, bits, bits, reg);
+ Write8(0x0F);
+ Write8(0x38);
+ Write8(op);
+ arg.WriteRest(this, 0, reg);
+}
+void XEmitter::MOVBE(int bits, X64Reg dest, const OpArg& src)
+{
+ WriteMOVBE(bits, 0xF0, dest, src);
+}
+void XEmitter::MOVBE(int bits, const OpArg& dest, X64Reg src)
+{
+ WriteMOVBE(bits, 0xF1, src, dest);
+}
+
+void XEmitter::LoadAndSwap(int size, X64Reg dst, const OpArg& src, bool sign_extend, MovInfo* info)
+{
+ if (info)
+ {
+ info->address = GetWritableCodePtr();
+ info->nonAtomicSwapStore = false;
+ }
+
+ switch (size)
+ {
+ case 8:
+ if (sign_extend)
+ MOVSX(32, 8, dst, src);
+ else
+ MOVZX(32, 8, dst, src);
+ break;
+ case 16:
+ MOVZX(32, 16, dst, src);
+ if (sign_extend)
+ {
+ BSWAP(32, dst);
+ SAR(32, R(dst), Imm8(16));
+ }
+ else
+ {
+ ROL(16, R(dst), Imm8(8));
+ }
+ break;
+ case 32:
+ case 64:
+ if (cpu_info.bMOVBE)
+ {
+ MOVBE(size, dst, src);
+ }
+ else
+ {
+ MOV(size, R(dst), src);
+ BSWAP(size, dst);
+ }
+ break;
+ }
+}
+
+void XEmitter::SwapAndStore(int size, const OpArg& dst, X64Reg src, MovInfo* info)
+{
+ if (cpu_info.bMOVBE)
+ {
+ if (info)
+ {
+ info->address = GetWritableCodePtr();
+ info->nonAtomicSwapStore = false;
+ }
+ MOVBE(size, dst, src);
+ }
+ else
+ {
+ BSWAP(size, src);
+ if (info)
+ {
+ info->address = GetWritableCodePtr();
+ info->nonAtomicSwapStore = true;
+ info->nonAtomicSwapStoreSrc = src;
+ }
+ MOV(size, dst, R(src));
+ }
+}
+
+void XEmitter::LEA(int bits, X64Reg dest, OpArg src)
+{
+ ASSERT_MSG(DYNA_REC, !src.IsImm(), "LEA - Imm argument");
+ src.operandReg = (u8)dest;
+ if (bits == 16)
+ Write8(0x66); // TODO: performance warning
+ src.WriteREX(this, bits, bits);
+ Write8(0x8D);
+ src.WriteRest(this, 0, INVALID_REG, bits == 64);
+}
+
+// shift can be either imm8 or cl
+void XEmitter::WriteShift(int bits, OpArg dest, const OpArg& shift, int ext)
+{
+ CheckFlags();
+ bool writeImm = false;
+ if (dest.IsImm())
+ {
+ ASSERT_MSG(DYNA_REC, 0, "WriteShift - can't shift imms");
+ }
+ if ((shift.IsSimpleReg() && shift.GetSimpleReg() != ECX) ||
+ (shift.IsImm() && shift.GetImmBits() != 8))
+ {
+ ASSERT_MSG(DYNA_REC, 0, "WriteShift - illegal argument");
+ }
+ dest.operandReg = ext;
+ if (bits == 16)
+ Write8(0x66);
+ dest.WriteREX(this, bits, bits, 0);
+ if (shift.GetImmBits() == 8)
+ {
+ // ok an imm
+ u8 imm = (u8)shift.offset;
+ if (imm == 1)
+ {
+ Write8(bits == 8 ? 0xD0 : 0xD1);
+ }
+ else
+ {
+ writeImm = true;
+ Write8(bits == 8 ? 0xC0 : 0xC1);
+ }
+ }
+ else
+ {
+ Write8(bits == 8 ? 0xD2 : 0xD3);
+ }
+ dest.WriteRest(this, writeImm ? 1 : 0);
+ if (writeImm)
+ Write8((u8)shift.offset);
+}
+
+// large rotates and shift are slower on Intel than AMD
+// Intel likes to rotate by 1, and the op is smaller too
+void XEmitter::ROL(int bits, const OpArg& dest, const OpArg& shift)
+{
+ WriteShift(bits, dest, shift, 0);
+}
+void XEmitter::ROR_(int bits, const OpArg& dest, const OpArg& shift)
+{
+ WriteShift(bits, dest, shift, 1);
+}
+void XEmitter::RCL(int bits, const OpArg& dest, const OpArg& shift)
+{
+ WriteShift(bits, dest, shift, 2);
+}
+void XEmitter::RCR(int bits, const OpArg& dest, const OpArg& shift)
+{
+ WriteShift(bits, dest, shift, 3);
+}
+void XEmitter::SHL(int bits, const OpArg& dest, const OpArg& shift)
+{
+ WriteShift(bits, dest, shift, 4);
+}
+void XEmitter::SHR(int bits, const OpArg& dest, const OpArg& shift)
+{
+ WriteShift(bits, dest, shift, 5);
+}
+void XEmitter::SAR(int bits, const OpArg& dest, const OpArg& shift)
+{
+ WriteShift(bits, dest, shift, 7);
+}
+
+// index can be either imm8 or register, don't use memory destination because it's slow
+void XEmitter::WriteBitTest(int bits, const OpArg& dest, const OpArg& index, int ext)
+{
+ CheckFlags();
+ if (dest.IsImm())
+ {
+ ASSERT_MSG(DYNA_REC, 0, "WriteBitTest - can't test imms");
+ }
+ if ((index.IsImm() && index.GetImmBits() != 8))
+ {
+ ASSERT_MSG(DYNA_REC, 0, "WriteBitTest - illegal argument");
+ }
+ if (bits == 16)
+ Write8(0x66);
+ if (index.IsImm())
+ {
+ dest.WriteREX(this, bits, bits);
+ Write8(0x0F);
+ Write8(0xBA);
+ dest.WriteRest(this, 1, (X64Reg)ext);
+ Write8((u8)index.offset);
+ }
+ else
+ {
+ X64Reg operand = index.GetSimpleReg();
+ dest.WriteREX(this, bits, bits, operand);
+ Write8(0x0F);
+ Write8(0x83 + 8 * ext);
+ dest.WriteRest(this, 1, operand);
+ }
+}
+
+void XEmitter::BT(int bits, const OpArg& dest, const OpArg& index)
+{
+ WriteBitTest(bits, dest, index, 4);
+}
+void XEmitter::BTS(int bits, const OpArg& dest, const OpArg& index)
+{
+ WriteBitTest(bits, dest, index, 5);
+}
+void XEmitter::BTR(int bits, const OpArg& dest, const OpArg& index)
+{
+ WriteBitTest(bits, dest, index, 6);
+}
+void XEmitter::BTC(int bits, const OpArg& dest, const OpArg& index)
+{
+ WriteBitTest(bits, dest, index, 7);
+}
+
+// shift can be either imm8 or cl
+void XEmitter::SHRD(int bits, const OpArg& dest, const OpArg& src, const OpArg& shift)
+{
+ CheckFlags();
+ if (dest.IsImm())
+ {
+ ASSERT_MSG(DYNA_REC, 0, "SHRD - can't use imms as destination");
+ }
+ if (!src.IsSimpleReg())
+ {
+ ASSERT_MSG(DYNA_REC, 0, "SHRD - must use simple register as source");
+ }
+ if ((shift.IsSimpleReg() && shift.GetSimpleReg() != ECX) ||
+ (shift.IsImm() && shift.GetImmBits() != 8))
+ {
+ ASSERT_MSG(DYNA_REC, 0, "SHRD - illegal shift");
+ }
+ if (bits == 16)
+ Write8(0x66);
+ X64Reg operand = src.GetSimpleReg();
+ dest.WriteREX(this, bits, bits, operand);
+ if (shift.GetImmBits() == 8)
+ {
+ Write8(0x0F);
+ Write8(0xAC);
+ dest.WriteRest(this, 1, operand);
+ Write8((u8)shift.offset);
+ }
+ else
+ {
+ Write8(0x0F);
+ Write8(0xAD);
+ dest.WriteRest(this, 0, operand);
+ }
+}
+
+void XEmitter::SHLD(int bits, const OpArg& dest, const OpArg& src, const OpArg& shift)
+{
+ CheckFlags();
+ if (dest.IsImm())
+ {
+ ASSERT_MSG(DYNA_REC, 0, "SHLD - can't use imms as destination");
+ }
+ if (!src.IsSimpleReg())
+ {
+ ASSERT_MSG(DYNA_REC, 0, "SHLD - must use simple register as source");
+ }
+ if ((shift.IsSimpleReg() && shift.GetSimpleReg() != ECX) ||
+ (shift.IsImm() && shift.GetImmBits() != 8))
+ {
+ ASSERT_MSG(DYNA_REC, 0, "SHLD - illegal shift");
+ }
+ if (bits == 16)
+ Write8(0x66);
+ X64Reg operand = src.GetSimpleReg();
+ dest.WriteREX(this, bits, bits, operand);
+ if (shift.GetImmBits() == 8)
+ {
+ Write8(0x0F);
+ Write8(0xA4);
+ dest.WriteRest(this, 1, operand);
+ Write8((u8)shift.offset);
+ }
+ else
+ {
+ Write8(0x0F);
+ Write8(0xA5);
+ dest.WriteRest(this, 0, operand);
+ }
+}
+
+void OpArg::WriteSingleByteOp(XEmitter* emit, u8 op, X64Reg _operandReg, int bits)
+{
+ if (bits == 16)
+ emit->Write8(0x66);
+
+ this->operandReg = (u8)_operandReg;
+ WriteREX(emit, bits, bits);
+ emit->Write8(op);
+ WriteRest(emit);
+}
+
+// operand can either be immediate or register
+void OpArg::WriteNormalOp(XEmitter* emit, bool toRM, NormalOp op, const OpArg& operand,
+ int bits) const
+{
+ X64Reg _operandReg;
+ if (IsImm())
+ {
+ ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - Imm argument, wrong order");
+ }
+
+ if (bits == 16)
+ emit->Write8(0x66);
+
+ int immToWrite = 0;
+ const NormalOpDef& op_def = normalops[static_cast<int>(op)];
+
+ if (operand.IsImm())
+ {
+ WriteREX(emit, bits, bits);
+
+ if (!toRM)
+ {
+ ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - Writing to Imm (!toRM)");
+ }
+
+ if (operand.scale == SCALE_IMM8 && bits == 8)
+ {
+ // op al, imm8
+ if (!scale && offsetOrBaseReg == AL && op_def.eaximm8 != 0xCC)
+ {
+ emit->Write8(op_def.eaximm8);
+ emit->Write8((u8)operand.offset);
+ return;
+ }
+ // mov reg, imm8
+ if (!scale && op == NormalOp::MOV)
+ {
+ emit->Write8(0xB0 + (offsetOrBaseReg & 7));
+ emit->Write8((u8)operand.offset);
+ return;
+ }
+ // op r/m8, imm8
+ emit->Write8(op_def.imm8);
+ immToWrite = 8;
+ }
+ else if ((operand.scale == SCALE_IMM16 && bits == 16) ||
+ (operand.scale == SCALE_IMM32 && bits == 32) ||
+ (operand.scale == SCALE_IMM32 && bits == 64))
+ {
+ // Try to save immediate size if we can, but first check to see
+ // if the instruction supports simm8.
+ // op r/m, imm8
+ if (op_def.simm8 != 0xCC &&
+ ((operand.scale == SCALE_IMM16 && (s16)operand.offset == (s8)operand.offset) ||
+ (operand.scale == SCALE_IMM32 && (s32)operand.offset == (s8)operand.offset)))
+ {
+ emit->Write8(op_def.simm8);
+ immToWrite = 8;
+ }
+ else
+ {
+ // mov reg, imm
+ if (!scale && op == NormalOp::MOV && bits != 64)
+ {
+ emit->Write8(0xB8 + (offsetOrBaseReg & 7));
+ if (bits == 16)
+ emit->Write16((u16)operand.offset);
+ else
+ emit->Write32((u32)operand.offset);
+ return;
+ }
+ // op eax, imm
+ if (!scale && offsetOrBaseReg == EAX && op_def.eaximm32 != 0xCC)
+ {
+ emit->Write8(op_def.eaximm32);
+ if (bits == 16)
+ emit->Write16((u16)operand.offset);
+ else
+ emit->Write32((u32)operand.offset);
+ return;
+ }
+ // op r/m, imm
+ emit->Write8(op_def.imm32);
+ immToWrite = bits == 16 ? 16 : 32;
+ }
+ }
+ else if ((operand.scale == SCALE_IMM8 && bits == 16) ||
+ (operand.scale == SCALE_IMM8 && bits == 32) ||
+ (operand.scale == SCALE_IMM8 && bits == 64))
+ {
+ // op r/m, imm8
+ emit->Write8(op_def.simm8);
+ immToWrite = 8;
+ }
+ else if (operand.scale == SCALE_IMM64 && bits == 64)
+ {
+ if (scale)
+ {
+ ASSERT_MSG(DYNA_REC, 0,
+ "WriteNormalOp - MOV with 64-bit imm requires register destination");
+ }
+ // mov reg64, imm64
+ else if (op == NormalOp::MOV)
+ {
+ // movabs reg64, imm64 (10 bytes)
+ if (static_cast<s64>(operand.offset) != static_cast<s32>(operand.offset))
+ {
+ emit->Write8(0xB8 + (offsetOrBaseReg & 7));
+ emit->Write64(operand.offset);
+ return;
+ }
+ // mov reg64, simm32 (7 bytes)
+ emit->Write8(op_def.imm32);
+ immToWrite = 32;
+ }
+ else
+ {
+ ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - Only MOV can take 64-bit imm");
+ }
+ }
+ else
+ {
+ ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - Unhandled case %d %d", operand.scale, bits);
+ }
+
+ // pass extension in REG of ModRM
+ _operandReg = static_cast<X64Reg>(op_def.ext);
+ }
+ else
+ {
+ _operandReg = (X64Reg)operand.offsetOrBaseReg;
+ WriteREX(emit, bits, bits, _operandReg);
+ // op r/m, reg
+ if (toRM)
+ {
+ emit->Write8(bits == 8 ? op_def.toRm8 : op_def.toRm32);
+ }
+ // op reg, r/m
+ else
+ {
+ emit->Write8(bits == 8 ? op_def.fromRm8 : op_def.fromRm32);
+ }
+ }
+ WriteRest(emit, immToWrite >> 3, _operandReg);
+ switch (immToWrite)
+ {
+ case 0:
+ break;
+ case 8:
+ emit->Write8((u8)operand.offset);
+ break;
+ case 16:
+ emit->Write16((u16)operand.offset);
+ break;
+ case 32:
+ emit->Write32((u32)operand.offset);
+ break;
+ default:
+ ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - Unhandled case");
+ }
+}
+
+void XEmitter::WriteNormalOp(int bits, NormalOp op, const OpArg& a1, const OpArg& a2)
+{
+ if (a1.IsImm())
+ {
+ // Booh! Can't write to an imm
+ ASSERT_MSG(DYNA_REC, 0, "WriteNormalOp - a1 cannot be imm");
+ return;
+ }
+ if (a2.IsImm())
+ {
+ a1.WriteNormalOp(this, true, op, a2, bits);
+ }
+ else
+ {
+ if (a1.IsSimpleReg())
+ {
+ a2.WriteNormalOp(this, false, op, a1, bits);
+ }
+ else
+ {
+ ASSERT_MSG(DYNA_REC, a2.IsSimpleReg() || a2.IsImm(),
+ "WriteNormalOp - a1 and a2 cannot both be memory");
+ a1.WriteNormalOp(this, true, op, a2, bits);
+ }
+ }
+}
+
+void XEmitter::ADD(int bits, const OpArg& a1, const OpArg& a2)
+{
+ CheckFlags();
+ WriteNormalOp(bits, NormalOp::ADD, a1, a2);
+}
+void XEmitter::ADC(int bits, const OpArg& a1, const OpArg& a2)
+{
+ CheckFlags();
+ WriteNormalOp(bits, NormalOp::ADC, a1, a2);
+}
+void XEmitter::SUB(int bits, const OpArg& a1, const OpArg& a2)
+{
+ CheckFlags();
+ WriteNormalOp(bits, NormalOp::SUB, a1, a2);
+}
+void XEmitter::SBB(int bits, const OpArg& a1, const OpArg& a2)
+{
+ CheckFlags();
+ WriteNormalOp(bits, NormalOp::SBB, a1, a2);
+}
+void XEmitter::AND(int bits, const OpArg& a1, const OpArg& a2)
+{
+ CheckFlags();
+ WriteNormalOp(bits, NormalOp::AND, a1, a2);
+}
+void XEmitter::OR(int bits, const OpArg& a1, const OpArg& a2)
+{
+ CheckFlags();
+ WriteNormalOp(bits, NormalOp::OR, a1, a2);
+}
+void XEmitter::XOR(int bits, const OpArg& a1, const OpArg& a2)
+{
+ CheckFlags();
+ WriteNormalOp(bits, NormalOp::XOR, a1, a2);
+}
+void XEmitter::MOV(int bits, const OpArg& a1, const OpArg& a2)
+{
+ if (bits == 64 && a1.IsSimpleReg() && a2.scale == SCALE_IMM64 &&
+ a2.offset == static_cast<u32>(a2.offset))
+ {
+ WriteNormalOp(32, NormalOp::MOV, a1, a2.AsImm32());
+ return;
+ }
+ if (a1.IsSimpleReg() && a2.IsSimpleReg() && a1.GetSimpleReg() == a2.GetSimpleReg())
+ {
+ ERROR_LOG(DYNA_REC, "Redundant MOV @ %p - bug in JIT?", code);
+ }
+ WriteNormalOp(bits, NormalOp::MOV, a1, a2);
+}
+void XEmitter::TEST(int bits, const OpArg& a1, const OpArg& a2)
+{
+ CheckFlags();
+ WriteNormalOp(bits, NormalOp::TEST, a1, a2);
+}
+void XEmitter::CMP(int bits, const OpArg& a1, const OpArg& a2)
+{
+ CheckFlags();
+ WriteNormalOp(bits, NormalOp::CMP, a1, a2);
+}
+void XEmitter::XCHG(int bits, const OpArg& a1, const OpArg& a2)
+{
+ WriteNormalOp(bits, NormalOp::XCHG, a1, a2);
+}
+void XEmitter::CMP_or_TEST(int bits, const OpArg& a1, const OpArg& a2)
+{
+ CheckFlags();
+ if (a1.IsSimpleReg() && a2.IsZero()) // turn 'CMP reg, 0' into shorter 'TEST reg, reg'
+ {
+ WriteNormalOp(bits, NormalOp::TEST, a1, a1);
+ }
+ else
+ {
+ WriteNormalOp(bits, NormalOp::CMP, a1, a2);
+ }
+}
+
+void XEmitter::MOV_sum(int bits, X64Reg dest, const OpArg& a1, const OpArg& a2)
+{
+ // This stomps on flags, so ensure they aren't locked
+ DEBUG_ASSERT(!flags_locked);
+
+ // Zero shortcuts (note that this can generate no code in the case where a1 == dest && a2 == zero
+ // or a2 == dest && a1 == zero)
+ if (a1.IsZero())
+ {
+ if (!a2.IsSimpleReg() || a2.GetSimpleReg() != dest)
+ {
+ MOV(bits, R(dest), a2);
+ }
+ return;
+ }
+ if (a2.IsZero())
+ {
+ if (!a1.IsSimpleReg() || a1.GetSimpleReg() != dest)
+ {
+ MOV(bits, R(dest), a1);
+ }
+ return;
+ }
+
+ // If dest == a1 or dest == a2 we can simplify this
+ if (a1.IsSimpleReg() && a1.GetSimpleReg() == dest)
+ {
+ ADD(bits, R(dest), a2);
+ return;
+ }
+
+ if (a2.IsSimpleReg() && a2.GetSimpleReg() == dest)
+ {
+ ADD(bits, R(dest), a1);
+ return;
+ }
+
+ // TODO: 32-bit optimizations may apply to other bit sizes (confirm)
+ if (bits == 32)
+ {
+ if (a1.IsImm() && a2.IsImm())
+ {
+ MOV(32, R(dest), Imm32(a1.Imm32() + a2.Imm32()));
+ return;
+ }
+
+ if (a1.IsSimpleReg() && a2.IsSimpleReg())
+ {
+ LEA(32, dest, MRegSum(a1.GetSimpleReg(), a2.GetSimpleReg()));
+ return;
+ }
+
+ if (a1.IsSimpleReg() && a2.IsImm())
+ {
+ LEA(32, dest, MDisp(a1.GetSimpleReg(), a2.Imm32()));
+ return;
+ }
+
+ if (a1.IsImm() && a2.IsSimpleReg())
+ {
+ LEA(32, dest, MDisp(a2.GetSimpleReg(), a1.Imm32()));
+ return;
+ }
+ }
+
+ // Fallback
+ MOV(bits, R(dest), a1);
+ ADD(bits, R(dest), a2);
+}
+
+void XEmitter::IMUL(int bits, X64Reg regOp, const OpArg& a1, const OpArg& a2)
+{
+ CheckFlags();
+ if (bits == 8)
+ {
+ ASSERT_MSG(DYNA_REC, 0, "IMUL - illegal bit size!");
+ return;
+ }
+
+ if (a1.IsImm())
+ {
+ ASSERT_MSG(DYNA_REC, 0, "IMUL - second arg cannot be imm!");
+ return;
+ }
+
+ if (!a2.IsImm())
+ {
+ ASSERT_MSG(DYNA_REC, 0, "IMUL - third arg must be imm!");
+ return;
+ }
+
+ if (bits == 16)
+ Write8(0x66);
+ a1.WriteREX(this, bits, bits, regOp);
+
+ if (a2.GetImmBits() == 8 || (a2.GetImmBits() == 16 && (s8)a2.offset == (s16)a2.offset) ||
+ (a2.GetImmBits() == 32 && (s8)a2.offset == (s32)a2.offset))
+ {
+ Write8(0x6B);
+ a1.WriteRest(this, 1, regOp);
+ Write8((u8)a2.offset);
+ }
+ else
+ {
+ Write8(0x69);
+ if (a2.GetImmBits() == 16 && bits == 16)
+ {
+ a1.WriteRest(this, 2, regOp);
+ Write16((u16)a2.offset);
+ }
+ else if (a2.GetImmBits() == 32 && (bits == 32 || bits == 64))
+ {
+ a1.WriteRest(this, 4, regOp);
+ Write32((u32)a2.offset);
+ }
+ else
+ {
+ ASSERT_MSG(DYNA_REC, 0, "IMUL - unhandled case!");
+ }
+ }
+}
+
+void XEmitter::IMUL(int bits, X64Reg regOp, const OpArg& a)
+{
+ CheckFlags();
+ if (bits == 8)
+ {
+ ASSERT_MSG(DYNA_REC, 0, "IMUL - illegal bit size!");
+ return;
+ }
+
+ if (a.IsImm())
+ {
+ IMUL(bits, regOp, R(regOp), a);
+ return;
+ }
+
+ if (bits == 16)
+ Write8(0x66);
+ a.WriteREX(this, bits, bits, regOp);
+ Write8(0x0F);
+ Write8(0xAF);
+ a.WriteRest(this, 0, regOp);
+}
+
+void XEmitter::WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes)
+{
+ if (opPrefix)
+ Write8(opPrefix);
+ arg.operandReg = regOp;
+ arg.WriteREX(this, 0, 0);
+ Write8(0x0F);
+ if (op > 0xFF)
+ Write8((op >> 8) & 0xFF);
+ Write8(op & 0xFF);
+ arg.WriteRest(this, extrabytes);
+}
+
+static int GetVEXmmmmm(u16 op)
+{
+ // Currently, only 0x38 and 0x3A are used as secondary escape byte.
+ if ((op >> 8) == 0x3A)
+ return 3;
+ else if ((op >> 8) == 0x38)
+ return 2;
+ else
+ return 1;
+}
+
+static int GetVEXpp(u8 opPrefix)
+{
+ if (opPrefix == 0x66)
+ return 1;
+ else if (opPrefix == 0xF3)
+ return 2;
+ else if (opPrefix == 0xF2)
+ return 3;
+ else
+ return 0;
+}
+
+void XEmitter::WriteVEXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+ int W, int extrabytes)
+{
+ int mmmmm = GetVEXmmmmm(op);
+ int pp = GetVEXpp(opPrefix);
+ // FIXME: we currently don't support 256-bit instructions, and "size" is not the vector size here
+ arg.WriteVEX(this, regOp1, regOp2, 0, pp, mmmmm, W);
+ Write8(op & 0xFF);
+ arg.WriteRest(this, extrabytes, regOp1);
+}
+
+void XEmitter::WriteVEXOp4(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+ X64Reg regOp3, int W)
+{
+ WriteVEXOp(opPrefix, op, regOp1, regOp2, arg, W, 1);
+ Write8((u8)regOp3 << 4);
+}
+
+void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+ int W, int extrabytes)
+{
+ if (!cpu_info.bAVX)
+ PanicAlert("Trying to use AVX on a system that doesn't support it. Bad programmer.");
+ WriteVEXOp(opPrefix, op, regOp1, regOp2, arg, W, extrabytes);
+}
+
+void XEmitter::WriteAVXOp4(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+ X64Reg regOp3, int W)
+{
+ if (!cpu_info.bAVX)
+ PanicAlert("Trying to use AVX on a system that doesn't support it. Bad programmer.");
+ WriteVEXOp4(opPrefix, op, regOp1, regOp2, arg, regOp3, W);
+}
+
+void XEmitter::WriteFMA3Op(u8 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W)
+{
+ if (!cpu_info.bFMA)
+ PanicAlert("Trying to use FMA3 on a system that doesn't support it. Computer is v. f'n madd.");
+ WriteVEXOp(0x66, 0x3800 | op, regOp1, regOp2, arg, W);
+}
+
+void XEmitter::WriteFMA4Op(u8 op, X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+ int W)
+{
+ if (!cpu_info.bFMA4)
+ PanicAlert("Trying to use FMA4 on a system that doesn't support it. Computer is v. f'n madd.");
+ WriteVEXOp4(0x66, 0x3A00 | op, dest, regOp1, arg, regOp2, W);
+}
+
+void XEmitter::WriteBMIOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2,
+ const OpArg& arg, int extrabytes)
+{
+ if (arg.IsImm())
+ PanicAlert("BMI1/2 instructions don't support immediate operands.");
+ if (size != 32 && size != 64)
+ PanicAlert("BMI1/2 instructions only support 32-bit and 64-bit modes!");
+ int W = size == 64;
+ WriteVEXOp(opPrefix, op, regOp1, regOp2, arg, W, extrabytes);
+}
+
+void XEmitter::WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2,
+ const OpArg& arg, int extrabytes)
+{
+ CheckFlags();
+ if (!cpu_info.bBMI1)
+ PanicAlert("Trying to use BMI1 on a system that doesn't support it. Bad programmer.");
+ WriteBMIOp(size, opPrefix, op, regOp1, regOp2, arg, extrabytes);
+}
+
+void XEmitter::WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2,
+ const OpArg& arg, int extrabytes)
+{
+ if (!cpu_info.bBMI2)
+ PanicAlert("Trying to use BMI2 on a system that doesn't support it. Bad programmer.");
+ WriteBMIOp(size, opPrefix, op, regOp1, regOp2, arg, extrabytes);
+}
+
+void XEmitter::MOVD_xmm(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0x6E, dest, arg, 0);
+}
+void XEmitter::MOVD_xmm(const OpArg& arg, X64Reg src)
+{
+ WriteSSEOp(0x66, 0x7E, src, arg, 0);
+}
+
+void XEmitter::MOVQ_xmm(X64Reg dest, OpArg arg)
+{
+ // Alternate encoding
+ // This does not display correctly in MSVC's debugger, it thinks it's a MOVD
+ arg.operandReg = dest;
+ Write8(0x66);
+ arg.WriteREX(this, 64, 0);
+ Write8(0x0f);
+ Write8(0x6E);
+ arg.WriteRest(this, 0);
+}
+
+void XEmitter::MOVQ_xmm(OpArg arg, X64Reg src)
+{
+ if (src > 7 || arg.IsSimpleReg())
+ {
+ // Alternate encoding
+ // This does not display correctly in MSVC's debugger, it thinks it's a MOVD
+ arg.operandReg = src;
+ Write8(0x66);
+ arg.WriteREX(this, 64, 0);
+ Write8(0x0f);
+ Write8(0x7E);
+ arg.WriteRest(this, 0);
+ }
+ else
+ {
+ arg.operandReg = src;
+ arg.WriteREX(this, 0, 0);
+ Write8(0x66);
+ Write8(0x0f);
+ Write8(0xD6);
+ arg.WriteRest(this, 0);
+ }
+}
+
+void XEmitter::WriteMXCSR(OpArg arg, int ext)
+{
+ if (arg.IsImm() || arg.IsSimpleReg())
+ ASSERT_MSG(DYNA_REC, 0, "MXCSR - invalid operand");
+
+ arg.operandReg = ext;
+ arg.WriteREX(this, 0, 0);
+ Write8(0x0F);
+ Write8(0xAE);
+ arg.WriteRest(this);
+}
+
+void XEmitter::STMXCSR(const OpArg& memloc)
+{
+ WriteMXCSR(memloc, 3);
+}
+void XEmitter::LDMXCSR(const OpArg& memloc)
+{
+ WriteMXCSR(memloc, 2);
+}
+
+void XEmitter::MOVNTDQ(const OpArg& arg, X64Reg regOp)
+{
+ WriteSSEOp(0x66, sseMOVNTDQ, regOp, arg);
+}
+void XEmitter::MOVNTPS(const OpArg& arg, X64Reg regOp)
+{
+ WriteSSEOp(0x00, sseMOVNTP, regOp, arg);
+}
+void XEmitter::MOVNTPD(const OpArg& arg, X64Reg regOp)
+{
+ WriteSSEOp(0x66, sseMOVNTP, regOp, arg);
+}
+
+void XEmitter::ADDSS(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0xF3, sseADD, regOp, arg);
+}
+void XEmitter::ADDSD(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0xF2, sseADD, regOp, arg);
+}
+void XEmitter::SUBSS(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0xF3, sseSUB, regOp, arg);
+}
+void XEmitter::SUBSD(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0xF2, sseSUB, regOp, arg);
+}
+void XEmitter::CMPSS(X64Reg regOp, const OpArg& arg, u8 compare)
+{
+ WriteSSEOp(0xF3, sseCMP, regOp, arg, 1);
+ Write8(compare);
+}
+void XEmitter::CMPSD(X64Reg regOp, const OpArg& arg, u8 compare)
+{
+ WriteSSEOp(0xF2, sseCMP, regOp, arg, 1);
+ Write8(compare);
+}
+void XEmitter::MULSS(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0xF3, sseMUL, regOp, arg);
+}
+void XEmitter::MULSD(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0xF2, sseMUL, regOp, arg);
+}
+void XEmitter::DIVSS(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0xF3, sseDIV, regOp, arg);
+}
+void XEmitter::DIVSD(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0xF2, sseDIV, regOp, arg);
+}
+void XEmitter::MINSS(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0xF3, sseMIN, regOp, arg);
+}
+void XEmitter::MINSD(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0xF2, sseMIN, regOp, arg);
+}
+void XEmitter::MAXSS(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0xF3, sseMAX, regOp, arg);
+}
+void XEmitter::MAXSD(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0xF2, sseMAX, regOp, arg);
+}
+void XEmitter::SQRTSS(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0xF3, sseSQRT, regOp, arg);
+}
+void XEmitter::SQRTSD(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0xF2, sseSQRT, regOp, arg);
+}
+void XEmitter::RCPSS(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0xF3, sseRCP, regOp, arg);
+}
+void XEmitter::RSQRTSS(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0xF3, sseRSQRT, regOp, arg);
+}
+
+void XEmitter::ADDPS(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x00, sseADD, regOp, arg);
+}
+void XEmitter::ADDPD(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x66, sseADD, regOp, arg);
+}
+void XEmitter::SUBPS(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x00, sseSUB, regOp, arg);
+}
+void XEmitter::SUBPD(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x66, sseSUB, regOp, arg);
+}
+void XEmitter::CMPPS(X64Reg regOp, const OpArg& arg, u8 compare)
+{
+ WriteSSEOp(0x00, sseCMP, regOp, arg, 1);
+ Write8(compare);
+}
+void XEmitter::CMPPD(X64Reg regOp, const OpArg& arg, u8 compare)
+{
+ WriteSSEOp(0x66, sseCMP, regOp, arg, 1);
+ Write8(compare);
+}
+void XEmitter::ANDPS(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x00, sseAND, regOp, arg);
+}
+void XEmitter::ANDPD(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x66, sseAND, regOp, arg);
+}
+void XEmitter::ANDNPS(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x00, sseANDN, regOp, arg);
+}
+void XEmitter::ANDNPD(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x66, sseANDN, regOp, arg);
+}
+void XEmitter::ORPS(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x00, sseOR, regOp, arg);
+}
+void XEmitter::ORPD(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x66, sseOR, regOp, arg);
+}
+void XEmitter::XORPS(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x00, sseXOR, regOp, arg);
+}
+void XEmitter::XORPD(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x66, sseXOR, regOp, arg);
+}
+void XEmitter::MULPS(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x00, sseMUL, regOp, arg);
+}
+void XEmitter::MULPD(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x66, sseMUL, regOp, arg);
+}
+void XEmitter::DIVPS(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x00, sseDIV, regOp, arg);
+}
+void XEmitter::DIVPD(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x66, sseDIV, regOp, arg);
+}
+void XEmitter::MINPS(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x00, sseMIN, regOp, arg);
+}
+void XEmitter::MINPD(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x66, sseMIN, regOp, arg);
+}
+void XEmitter::MAXPS(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x00, sseMAX, regOp, arg);
+}
+void XEmitter::MAXPD(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x66, sseMAX, regOp, arg);
+}
+void XEmitter::SQRTPS(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x00, sseSQRT, regOp, arg);
+}
+void XEmitter::SQRTPD(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x66, sseSQRT, regOp, arg);
+}
+void XEmitter::RCPPS(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x00, sseRCP, regOp, arg);
+}
+void XEmitter::RSQRTPS(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x00, sseRSQRT, regOp, arg);
+}
+void XEmitter::SHUFPS(X64Reg regOp, const OpArg& arg, u8 shuffle)
+{
+ WriteSSEOp(0x00, sseSHUF, regOp, arg, 1);
+ Write8(shuffle);
+}
+void XEmitter::SHUFPD(X64Reg regOp, const OpArg& arg, u8 shuffle)
+{
+ WriteSSEOp(0x66, sseSHUF, regOp, arg, 1);
+ Write8(shuffle);
+}
+
+void XEmitter::COMISS(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x00, sseCOMIS, regOp, arg);
+} // weird that these should be packed
+void XEmitter::COMISD(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x66, sseCOMIS, regOp, arg);
+} // ordered
+void XEmitter::UCOMISS(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x00, sseUCOMIS, regOp, arg);
+} // unordered
+void XEmitter::UCOMISD(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x66, sseUCOMIS, regOp, arg);
+}
+
+void XEmitter::MOVAPS(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x00, sseMOVAPfromRM, regOp, arg);
+}
+void XEmitter::MOVAPD(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x66, sseMOVAPfromRM, regOp, arg);
+}
+void XEmitter::MOVAPS(const OpArg& arg, X64Reg regOp)
+{
+ WriteSSEOp(0x00, sseMOVAPtoRM, regOp, arg);
+}
+void XEmitter::MOVAPD(const OpArg& arg, X64Reg regOp)
+{
+ WriteSSEOp(0x66, sseMOVAPtoRM, regOp, arg);
+}
+
+void XEmitter::MOVUPS(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x00, sseMOVUPfromRM, regOp, arg);
+}
+void XEmitter::MOVUPD(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x66, sseMOVUPfromRM, regOp, arg);
+}
+void XEmitter::MOVUPS(const OpArg& arg, X64Reg regOp)
+{
+ WriteSSEOp(0x00, sseMOVUPtoRM, regOp, arg);
+}
+void XEmitter::MOVUPD(const OpArg& arg, X64Reg regOp)
+{
+ WriteSSEOp(0x66, sseMOVUPtoRM, regOp, arg);
+}
+
+void XEmitter::MOVDQA(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x66, sseMOVDQfromRM, regOp, arg);
+}
+void XEmitter::MOVDQA(const OpArg& arg, X64Reg regOp)
+{
+ WriteSSEOp(0x66, sseMOVDQtoRM, regOp, arg);
+}
+void XEmitter::MOVDQU(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0xF3, sseMOVDQfromRM, regOp, arg);
+}
+void XEmitter::MOVDQU(const OpArg& arg, X64Reg regOp)
+{
+ WriteSSEOp(0xF3, sseMOVDQtoRM, regOp, arg);
+}
+
+void XEmitter::MOVSS(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0xF3, sseMOVUPfromRM, regOp, arg);
+}
+void XEmitter::MOVSD(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0xF2, sseMOVUPfromRM, regOp, arg);
+}
+void XEmitter::MOVSS(const OpArg& arg, X64Reg regOp)
+{
+ WriteSSEOp(0xF3, sseMOVUPtoRM, regOp, arg);
+}
+void XEmitter::MOVSD(const OpArg& arg, X64Reg regOp)
+{
+ WriteSSEOp(0xF2, sseMOVUPtoRM, regOp, arg);
+}
+
+void XEmitter::MOVLPS(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x00, sseMOVLPfromRM, regOp, arg);
+}
+void XEmitter::MOVLPD(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x66, sseMOVLPfromRM, regOp, arg);
+}
+void XEmitter::MOVLPS(const OpArg& arg, X64Reg regOp)
+{
+ WriteSSEOp(0x00, sseMOVLPtoRM, regOp, arg);
+}
+void XEmitter::MOVLPD(const OpArg& arg, X64Reg regOp)
+{
+ WriteSSEOp(0x66, sseMOVLPtoRM, regOp, arg);
+}
+
+void XEmitter::MOVHPS(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x00, sseMOVHPfromRM, regOp, arg);
+}
+void XEmitter::MOVHPD(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x66, sseMOVHPfromRM, regOp, arg);
+}
+void XEmitter::MOVHPS(const OpArg& arg, X64Reg regOp)
+{
+ WriteSSEOp(0x00, sseMOVHPtoRM, regOp, arg);
+}
+void XEmitter::MOVHPD(const OpArg& arg, X64Reg regOp)
+{
+ WriteSSEOp(0x66, sseMOVHPtoRM, regOp, arg);
+}
+
+void XEmitter::MOVHLPS(X64Reg regOp1, X64Reg regOp2)
+{
+ WriteSSEOp(0x00, sseMOVHLPS, regOp1, R(regOp2));
+}
+void XEmitter::MOVLHPS(X64Reg regOp1, X64Reg regOp2)
+{
+ WriteSSEOp(0x00, sseMOVLHPS, regOp1, R(regOp2));
+}
+
+void XEmitter::CVTPS2PD(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x00, 0x5A, regOp, arg);
+}
+void XEmitter::CVTPD2PS(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0x5A, regOp, arg);
+}
+
+void XEmitter::CVTSD2SS(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0xF2, 0x5A, regOp, arg);
+}
+void XEmitter::CVTSS2SD(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0xF3, 0x5A, regOp, arg);
+}
+void XEmitter::CVTSD2SI(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0xF2, 0x2D, regOp, arg);
+}
+void XEmitter::CVTSS2SI(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0xF3, 0x2D, regOp, arg);
+}
+void XEmitter::CVTSI2SD(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0xF2, 0x2A, regOp, arg);
+}
+void XEmitter::CVTSI2SS(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0xF3, 0x2A, regOp, arg);
+}
+
+void XEmitter::CVTDQ2PD(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0xF3, 0xE6, regOp, arg);
+}
+void XEmitter::CVTDQ2PS(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x00, 0x5B, regOp, arg);
+}
+void XEmitter::CVTPD2DQ(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0xF2, 0xE6, regOp, arg);
+}
+void XEmitter::CVTPS2DQ(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0x5B, regOp, arg);
+}
+
+void XEmitter::CVTTSD2SI(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0xF2, 0x2C, regOp, arg);
+}
+void XEmitter::CVTTSS2SI(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0xF3, 0x2C, regOp, arg);
+}
+void XEmitter::CVTTPS2DQ(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0xF3, 0x5B, regOp, arg);
+}
+void XEmitter::CVTTPD2DQ(X64Reg regOp, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0xE6, regOp, arg);
+}
+
+void XEmitter::MASKMOVDQU(X64Reg dest, X64Reg src)
+{
+ WriteSSEOp(0x66, sseMASKMOVDQU, dest, R(src));
+}
+
+void XEmitter::MOVMSKPS(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x00, 0x50, dest, arg);
+}
+void XEmitter::MOVMSKPD(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0x50, dest, arg);
+}
+
+void XEmitter::LDDQU(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0xF2, sseLDDQU, dest, arg);
+} // For integer data only
+
+void XEmitter::UNPCKLPS(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x00, 0x14, dest, arg);
+}
+void XEmitter::UNPCKHPS(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x00, 0x15, dest, arg);
+}
+void XEmitter::UNPCKLPD(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0x14, dest, arg);
+}
+void XEmitter::UNPCKHPD(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0x15, dest, arg);
+}
+
+// Pretty much every x86 CPU nowadays supports SSE3,
+// but the SSE2 fallbacks are easy.
+void XEmitter::MOVSLDUP(X64Reg regOp, const OpArg& arg)
+{
+ if (cpu_info.bSSE3)
+ {
+ WriteSSEOp(0xF3, 0x12, regOp, arg);
+ }
+ else
+ {
+ if (!arg.IsSimpleReg(regOp))
+ MOVAPD(regOp, arg);
+ UNPCKLPS(regOp, R(regOp));
+ }
+}
+void XEmitter::MOVSHDUP(X64Reg regOp, const OpArg& arg)
+{
+ if (cpu_info.bSSE3)
+ {
+ WriteSSEOp(0xF3, 0x16, regOp, arg);
+ }
+ else
+ {
+ if (!arg.IsSimpleReg(regOp))
+ MOVAPD(regOp, arg);
+ UNPCKHPS(regOp, R(regOp));
+ }
+}
+void XEmitter::MOVDDUP(X64Reg regOp, const OpArg& arg)
+{
+ if (cpu_info.bSSE3)
+ {
+ WriteSSEOp(0xF2, 0x12, regOp, arg);
+ }
+ else
+ {
+ if (!arg.IsSimpleReg(regOp))
+ MOVSD(regOp, arg);
+ UNPCKLPD(regOp, R(regOp));
+ }
+}
+
+// There are a few more left
+
+// Also some integer instructions are missing
+void XEmitter::PACKSSDW(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0x6B, dest, arg);
+}
+void XEmitter::PACKSSWB(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0x63, dest, arg);
+}
+void XEmitter::PACKUSWB(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0x67, dest, arg);
+}
+
+void XEmitter::PUNPCKLBW(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0x60, dest, arg);
+}
+void XEmitter::PUNPCKLWD(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0x61, dest, arg);
+}
+void XEmitter::PUNPCKLDQ(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0x62, dest, arg);
+}
+void XEmitter::PUNPCKLQDQ(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0x6C, dest, arg);
+}
+
+void XEmitter::PSRLW(X64Reg reg, int shift)
+{
+ WriteSSEOp(0x66, 0x71, (X64Reg)2, R(reg));
+ Write8(shift);
+}
+
+void XEmitter::PSRLD(X64Reg reg, int shift)
+{
+ WriteSSEOp(0x66, 0x72, (X64Reg)2, R(reg));
+ Write8(shift);
+}
+
+void XEmitter::PSRLQ(X64Reg reg, int shift)
+{
+ WriteSSEOp(0x66, 0x73, (X64Reg)2, R(reg));
+ Write8(shift);
+}
+
+void XEmitter::PSRLQ(X64Reg reg, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0xd3, reg, arg);
+}
+
+void XEmitter::PSRLDQ(X64Reg reg, int shift)
+{
+ WriteSSEOp(0x66, 0x73, (X64Reg)3, R(reg));
+ Write8(shift);
+}
+
+void XEmitter::PSLLW(X64Reg reg, int shift)
+{
+ WriteSSEOp(0x66, 0x71, (X64Reg)6, R(reg));
+ Write8(shift);
+}
+
+void XEmitter::PSLLD(X64Reg reg, int shift)
+{
+ WriteSSEOp(0x66, 0x72, (X64Reg)6, R(reg));
+ Write8(shift);
+}
+
+void XEmitter::PSLLQ(X64Reg reg, int shift)
+{
+ WriteSSEOp(0x66, 0x73, (X64Reg)6, R(reg));
+ Write8(shift);
+}
+
+void XEmitter::PSLLDQ(X64Reg reg, int shift)
+{
+ WriteSSEOp(0x66, 0x73, (X64Reg)7, R(reg));
+ Write8(shift);
+}
+
+// WARNING not REX compatible
+void XEmitter::PSRAW(X64Reg reg, int shift)
+{
+ if (reg > 7)
+ PanicAlert("The PSRAW-emitter does not support regs above 7");
+ Write8(0x66);
+ Write8(0x0f);
+ Write8(0x71);
+ Write8(0xE0 | reg);
+ Write8(shift);
+}
+
+// WARNING not REX compatible
+void XEmitter::PSRAD(X64Reg reg, int shift)
+{
+ if (reg > 7)
+ PanicAlert("The PSRAD-emitter does not support regs above 7");
+ Write8(0x66);
+ Write8(0x0f);
+ Write8(0x72);
+ Write8(0xE0 | reg);
+ Write8(shift);
+}
+
+void XEmitter::WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes)
+{
+ if (!cpu_info.bSSSE3)
+ PanicAlert("Trying to use SSSE3 on a system that doesn't support it. Bad programmer.");
+ WriteSSEOp(opPrefix, op, regOp, arg, extrabytes);
+}
+
+void XEmitter::WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes)
+{
+ if (!cpu_info.bSSE4_1)
+ PanicAlert("Trying to use SSE4.1 on a system that doesn't support it. Bad programmer.");
+ WriteSSEOp(opPrefix, op, regOp, arg, extrabytes);
+}
+
+void XEmitter::PSHUFB(X64Reg dest, const OpArg& arg)
+{
+ WriteSSSE3Op(0x66, 0x3800, dest, arg);
+}
+void XEmitter::PTEST(X64Reg dest, const OpArg& arg)
+{
+ WriteSSE41Op(0x66, 0x3817, dest, arg);
+}
+void XEmitter::PACKUSDW(X64Reg dest, const OpArg& arg)
+{
+ WriteSSE41Op(0x66, 0x382b, dest, arg);
+}
+
+void XEmitter::PMOVSXBW(X64Reg dest, const OpArg& arg)
+{
+ WriteSSE41Op(0x66, 0x3820, dest, arg);
+}
+void XEmitter::PMOVSXBD(X64Reg dest, const OpArg& arg)
+{
+ WriteSSE41Op(0x66, 0x3821, dest, arg);
+}
+void XEmitter::PMOVSXBQ(X64Reg dest, const OpArg& arg)
+{
+ WriteSSE41Op(0x66, 0x3822, dest, arg);
+}
+void XEmitter::PMOVSXWD(X64Reg dest, const OpArg& arg)
+{
+ WriteSSE41Op(0x66, 0x3823, dest, arg);
+}
+void XEmitter::PMOVSXWQ(X64Reg dest, const OpArg& arg)
+{
+ WriteSSE41Op(0x66, 0x3824, dest, arg);
+}
+void XEmitter::PMOVSXDQ(X64Reg dest, const OpArg& arg)
+{
+ WriteSSE41Op(0x66, 0x3825, dest, arg);
+}
+void XEmitter::PMOVZXBW(X64Reg dest, const OpArg& arg)
+{
+ WriteSSE41Op(0x66, 0x3830, dest, arg);
+}
+void XEmitter::PMOVZXBD(X64Reg dest, const OpArg& arg)
+{
+ WriteSSE41Op(0x66, 0x3831, dest, arg);
+}
+void XEmitter::PMOVZXBQ(X64Reg dest, const OpArg& arg)
+{
+ WriteSSE41Op(0x66, 0x3832, dest, arg);
+}
+void XEmitter::PMOVZXWD(X64Reg dest, const OpArg& arg)
+{
+ WriteSSE41Op(0x66, 0x3833, dest, arg);
+}
+void XEmitter::PMOVZXWQ(X64Reg dest, const OpArg& arg)
+{
+ WriteSSE41Op(0x66, 0x3834, dest, arg);
+}
+void XEmitter::PMOVZXDQ(X64Reg dest, const OpArg& arg)
+{
+ WriteSSE41Op(0x66, 0x3835, dest, arg);
+}
+
+void XEmitter::PBLENDVB(X64Reg dest, const OpArg& arg)
+{
+ WriteSSE41Op(0x66, 0x3810, dest, arg);
+}
+void XEmitter::BLENDVPS(X64Reg dest, const OpArg& arg)
+{
+ WriteSSE41Op(0x66, 0x3814, dest, arg);
+}
+void XEmitter::BLENDVPD(X64Reg dest, const OpArg& arg)
+{
+ WriteSSE41Op(0x66, 0x3815, dest, arg);
+}
+void XEmitter::BLENDPS(X64Reg dest, const OpArg& arg, u8 blend)
+{
+ WriteSSE41Op(0x66, 0x3A0C, dest, arg, 1);
+ Write8(blend);
+}
+void XEmitter::BLENDPD(X64Reg dest, const OpArg& arg, u8 blend)
+{
+ WriteSSE41Op(0x66, 0x3A0D, dest, arg, 1);
+ Write8(blend);
+}
+
+void XEmitter::PAND(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0xDB, dest, arg);
+}
+void XEmitter::PANDN(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0xDF, dest, arg);
+}
+void XEmitter::PXOR(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0xEF, dest, arg);
+}
+void XEmitter::POR(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0xEB, dest, arg);
+}
+
+void XEmitter::PADDB(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0xFC, dest, arg);
+}
+void XEmitter::PADDW(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0xFD, dest, arg);
+}
+void XEmitter::PADDD(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0xFE, dest, arg);
+}
+void XEmitter::PADDQ(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0xD4, dest, arg);
+}
+
+void XEmitter::PADDSB(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0xEC, dest, arg);
+}
+void XEmitter::PADDSW(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0xED, dest, arg);
+}
+void XEmitter::PADDUSB(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0xDC, dest, arg);
+}
+void XEmitter::PADDUSW(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0xDD, dest, arg);
+}
+
+void XEmitter::PSUBB(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0xF8, dest, arg);
+}
+void XEmitter::PSUBW(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0xF9, dest, arg);
+}
+void XEmitter::PSUBD(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0xFA, dest, arg);
+}
+void XEmitter::PSUBQ(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0xFB, dest, arg);
+}
+
+void XEmitter::PSUBSB(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0xE8, dest, arg);
+}
+void XEmitter::PSUBSW(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0xE9, dest, arg);
+}
+void XEmitter::PSUBUSB(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0xD8, dest, arg);
+}
+void XEmitter::PSUBUSW(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0xD9, dest, arg);
+}
+
+void XEmitter::PAVGB(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0xE0, dest, arg);
+}
+void XEmitter::PAVGW(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0xE3, dest, arg);
+}
+
+void XEmitter::PCMPEQB(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0x74, dest, arg);
+}
+void XEmitter::PCMPEQW(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0x75, dest, arg);
+}
+void XEmitter::PCMPEQD(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0x76, dest, arg);
+}
+
+void XEmitter::PCMPGTB(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0x64, dest, arg);
+}
+void XEmitter::PCMPGTW(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0x65, dest, arg);
+}
+void XEmitter::PCMPGTD(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0x66, dest, arg);
+}
+
+void XEmitter::PEXTRW(X64Reg dest, const OpArg& arg, u8 subreg)
+{
+ WriteSSEOp(0x66, 0xC5, dest, arg);
+ Write8(subreg);
+}
+void XEmitter::PINSRW(X64Reg dest, const OpArg& arg, u8 subreg)
+{
+ WriteSSEOp(0x66, 0xC4, dest, arg);
+ Write8(subreg);
+}
+void XEmitter::PINSRD(X64Reg dest, const OpArg& arg, u8 subreg)
+{
+ WriteSSE41Op(0x66, 0x3A22, dest, arg);
+ Write8(subreg);
+}
+
+void XEmitter::PMADDWD(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0xF5, dest, arg);
+}
+void XEmitter::PSADBW(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0xF6, dest, arg);
+}
+
+void XEmitter::PMAXSW(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0xEE, dest, arg);
+}
+void XEmitter::PMAXUB(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0xDE, dest, arg);
+}
+void XEmitter::PMINSW(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0xEA, dest, arg);
+}
+void XEmitter::PMINUB(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0xDA, dest, arg);
+}
+
+void XEmitter::PMOVMSKB(X64Reg dest, const OpArg& arg)
+{
+ WriteSSEOp(0x66, 0xD7, dest, arg);
+}
+void XEmitter::PSHUFD(X64Reg regOp, const OpArg& arg, u8 shuffle)
+{
+ WriteSSEOp(0x66, 0x70, regOp, arg, 1);
+ Write8(shuffle);
+}
+void XEmitter::PSHUFLW(X64Reg regOp, const OpArg& arg, u8 shuffle)
+{
+ WriteSSEOp(0xF2, 0x70, regOp, arg, 1);
+ Write8(shuffle);
+}
+void XEmitter::PSHUFHW(X64Reg regOp, const OpArg& arg, u8 shuffle)
+{
+ WriteSSEOp(0xF3, 0x70, regOp, arg, 1);
+ Write8(shuffle);
+}
+
+// VEX
+void XEmitter::VADDSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteAVXOp(0xF3, sseADD, regOp1, regOp2, arg);
+}
+void XEmitter::VSUBSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteAVXOp(0xF3, sseSUB, regOp1, regOp2, arg);
+}
+void XEmitter::VMULSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteAVXOp(0xF3, sseMUL, regOp1, regOp2, arg);
+}
+void XEmitter::VDIVSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteAVXOp(0xF3, sseDIV, regOp1, regOp2, arg);
+}
+void XEmitter::VADDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteAVXOp(0x00, sseADD, regOp1, regOp2, arg);
+}
+void XEmitter::VSUBPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteAVXOp(0x00, sseSUB, regOp1, regOp2, arg);
+}
+void XEmitter::VMULPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteAVXOp(0x00, sseMUL, regOp1, regOp2, arg);
+}
+void XEmitter::VDIVPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteAVXOp(0x00, sseDIV, regOp1, regOp2, arg);
+}
+void XEmitter::VADDSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteAVXOp(0xF2, sseADD, regOp1, regOp2, arg);
+}
+void XEmitter::VSUBSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteAVXOp(0xF2, sseSUB, regOp1, regOp2, arg);
+}
+void XEmitter::VMULSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteAVXOp(0xF2, sseMUL, regOp1, regOp2, arg);
+}
+void XEmitter::VDIVSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteAVXOp(0xF2, sseDIV, regOp1, regOp2, arg);
+}
+void XEmitter::VADDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteAVXOp(0x66, sseADD, regOp1, regOp2, arg);
+}
+void XEmitter::VSUBPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteAVXOp(0x66, sseSUB, regOp1, regOp2, arg);
+}
+void XEmitter::VMULPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteAVXOp(0x66, sseMUL, regOp1, regOp2, arg);
+}
+void XEmitter::VDIVPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteAVXOp(0x66, sseDIV, regOp1, regOp2, arg);
+}
+void XEmitter::VSQRTSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteAVXOp(0xF2, sseSQRT, regOp1, regOp2, arg);
+}
+void XEmitter::VCMPPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 compare)
+{
+ WriteAVXOp(0x66, sseCMP, regOp1, regOp2, arg, 0, 1);
+ Write8(compare);
+}
+void XEmitter::VSHUFPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 shuffle)
+{
+ WriteAVXOp(0x00, sseSHUF, regOp1, regOp2, arg, 0, 1);
+ Write8(shuffle);
+}
+void XEmitter::VSHUFPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 shuffle)
+{
+ WriteAVXOp(0x66, sseSHUF, regOp1, regOp2, arg, 0, 1);
+ Write8(shuffle);
+}
+void XEmitter::VUNPCKLPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteAVXOp(0x00, 0x14, regOp1, regOp2, arg);
+}
+void XEmitter::VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteAVXOp(0x66, 0x14, regOp1, regOp2, arg);
+}
+void XEmitter::VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteAVXOp(0x66, 0x15, regOp1, regOp2, arg);
+}
+void XEmitter::VBLENDVPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, X64Reg regOp3)
+{
+ WriteAVXOp4(0x66, 0x3A4B, regOp1, regOp2, arg, regOp3);
+}
+void XEmitter::VBLENDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 blend)
+{
+ WriteAVXOp(0x66, 0x3A0C, regOp1, regOp2, arg, 0, 1);
+ Write8(blend);
+}
+void XEmitter::VBLENDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 blend)
+{
+ WriteAVXOp(0x66, 0x3A0D, regOp1, regOp2, arg, 0, 1);
+ Write8(blend);
+}
+
+void XEmitter::VANDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteAVXOp(0x00, sseAND, regOp1, regOp2, arg);
+}
+void XEmitter::VANDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteAVXOp(0x66, sseAND, regOp1, regOp2, arg);
+}
+void XEmitter::VANDNPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteAVXOp(0x00, sseANDN, regOp1, regOp2, arg);
+}
+void XEmitter::VANDNPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteAVXOp(0x66, sseANDN, regOp1, regOp2, arg);
+}
+void XEmitter::VORPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteAVXOp(0x00, sseOR, regOp1, regOp2, arg);
+}
+void XEmitter::VORPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteAVXOp(0x66, sseOR, regOp1, regOp2, arg);
+}
+void XEmitter::VXORPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteAVXOp(0x00, sseXOR, regOp1, regOp2, arg);
+}
+void XEmitter::VXORPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteAVXOp(0x66, sseXOR, regOp1, regOp2, arg);
+}
+
+void XEmitter::VPAND(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteAVXOp(0x66, 0xDB, regOp1, regOp2, arg);
+}
+void XEmitter::VPANDN(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteAVXOp(0x66, 0xDF, regOp1, regOp2, arg);
+}
+void XEmitter::VPOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteAVXOp(0x66, 0xEB, regOp1, regOp2, arg);
+}
+void XEmitter::VPXOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteAVXOp(0x66, 0xEF, regOp1, regOp2, arg);
+}
+
+void XEmitter::VFMADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0x98, regOp1, regOp2, arg);
+}
+void XEmitter::VFMADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xA8, regOp1, regOp2, arg);
+}
+void XEmitter::VFMADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xB8, regOp1, regOp2, arg);
+}
+void XEmitter::VFMADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0x98, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFMADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xA8, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFMADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xB8, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFMADD132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0x99, regOp1, regOp2, arg);
+}
+void XEmitter::VFMADD213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xA9, regOp1, regOp2, arg);
+}
+void XEmitter::VFMADD231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xB9, regOp1, regOp2, arg);
+}
+void XEmitter::VFMADD132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0x99, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFMADD213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xA9, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFMADD231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xB9, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFMSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0x9A, regOp1, regOp2, arg);
+}
+void XEmitter::VFMSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xAA, regOp1, regOp2, arg);
+}
+void XEmitter::VFMSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xBA, regOp1, regOp2, arg);
+}
+void XEmitter::VFMSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0x9A, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFMSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xAA, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFMSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xBA, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFMSUB132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0x9B, regOp1, regOp2, arg);
+}
+void XEmitter::VFMSUB213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xAB, regOp1, regOp2, arg);
+}
+void XEmitter::VFMSUB231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xBB, regOp1, regOp2, arg);
+}
+void XEmitter::VFMSUB132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0x9B, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFMSUB213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xAB, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFMSUB231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xBB, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFNMADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0x9C, regOp1, regOp2, arg);
+}
+void XEmitter::VFNMADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xAC, regOp1, regOp2, arg);
+}
+void XEmitter::VFNMADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xBC, regOp1, regOp2, arg);
+}
+void XEmitter::VFNMADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0x9C, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFNMADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xAC, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFNMADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xBC, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFNMADD132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0x9D, regOp1, regOp2, arg);
+}
+void XEmitter::VFNMADD213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xAD, regOp1, regOp2, arg);
+}
+void XEmitter::VFNMADD231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xBD, regOp1, regOp2, arg);
+}
+void XEmitter::VFNMADD132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0x9D, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFNMADD213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xAD, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFNMADD231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xBD, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFNMSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0x9E, regOp1, regOp2, arg);
+}
+void XEmitter::VFNMSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xAE, regOp1, regOp2, arg);
+}
+void XEmitter::VFNMSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xBE, regOp1, regOp2, arg);
+}
+void XEmitter::VFNMSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0x9E, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFNMSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xAE, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFNMSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xBE, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFNMSUB132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0x9F, regOp1, regOp2, arg);
+}
+void XEmitter::VFNMSUB213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xAF, regOp1, regOp2, arg);
+}
+void XEmitter::VFNMSUB231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xBF, regOp1, regOp2, arg);
+}
+void XEmitter::VFNMSUB132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0x9F, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFNMSUB213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xAF, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFNMSUB231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xBF, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFMADDSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0x96, regOp1, regOp2, arg);
+}
+void XEmitter::VFMADDSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xA6, regOp1, regOp2, arg);
+}
+void XEmitter::VFMADDSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xB6, regOp1, regOp2, arg);
+}
+void XEmitter::VFMADDSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0x96, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFMADDSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xA6, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFMADDSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xB6, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFMSUBADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0x97, regOp1, regOp2, arg);
+}
+void XEmitter::VFMSUBADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xA7, regOp1, regOp2, arg);
+}
+void XEmitter::VFMSUBADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xB7, regOp1, regOp2, arg);
+}
+void XEmitter::VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0x97, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xA7, regOp1, regOp2, arg, 1);
+}
+void XEmitter::VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteFMA3Op(0xB7, regOp1, regOp2, arg, 1);
+}
+
+#define FMA4(name, op) \
+ void XEmitter::name(X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg) \
+ { \
+ WriteFMA4Op(op, dest, regOp1, regOp2, arg, 1); \
+ } \
+ void XEmitter::name(X64Reg dest, X64Reg regOp1, const OpArg& arg, X64Reg regOp2) \
+ { \
+ WriteFMA4Op(op, dest, regOp1, regOp2, arg, 0); \
+ }
+
+FMA4(VFMADDSUBPS, 0x5C)
+FMA4(VFMADDSUBPD, 0x5D)
+FMA4(VFMSUBADDPS, 0x5E)
+FMA4(VFMSUBADDPD, 0x5F)
+FMA4(VFMADDPS, 0x68)
+FMA4(VFMADDPD, 0x69)
+FMA4(VFMADDSS, 0x6A)
+FMA4(VFMADDSD, 0x6B)
+FMA4(VFMSUBPS, 0x6C)
+FMA4(VFMSUBPD, 0x6D)
+FMA4(VFMSUBSS, 0x6E)
+FMA4(VFMSUBSD, 0x6F)
+FMA4(VFNMADDPS, 0x78)
+FMA4(VFNMADDPD, 0x79)
+FMA4(VFNMADDSS, 0x7A)
+FMA4(VFNMADDSD, 0x7B)
+FMA4(VFNMSUBPS, 0x7C)
+FMA4(VFNMSUBPD, 0x7D)
+FMA4(VFNMSUBSS, 0x7E)
+FMA4(VFNMSUBSD, 0x7F)
+#undef FMA4
+
+void XEmitter::SARX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2)
+{
+ WriteBMI2Op(bits, 0xF3, 0x38F7, regOp1, regOp2, arg);
+}
+void XEmitter::SHLX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2)
+{
+ WriteBMI2Op(bits, 0x66, 0x38F7, regOp1, regOp2, arg);
+}
+void XEmitter::SHRX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2)
+{
+ WriteBMI2Op(bits, 0xF2, 0x38F7, regOp1, regOp2, arg);
+}
+void XEmitter::RORX(int bits, X64Reg regOp, const OpArg& arg, u8 rotate)
+{
+ WriteBMI2Op(bits, 0xF2, 0x3AF0, regOp, INVALID_REG, arg, 1);
+ Write8(rotate);
+}
+void XEmitter::PEXT(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteBMI2Op(bits, 0xF3, 0x38F5, regOp1, regOp2, arg);
+}
+void XEmitter::PDEP(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteBMI2Op(bits, 0xF2, 0x38F5, regOp1, regOp2, arg);
+}
+void XEmitter::MULX(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteBMI2Op(bits, 0xF2, 0x38F6, regOp2, regOp1, arg);
+}
+void XEmitter::BZHI(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2)
+{
+ CheckFlags();
+ WriteBMI2Op(bits, 0x00, 0x38F5, regOp1, regOp2, arg);
+}
+void XEmitter::BLSR(int bits, X64Reg regOp, const OpArg& arg)
+{
+ WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x1, regOp, arg);
+}
+void XEmitter::BLSMSK(int bits, X64Reg regOp, const OpArg& arg)
+{
+ WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x2, regOp, arg);
+}
+void XEmitter::BLSI(int bits, X64Reg regOp, const OpArg& arg)
+{
+ WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x3, regOp, arg);
+}
+void XEmitter::BEXTR(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2)
+{
+ WriteBMI1Op(bits, 0x00, 0x38F7, regOp1, regOp2, arg);
+}
+void XEmitter::ANDN(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
+{
+ WriteBMI1Op(bits, 0x00, 0x38F2, regOp1, regOp2, arg);
+}
+
+// Prefixes
+
+void XEmitter::LOCK()
+{
+ Write8(0xF0);
+}
+void XEmitter::REP()
+{
+ Write8(0xF3);
+}
+void XEmitter::REPNE()
+{
+ Write8(0xF2);
+}
+void XEmitter::FSOverride()
+{
+ Write8(0x64);
+}
+void XEmitter::GSOverride()
+{
+ Write8(0x65);
+}
+
+void XEmitter::FWAIT()
+{
+ Write8(0x9B);
+}
+
+// TODO: make this more generic
+void XEmitter::WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, const OpArg& arg)
+{
+ int mf = 0;
+ ASSERT_MSG(DYNA_REC, !(bits == 80 && op_80b == FloatOp::Invalid),
+ "WriteFloatLoadStore: 80 bits not supported for this instruction");
+ switch (bits)
+ {
+ case 32:
+ mf = 0;
+ break;
+ case 64:
+ mf = 4;
+ break;
+ case 80:
+ mf = 2;
+ break;
+ default:
+ ASSERT_MSG(DYNA_REC, 0, "WriteFloatLoadStore: invalid bits (should be 32/64/80)");
+ }
+ Write8(0xd9 | mf);
+ // x87 instructions use the reg field of the ModR/M byte as opcode:
+ if (bits == 80)
+ op = op_80b;
+ arg.WriteRest(this, 0, static_cast<X64Reg>(op));
+}
+
+void XEmitter::FLD(int bits, const OpArg& src)
+{
+ WriteFloatLoadStore(bits, FloatOp::LD, FloatOp::LD80, src);
+}
+void XEmitter::FST(int bits, const OpArg& dest)
+{
+ WriteFloatLoadStore(bits, FloatOp::ST, FloatOp::Invalid, dest);
+}
+void XEmitter::FSTP(int bits, const OpArg& dest)
+{
+ WriteFloatLoadStore(bits, FloatOp::STP, FloatOp::STP80, dest);
+}
+void XEmitter::FNSTSW_AX()
+{
+ Write8(0xDF);
+ Write8(0xE0);
+}
+
+void XEmitter::RDTSC()
+{
+ Write8(0x0F);
+ Write8(0x31);
+}
+}
diff --git a/src/dolphin/x64Emitter.h b/src/dolphin/x64Emitter.h
new file mode 100644
index 0000000..869acb6
--- /dev/null
+++ b/src/dolphin/x64Emitter.h
@@ -0,0 +1,1169 @@
+// Copyright 2008 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+// WARNING - THIS LIBRARY IS NOT THREAD SAFE!!!
+
+#pragma once
+
+#include <cstddef>
+#include <cstring>
+#include <functional>
+#include <tuple>
+#include <type_traits>
+
+#include "Compat.h"
+#include "BitSet.h"
+#include "../types.h"
+#include "x64ABI.h"
+
+namespace Gen
+{
+enum CCFlags
+{
+ CC_O = 0,
+ CC_NO = 1,
+ CC_B = 2,
+ CC_C = 2,
+ CC_NAE = 2,
+ CC_NB = 3,
+ CC_NC = 3,
+ CC_AE = 3,
+ CC_Z = 4,
+ CC_E = 4,
+ CC_NZ = 5,
+ CC_NE = 5,
+ CC_BE = 6,
+ CC_NA = 6,
+ CC_NBE = 7,
+ CC_A = 7,
+ CC_S = 8,
+ CC_NS = 9,
+ CC_P = 0xA,
+ CC_PE = 0xA,
+ CC_NP = 0xB,
+ CC_PO = 0xB,
+ CC_L = 0xC,
+ CC_NGE = 0xC,
+ CC_NL = 0xD,
+ CC_GE = 0xD,
+ CC_LE = 0xE,
+ CC_NG = 0xE,
+ CC_NLE = 0xF,
+ CC_G = 0xF
+};
+
+enum
+{
+ NUMGPRs = 16,
+ NUMXMMs = 16,
+};
+
+enum
+{
+ SCALE_NONE = 0,
+ SCALE_1 = 1,
+ SCALE_2 = 2,
+ SCALE_4 = 4,
+ SCALE_8 = 8,
+ SCALE_ATREG = 16,
+ // SCALE_NOBASE_1 is not supported and can be replaced with SCALE_ATREG
+ SCALE_NOBASE_2 = 34,
+ SCALE_NOBASE_4 = 36,
+ SCALE_NOBASE_8 = 40,
+ SCALE_RIP = 0xFF,
+ SCALE_IMM8 = 0xF0,
+ SCALE_IMM16 = 0xF1,
+ SCALE_IMM32 = 0xF2,
+ SCALE_IMM64 = 0xF3,
+};
+
+enum SSECompare
+{
+ CMP_EQ = 0,
+ CMP_LT = 1,
+ CMP_LE = 2,
+ CMP_UNORD = 3,
+ CMP_NEQ = 4,
+ CMP_NLT = 5,
+ CMP_NLE = 6,
+ CMP_ORD = 7,
+};
+
+class XEmitter;
+enum class FloatOp;
+enum class NormalOp;
+
+// Information about a generated MOV op
+struct MovInfo final
+{
+ u8* address;
+ bool nonAtomicSwapStore;
+ // valid iff nonAtomicSwapStore is true
+ X64Reg nonAtomicSwapStoreSrc;
+};
+
+// RIP addressing does not benefit from micro op fusion on Core arch
+struct OpArg
+{
+ // For accessing offset and operandReg.
+ // This also allows us to keep the op writing functions private.
+ friend class XEmitter;
+
+ // dummy op arg, used for storage
+ constexpr OpArg() = default;
+ constexpr OpArg(u64 offset_, int scale_, X64Reg rm_reg = RAX, X64Reg scaled_reg = RAX)
+ : scale{static_cast<u8>(scale_)}, offsetOrBaseReg{static_cast<u16>(rm_reg)},
+ indexReg{static_cast<u16>(scaled_reg)}, offset{offset_}
+ {
+ }
+ constexpr bool operator==(const OpArg& b) const
+ {
+ // TODO: Use std::tie here once Dolphin requires C++17. (We can't do it immediately,
+ // (because we still support some older versions of GCC where std::tie is not constexpr.)
+ return operandReg == b.operandReg && scale == b.scale && offsetOrBaseReg == b.offsetOrBaseReg &&
+ indexReg == b.indexReg && offset == b.offset;
+ }
+ constexpr bool operator!=(const OpArg& b) const { return !operator==(b); }
+ u64 Imm64() const
+ {
+ DEBUG_ASSERT(scale == SCALE_IMM64);
+ return (u64)offset;
+ }
+ u32 Imm32() const
+ {
+ DEBUG_ASSERT(scale == SCALE_IMM32);
+ return (u32)offset;
+ }
+ u16 Imm16() const
+ {
+ DEBUG_ASSERT(scale == SCALE_IMM16);
+ return (u16)offset;
+ }
+ u8 Imm8() const
+ {
+ DEBUG_ASSERT(scale == SCALE_IMM8);
+ return (u8)offset;
+ }
+
+ s64 SImm64() const
+ {
+ DEBUG_ASSERT(scale == SCALE_IMM64);
+ return (s64)offset;
+ }
+ s32 SImm32() const
+ {
+ DEBUG_ASSERT(scale == SCALE_IMM32);
+ return (s32)offset;
+ }
+ s16 SImm16() const
+ {
+ DEBUG_ASSERT(scale == SCALE_IMM16);
+ return (s16)offset;
+ }
+ s8 SImm8() const
+ {
+ DEBUG_ASSERT(scale == SCALE_IMM8);
+ return (s8)offset;
+ }
+
+ OpArg AsImm64() const
+ {
+ DEBUG_ASSERT(IsImm());
+ return OpArg((u64)offset, SCALE_IMM64);
+ }
+ OpArg AsImm32() const
+ {
+ DEBUG_ASSERT(IsImm());
+ return OpArg((u32)offset, SCALE_IMM32);
+ }
+ OpArg AsImm16() const
+ {
+ DEBUG_ASSERT(IsImm());
+ return OpArg((u16)offset, SCALE_IMM16);
+ }
+ OpArg AsImm8() const
+ {
+ DEBUG_ASSERT(IsImm());
+ return OpArg((u8)offset, SCALE_IMM8);
+ }
+
+ constexpr bool IsImm() const
+ {
+ return scale == SCALE_IMM8 || scale == SCALE_IMM16 || scale == SCALE_IMM32 ||
+ scale == SCALE_IMM64;
+ }
+ constexpr bool IsSimpleReg() const { return scale == SCALE_NONE; }
+ constexpr bool IsSimpleReg(X64Reg reg) const { return IsSimpleReg() && GetSimpleReg() == reg; }
+ constexpr bool IsZero() const { return IsImm() && offset == 0; }
+ constexpr int GetImmBits() const
+ {
+ switch (scale)
+ {
+ case SCALE_IMM8:
+ return 8;
+ case SCALE_IMM16:
+ return 16;
+ case SCALE_IMM32:
+ return 32;
+ case SCALE_IMM64:
+ return 64;
+ default:
+ return -1;
+ }
+ }
+
+ constexpr X64Reg GetSimpleReg() const
+ {
+ if (scale == SCALE_NONE)
+ return static_cast<X64Reg>(offsetOrBaseReg);
+
+ return INVALID_REG;
+ }
+
+ void AddMemOffset(int val)
+ {
+ DEBUG_ASSERT_MSG(DYNA_REC, scale == SCALE_RIP || (scale <= SCALE_ATREG && scale > SCALE_NONE),
+ "Tried to increment an OpArg which doesn't have an offset");
+ offset += val;
+ }
+
+private:
+ void WriteREX(XEmitter* emit, int opBits, int bits, int customOp = -1) const;
+ void WriteVEX(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm,
+ int W = 0) const;
+ void WriteRest(XEmitter* emit, int extraBytes = 0, X64Reg operandReg = INVALID_REG,
+ bool warn_64bit_offset = true) const;
+ void WriteSingleByteOp(XEmitter* emit, u8 op, X64Reg operandReg, int bits);
+ void WriteNormalOp(XEmitter* emit, bool toRM, NormalOp op, const OpArg& operand, int bits) const;
+
+ u8 scale = 0;
+ u16 offsetOrBaseReg = 0;
+ u16 indexReg = 0;
+ u64 offset = 0; // Also used to store immediates.
+ u16 operandReg = 0;
+};
+
+template <typename T>
+inline OpArg M(const T* ptr)
+{
+ return OpArg((u64)(const void*)ptr, (int)SCALE_RIP);
+}
+constexpr OpArg R(X64Reg value)
+{
+ return OpArg(0, SCALE_NONE, value);
+}
+constexpr OpArg MatR(X64Reg value)
+{
+ return OpArg(0, SCALE_ATREG, value);
+}
+
+constexpr OpArg MDisp(X64Reg value, int offset)
+{
+ return OpArg(static_cast<u32>(offset), SCALE_ATREG, value);
+}
+
+constexpr OpArg MComplex(X64Reg base, X64Reg scaled, int scale, int offset)
+{
+ return OpArg(offset, scale, base, scaled);
+}
+
+constexpr OpArg MScaled(X64Reg scaled, int scale, int offset)
+{
+ if (scale == SCALE_1)
+ return OpArg(offset, SCALE_ATREG, scaled);
+
+ return OpArg(offset, scale | 0x20, RAX, scaled);
+}
+
+constexpr OpArg MRegSum(X64Reg base, X64Reg offset)
+{
+ return MComplex(base, offset, 1, 0);
+}
+
+constexpr OpArg Imm8(u8 imm)
+{
+ return OpArg(imm, SCALE_IMM8);
+}
+constexpr OpArg Imm16(u16 imm)
+{
+ return OpArg(imm, SCALE_IMM16);
+} // rarely used
+constexpr OpArg Imm32(u32 imm)
+{
+ return OpArg(imm, SCALE_IMM32);
+}
+constexpr OpArg Imm64(u64 imm)
+{
+ return OpArg(imm, SCALE_IMM64);
+}
+inline OpArg ImmPtr(const void* imm)
+{
+ return Imm64(reinterpret_cast<u64>(imm));
+}
+
+inline u32 PtrOffset(const void* ptr, const void* base = nullptr)
+{
+ s64 distance = (s64)ptr - (s64)base;
+ if (distance >= 0x80000000LL || distance < -0x80000000LL)
+ {
+ ASSERT_MSG(DYNA_REC, 0, "pointer offset out of range");
+ return 0;
+ }
+
+ return (u32)distance;
+}
+
+// usage: int a[]; ARRAY_OFFSET(a,10)
+#define ARRAY_OFFSET(array, index) ((u32)((u64) & (array)[index] - (u64) & (array)[0]))
+// usage: struct {int e;} s; STRUCT_OFFSET(s,e)
+#define STRUCT_OFFSET(str, elem) ((u32)((u64) & (str).elem - (u64) & (str)))
+
+struct FixupBranch
+{
+ enum class Type
+ {
+ Branch8Bit,
+ Branch32Bit
+ };
+
+ u8* ptr;
+ Type type;
+};
+
+class XEmitter
+{
+ friend struct OpArg; // for Write8 etc
+private:
+ u8* code = nullptr;
+ bool flags_locked = false;
+
+ void CheckFlags();
+
+ void Rex(int w, int r, int x, int b);
+ void WriteModRM(int mod, int reg, int rm);
+ void WriteSIB(int scale, int index, int base);
+ void WriteSimple1Byte(int bits, u8 byte, X64Reg reg);
+ void WriteSimple2Byte(int bits, u8 byte1, u8 byte2, X64Reg reg);
+ void WriteMulDivType(int bits, OpArg src, int ext);
+ void WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2, bool rep = false);
+ void WriteShift(int bits, OpArg dest, const OpArg& shift, int ext);
+ void WriteBitTest(int bits, const OpArg& dest, const OpArg& index, int ext);
+ void WriteMXCSR(OpArg arg, int ext);
+ void WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0);
+ void WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes = 0);
+ void WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes = 0);
+ void WriteVEXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W = 0,
+ int extrabytes = 0);
+ void WriteVEXOp4(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+ X64Reg regOp3, int W = 0);
+ void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W = 0,
+ int extrabytes = 0);
+ void WriteAVXOp4(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+ X64Reg regOp3, int W = 0);
+ void WriteFMA3Op(u8 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W = 0);
+ void WriteFMA4Op(u8 op, X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int W = 0);
+ void WriteBMIOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+ int extrabytes = 0);
+ void WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+ int extrabytes = 0);
+ void WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+ int extrabytes = 0);
+ void WriteMOVBE(int bits, u8 op, X64Reg regOp, const OpArg& arg);
+ void WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, const OpArg& arg);
+ void WriteNormalOp(int bits, NormalOp op, const OpArg& a1, const OpArg& a2);
+
+ void ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size,
+ size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp);
+
+protected:
+ void Write8(u8 value);
+ void Write16(u16 value);
+ void Write32(u32 value);
+ void Write64(u64 value);
+
+public:
+ XEmitter() = default;
+ explicit XEmitter(u8* code_ptr) : code{code_ptr} {}
+ virtual ~XEmitter() = default;
+ void SetCodePtr(u8* ptr);
+ void ReserveCodeSpace(int bytes);
+ u8* AlignCodeTo(size_t alignment);
+ u8* AlignCode4();
+ u8* AlignCode16();
+ u8* AlignCodePage();
+ const u8* GetCodePtr() const;
+ u8* GetWritableCodePtr();
+
+ void LockFlags() { flags_locked = true; }
+ void UnlockFlags() { flags_locked = false; }
+ // Looking for one of these? It's BANNED!! Some instructions are slow on modern CPU
+ // INC, DEC, LOOP, LOOPNE, LOOPE, ENTER, LEAVE, XCHG, XLAT, REP MOVSB/MOVSD, REP SCASD + other
+ // string instr.,
+ // INC and DEC are slow on Intel Core, but not on AMD. They create a
+ // false flag dependency because they only update a subset of the flags.
+ // XCHG is SLOW and should be avoided.
+
+ // Debug breakpoint
+ void INT3();
+
+ // Do nothing
+ void NOP(size_t count = 1);
+
+ // Save energy in wait-loops on P4 only. Probably not too useful.
+ void PAUSE();
+
+ // Flag control
+ void STC();
+ void CLC();
+ void CMC();
+
+ // These two can not be executed in 64-bit mode on early Intel 64-bit CPU:s, only on Core2 and
+ // AMD!
+ void LAHF(); // 3 cycle vector path
+ void SAHF(); // direct path fast
+
+ // Stack control
+ void PUSH(X64Reg reg);
+ void POP(X64Reg reg);
+ void PUSH(int bits, const OpArg& reg);
+ void POP(int bits, const OpArg& reg);
+ void PUSHF();
+ void POPF();
+
+ // Flow control
+ void RET();
+ void RET_FAST();
+ void UD2();
+ FixupBranch J(bool force5bytes = false);
+
+ void JMP(const u8* addr, bool force5Bytes = false);
+ void JMPptr(const OpArg& arg);
+ void JMPself(); // infinite loop!
+#ifdef CALL
+#undef CALL
+#endif
+ void CALL(const void* fnptr);
+ FixupBranch CALL();
+ void CALLptr(OpArg arg);
+
+ FixupBranch J_CC(CCFlags conditionCode, bool force5bytes = false);
+ void J_CC(CCFlags conditionCode, const u8* addr);
+
+ void SetJumpTarget(const FixupBranch& branch);
+
+ void SETcc(CCFlags flag, OpArg dest);
+ // Note: CMOV brings small if any benefit on current CPUs.
+ void CMOVcc(int bits, X64Reg dest, OpArg src, CCFlags flag);
+
+ // Fences
+ void LFENCE();
+ void MFENCE();
+ void SFENCE();
+
+ // Bit scan
+ void BSF(int bits, X64Reg dest, const OpArg& src); // Bottom bit to top bit
+ void BSR(int bits, X64Reg dest, const OpArg& src); // Top bit to bottom bit
+
+ // Cache control
+ enum PrefetchLevel
+ {
+ PF_NTA, // Non-temporal (data used once and only once)
+ PF_T0, // All cache levels
+ PF_T1, // Levels 2+ (aliased to T0 on AMD)
+ PF_T2, // Levels 3+ (aliased to T0 on AMD)
+ };
+ void PREFETCH(PrefetchLevel level, OpArg arg);
+ void MOVNTI(int bits, const OpArg& dest, X64Reg src);
+ void MOVNTDQ(const OpArg& arg, X64Reg regOp);
+ void MOVNTPS(const OpArg& arg, X64Reg regOp);
+ void MOVNTPD(const OpArg& arg, X64Reg regOp);
+
+ // Multiplication / division
+ void MUL(int bits, const OpArg& src); // UNSIGNED
+ void IMUL(int bits, const OpArg& src); // SIGNED
+ void IMUL(int bits, X64Reg regOp, const OpArg& src);
+ void IMUL(int bits, X64Reg regOp, const OpArg& src, const OpArg& imm);
+ void DIV(int bits, const OpArg& src);
+ void IDIV(int bits, const OpArg& src);
+
+ // Shift
+ void ROL(int bits, const OpArg& dest, const OpArg& shift);
+ void ROR_(int bits, const OpArg& dest, const OpArg& shift);
+ void RCL(int bits, const OpArg& dest, const OpArg& shift);
+ void RCR(int bits, const OpArg& dest, const OpArg& shift);
+ void SHL(int bits, const OpArg& dest, const OpArg& shift);
+ void SHR(int bits, const OpArg& dest, const OpArg& shift);
+ void SAR(int bits, const OpArg& dest, const OpArg& shift);
+
+ // Bit Test
+ void BT(int bits, const OpArg& dest, const OpArg& index);
+ void BTS(int bits, const OpArg& dest, const OpArg& index);
+ void BTR(int bits, const OpArg& dest, const OpArg& index);
+ void BTC(int bits, const OpArg& dest, const OpArg& index);
+
+ // Double-Precision Shift
+ void SHRD(int bits, const OpArg& dest, const OpArg& src, const OpArg& shift);
+ void SHLD(int bits, const OpArg& dest, const OpArg& src, const OpArg& shift);
+
+ // Extend EAX into EDX in various ways
+ void CWD(int bits = 16);
+ inline void CDQ() { CWD(32); }
+ inline void CQO() { CWD(64); }
+ void CBW(int bits = 8);
+ inline void CWDE() { CBW(16); }
+ inline void CDQE() { CBW(32); }
+ // Load effective address
+ void LEA(int bits, X64Reg dest, OpArg src);
+
+ // Integer arithmetic
+ void NEG(int bits, const OpArg& src);
+ void ADD(int bits, const OpArg& a1, const OpArg& a2);
+ void ADC(int bits, const OpArg& a1, const OpArg& a2);
+ void SUB(int bits, const OpArg& a1, const OpArg& a2);
+ void SBB(int bits, const OpArg& a1, const OpArg& a2);
+ void AND(int bits, const OpArg& a1, const OpArg& a2);
+ void CMP(int bits, const OpArg& a1, const OpArg& a2);
+
+ // Bit operations
+ void NOT(int bits, const OpArg& src);
+ void OR(int bits, const OpArg& a1, const OpArg& a2);
+ void XOR(int bits, const OpArg& a1, const OpArg& a2);
+ void MOV(int bits, const OpArg& a1, const OpArg& a2);
+ void TEST(int bits, const OpArg& a1, const OpArg& a2);
+
+ void CMP_or_TEST(int bits, const OpArg& a1, const OpArg& a2);
+ void MOV_sum(int bits, X64Reg dest, const OpArg& a1, const OpArg& a2);
+
+ // Are these useful at all? Consider removing.
+ void XCHG(int bits, const OpArg& a1, const OpArg& a2);
+ void XCHG_AHAL();
+
+ // Byte swapping (32 and 64-bit only).
+ void BSWAP(int bits, X64Reg reg);
+
+ // Sign/zero extension
+ void MOVSX(int dbits, int sbits, X64Reg dest,
+ OpArg src); // automatically uses MOVSXD if necessary
+ void MOVZX(int dbits, int sbits, X64Reg dest, OpArg src);
+
+ // Available only on Atom or >= Haswell so far. Test with cpu_info.bMOVBE.
+ void MOVBE(int bits, X64Reg dest, const OpArg& src);
+ void MOVBE(int bits, const OpArg& dest, X64Reg src);
+ void LoadAndSwap(int size, X64Reg dst, const OpArg& src, bool sign_extend = false,
+ MovInfo* info = nullptr);
+ void SwapAndStore(int size, const OpArg& dst, X64Reg src, MovInfo* info = nullptr);
+
+ // Available only on AMD >= Phenom or Intel >= Haswell
+ void LZCNT(int bits, X64Reg dest, const OpArg& src);
+ // Note: this one is actually part of BMI1
+ void TZCNT(int bits, X64Reg dest, const OpArg& src);
+
+ // WARNING - These two take 11-13 cycles and are VectorPath! (AMD64)
+ void STMXCSR(const OpArg& memloc);
+ void LDMXCSR(const OpArg& memloc);
+
+ // Prefixes
+ void LOCK();
+ void REP();
+ void REPNE();
+ void FSOverride();
+ void GSOverride();
+
+ // x87
+ enum x87StatusWordBits
+ {
+ x87_InvalidOperation = 0x1,
+ x87_DenormalizedOperand = 0x2,
+ x87_DivisionByZero = 0x4,
+ x87_Overflow = 0x8,
+ x87_Underflow = 0x10,
+ x87_Precision = 0x20,
+ x87_StackFault = 0x40,
+ x87_ErrorSummary = 0x80,
+ x87_C0 = 0x100,
+ x87_C1 = 0x200,
+ x87_C2 = 0x400,
+ x87_TopOfStack = 0x2000 | 0x1000 | 0x800,
+ x87_C3 = 0x4000,
+ x87_FPUBusy = 0x8000,
+ };
+
+ void FLD(int bits, const OpArg& src);
+ void FST(int bits, const OpArg& dest);
+ void FSTP(int bits, const OpArg& dest);
+ void FNSTSW_AX();
+ void FWAIT();
+
+ // SSE/SSE2: Floating point arithmetic
+ void ADDSS(X64Reg regOp, const OpArg& arg);
+ void ADDSD(X64Reg regOp, const OpArg& arg);
+ void SUBSS(X64Reg regOp, const OpArg& arg);
+ void SUBSD(X64Reg regOp, const OpArg& arg);
+ void MULSS(X64Reg regOp, const OpArg& arg);
+ void MULSD(X64Reg regOp, const OpArg& arg);
+ void DIVSS(X64Reg regOp, const OpArg& arg);
+ void DIVSD(X64Reg regOp, const OpArg& arg);
+ void MINSS(X64Reg regOp, const OpArg& arg);
+ void MINSD(X64Reg regOp, const OpArg& arg);
+ void MAXSS(X64Reg regOp, const OpArg& arg);
+ void MAXSD(X64Reg regOp, const OpArg& arg);
+ void SQRTSS(X64Reg regOp, const OpArg& arg);
+ void SQRTSD(X64Reg regOp, const OpArg& arg);
+ void RCPSS(X64Reg regOp, const OpArg& arg);
+ void RSQRTSS(X64Reg regOp, const OpArg& arg);
+
+ // SSE/SSE2: Floating point bitwise (yes)
+ void CMPSS(X64Reg regOp, const OpArg& arg, u8 compare);
+ void CMPSD(X64Reg regOp, const OpArg& arg, u8 compare);
+
+ // SSE/SSE2: Floating point packed arithmetic (x4 for float, x2 for double)
+ void ADDPS(X64Reg regOp, const OpArg& arg);
+ void ADDPD(X64Reg regOp, const OpArg& arg);
+ void SUBPS(X64Reg regOp, const OpArg& arg);
+ void SUBPD(X64Reg regOp, const OpArg& arg);
+ void CMPPS(X64Reg regOp, const OpArg& arg, u8 compare);
+ void CMPPD(X64Reg regOp, const OpArg& arg, u8 compare);
+ void MULPS(X64Reg regOp, const OpArg& arg);
+ void MULPD(X64Reg regOp, const OpArg& arg);
+ void DIVPS(X64Reg regOp, const OpArg& arg);
+ void DIVPD(X64Reg regOp, const OpArg& arg);
+ void MINPS(X64Reg regOp, const OpArg& arg);
+ void MINPD(X64Reg regOp, const OpArg& arg);
+ void MAXPS(X64Reg regOp, const OpArg& arg);
+ void MAXPD(X64Reg regOp, const OpArg& arg);
+ void SQRTPS(X64Reg regOp, const OpArg& arg);
+ void SQRTPD(X64Reg regOp, const OpArg& arg);
+ void RCPPS(X64Reg regOp, const OpArg& arg);
+ void RSQRTPS(X64Reg regOp, const OpArg& arg);
+
+ // SSE/SSE2: Floating point packed bitwise (x4 for float, x2 for double)
+ void ANDPS(X64Reg regOp, const OpArg& arg);
+ void ANDPD(X64Reg regOp, const OpArg& arg);
+ void ANDNPS(X64Reg regOp, const OpArg& arg);
+ void ANDNPD(X64Reg regOp, const OpArg& arg);
+ void ORPS(X64Reg regOp, const OpArg& arg);
+ void ORPD(X64Reg regOp, const OpArg& arg);
+ void XORPS(X64Reg regOp, const OpArg& arg);
+ void XORPD(X64Reg regOp, const OpArg& arg);
+
+ // SSE/SSE2: Shuffle components. These are tricky - see Intel documentation.
+ void SHUFPS(X64Reg regOp, const OpArg& arg, u8 shuffle);
+ void SHUFPD(X64Reg regOp, const OpArg& arg, u8 shuffle);
+
+ // SSE3
+ void MOVSLDUP(X64Reg regOp, const OpArg& arg);
+ void MOVSHDUP(X64Reg regOp, const OpArg& arg);
+ void MOVDDUP(X64Reg regOp, const OpArg& arg);
+
+ // SSE/SSE2: Useful alternative to shuffle in some cases.
+ void UNPCKLPS(X64Reg dest, const OpArg& src);
+ void UNPCKHPS(X64Reg dest, const OpArg& src);
+ void UNPCKLPD(X64Reg dest, const OpArg& src);
+ void UNPCKHPD(X64Reg dest, const OpArg& src);
+
+ // SSE/SSE2: Compares.
+ void COMISS(X64Reg regOp, const OpArg& arg);
+ void COMISD(X64Reg regOp, const OpArg& arg);
+ void UCOMISS(X64Reg regOp, const OpArg& arg);
+ void UCOMISD(X64Reg regOp, const OpArg& arg);
+
+ // SSE/SSE2: Moves. Use the right data type for your data, in most cases.
+ void MOVAPS(X64Reg regOp, const OpArg& arg);
+ void MOVAPD(X64Reg regOp, const OpArg& arg);
+ void MOVAPS(const OpArg& arg, X64Reg regOp);
+ void MOVAPD(const OpArg& arg, X64Reg regOp);
+
+ void MOVUPS(X64Reg regOp, const OpArg& arg);
+ void MOVUPD(X64Reg regOp, const OpArg& arg);
+ void MOVUPS(const OpArg& arg, X64Reg regOp);
+ void MOVUPD(const OpArg& arg, X64Reg regOp);
+
+ void MOVDQA(X64Reg regOp, const OpArg& arg);
+ void MOVDQA(const OpArg& arg, X64Reg regOp);
+ void MOVDQU(X64Reg regOp, const OpArg& arg);
+ void MOVDQU(const OpArg& arg, X64Reg regOp);
+
+ void MOVSS(X64Reg regOp, const OpArg& arg);
+ void MOVSD(X64Reg regOp, const OpArg& arg);
+ void MOVSS(const OpArg& arg, X64Reg regOp);
+ void MOVSD(const OpArg& arg, X64Reg regOp);
+
+ void MOVLPS(X64Reg regOp, const OpArg& arg);
+ void MOVLPD(X64Reg regOp, const OpArg& arg);
+ void MOVLPS(const OpArg& arg, X64Reg regOp);
+ void MOVLPD(const OpArg& arg, X64Reg regOp);
+
+ void MOVHPS(X64Reg regOp, const OpArg& arg);
+ void MOVHPD(X64Reg regOp, const OpArg& arg);
+ void MOVHPS(const OpArg& arg, X64Reg regOp);
+ void MOVHPD(const OpArg& arg, X64Reg regOp);
+
+ void MOVHLPS(X64Reg regOp1, X64Reg regOp2);
+ void MOVLHPS(X64Reg regOp1, X64Reg regOp2);
+
+ // Be careful when using these overloads for reg <--> xmm moves.
+ // The one you cast to OpArg with R(reg) is the x86 reg, the other
+ // one is the xmm reg.
+ // ie: "MOVD_xmm(eax, R(xmm1))" generates incorrect code (movd xmm0, rcx)
+ // use "MOVD_xmm(R(eax), xmm1)" instead.
+ void MOVD_xmm(X64Reg dest, const OpArg& arg);
+ void MOVQ_xmm(X64Reg dest, OpArg arg);
+ void MOVD_xmm(const OpArg& arg, X64Reg src);
+ void MOVQ_xmm(OpArg arg, X64Reg src);
+
+ // SSE/SSE2: Generates a mask from the high bits of the components of the packed register in
+ // question.
+ void MOVMSKPS(X64Reg dest, const OpArg& arg);
+ void MOVMSKPD(X64Reg dest, const OpArg& arg);
+
+ // SSE2: Selective byte store, mask in src register. EDI/RDI specifies store address. This is a
+ // weird one.
+ void MASKMOVDQU(X64Reg dest, X64Reg src);
+ void LDDQU(X64Reg dest, const OpArg& src);
+
+ // SSE/SSE2: Data type conversions.
+ void CVTPS2PD(X64Reg dest, const OpArg& src);
+ void CVTPD2PS(X64Reg dest, const OpArg& src);
+ void CVTSS2SD(X64Reg dest, const OpArg& src);
+ void CVTSI2SS(X64Reg dest, const OpArg& src);
+ void CVTSD2SS(X64Reg dest, const OpArg& src);
+ void CVTSI2SD(X64Reg dest, const OpArg& src);
+ void CVTDQ2PD(X64Reg regOp, const OpArg& arg);
+ void CVTPD2DQ(X64Reg regOp, const OpArg& arg);
+ void CVTDQ2PS(X64Reg regOp, const OpArg& arg);
+ void CVTPS2DQ(X64Reg regOp, const OpArg& arg);
+
+ void CVTTPS2DQ(X64Reg regOp, const OpArg& arg);
+ void CVTTPD2DQ(X64Reg regOp, const OpArg& arg);
+
+ // Destinations are X64 regs (rax, rbx, ...) for these instructions.
+ void CVTSS2SI(X64Reg xregdest, const OpArg& src);
+ void CVTSD2SI(X64Reg xregdest, const OpArg& src);
+ void CVTTSS2SI(X64Reg xregdest, const OpArg& arg);
+ void CVTTSD2SI(X64Reg xregdest, const OpArg& arg);
+
+ // SSE2: Packed integer instructions
+ void PACKSSDW(X64Reg dest, const OpArg& arg);
+ void PACKSSWB(X64Reg dest, const OpArg& arg);
+ void PACKUSDW(X64Reg dest, const OpArg& arg);
+ void PACKUSWB(X64Reg dest, const OpArg& arg);
+
+ void PUNPCKLBW(X64Reg dest, const OpArg& arg);
+ void PUNPCKLWD(X64Reg dest, const OpArg& arg);
+ void PUNPCKLDQ(X64Reg dest, const OpArg& arg);
+ void PUNPCKLQDQ(X64Reg dest, const OpArg& arg);
+
+ void PTEST(X64Reg dest, const OpArg& arg);
+ void PAND(X64Reg dest, const OpArg& arg);
+ void PANDN(X64Reg dest, const OpArg& arg);
+ void PXOR(X64Reg dest, const OpArg& arg);
+ void POR(X64Reg dest, const OpArg& arg);
+
+ void PADDB(X64Reg dest, const OpArg& arg);
+ void PADDW(X64Reg dest, const OpArg& arg);
+ void PADDD(X64Reg dest, const OpArg& arg);
+ void PADDQ(X64Reg dest, const OpArg& arg);
+
+ void PADDSB(X64Reg dest, const OpArg& arg);
+ void PADDSW(X64Reg dest, const OpArg& arg);
+ void PADDUSB(X64Reg dest, const OpArg& arg);
+ void PADDUSW(X64Reg dest, const OpArg& arg);
+
+ void PSUBB(X64Reg dest, const OpArg& arg);
+ void PSUBW(X64Reg dest, const OpArg& arg);
+ void PSUBD(X64Reg dest, const OpArg& arg);
+ void PSUBQ(X64Reg dest, const OpArg& arg);
+
+ void PSUBSB(X64Reg dest, const OpArg& arg);
+ void PSUBSW(X64Reg dest, const OpArg& arg);
+ void PSUBUSB(X64Reg dest, const OpArg& arg);
+ void PSUBUSW(X64Reg dest, const OpArg& arg);
+
+ void PAVGB(X64Reg dest, const OpArg& arg);
+ void PAVGW(X64Reg dest, const OpArg& arg);
+
+ void PCMPEQB(X64Reg dest, const OpArg& arg);
+ void PCMPEQW(X64Reg dest, const OpArg& arg);
+ void PCMPEQD(X64Reg dest, const OpArg& arg);
+
+ void PCMPGTB(X64Reg dest, const OpArg& arg);
+ void PCMPGTW(X64Reg dest, const OpArg& arg);
+ void PCMPGTD(X64Reg dest, const OpArg& arg);
+
+ void PEXTRW(X64Reg dest, const OpArg& arg, u8 subreg);
+ void PINSRW(X64Reg dest, const OpArg& arg, u8 subreg);
+ void PINSRD(X64Reg dest, const OpArg& arg, u8 subreg);
+
+ void PMADDWD(X64Reg dest, const OpArg& arg);
+ void PSADBW(X64Reg dest, const OpArg& arg);
+
+ void PMAXSW(X64Reg dest, const OpArg& arg);
+ void PMAXUB(X64Reg dest, const OpArg& arg);
+ void PMINSW(X64Reg dest, const OpArg& arg);
+ void PMINUB(X64Reg dest, const OpArg& arg);
+
+ void PMOVMSKB(X64Reg dest, const OpArg& arg);
+ void PSHUFD(X64Reg dest, const OpArg& arg, u8 shuffle);
+ void PSHUFB(X64Reg dest, const OpArg& arg);
+
+ void PSHUFLW(X64Reg dest, const OpArg& arg, u8 shuffle);
+ void PSHUFHW(X64Reg dest, const OpArg& arg, u8 shuffle);
+
+ void PSRLW(X64Reg reg, int shift);
+ void PSRLD(X64Reg reg, int shift);
+ void PSRLQ(X64Reg reg, int shift);
+ void PSRLQ(X64Reg reg, const OpArg& arg);
+ void PSRLDQ(X64Reg reg, int shift);
+
+ void PSLLW(X64Reg reg, int shift);
+ void PSLLD(X64Reg reg, int shift);
+ void PSLLQ(X64Reg reg, int shift);
+ void PSLLDQ(X64Reg reg, int shift);
+
+ void PSRAW(X64Reg reg, int shift);
+ void PSRAD(X64Reg reg, int shift);
+
+ // SSE4: data type conversions
+ void PMOVSXBW(X64Reg dest, const OpArg& arg);
+ void PMOVSXBD(X64Reg dest, const OpArg& arg);
+ void PMOVSXBQ(X64Reg dest, const OpArg& arg);
+ void PMOVSXWD(X64Reg dest, const OpArg& arg);
+ void PMOVSXWQ(X64Reg dest, const OpArg& arg);
+ void PMOVSXDQ(X64Reg dest, const OpArg& arg);
+ void PMOVZXBW(X64Reg dest, const OpArg& arg);
+ void PMOVZXBD(X64Reg dest, const OpArg& arg);
+ void PMOVZXBQ(X64Reg dest, const OpArg& arg);
+ void PMOVZXWD(X64Reg dest, const OpArg& arg);
+ void PMOVZXWQ(X64Reg dest, const OpArg& arg);
+ void PMOVZXDQ(X64Reg dest, const OpArg& arg);
+
+ // SSE4: blend instructions
+ void PBLENDVB(X64Reg dest, const OpArg& arg);
+ void BLENDVPS(X64Reg dest, const OpArg& arg);
+ void BLENDVPD(X64Reg dest, const OpArg& arg);
+ void BLENDPS(X64Reg dest, const OpArg& arg, u8 blend);
+ void BLENDPD(X64Reg dest, const OpArg& arg, u8 blend);
+
+ // AVX
+ void VADDSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VSUBSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VMULSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VDIVSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VADDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VSUBPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VMULPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VDIVPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VADDSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VSUBSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VMULSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VDIVSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VADDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VSUBPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VMULPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VDIVPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VSQRTSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VCMPPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 compare);
+ void VSHUFPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 shuffle);
+ void VSHUFPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 shuffle);
+ void VUNPCKLPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VBLENDVPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, X64Reg mask);
+ void VBLENDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 blend);
+ void VBLENDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg, u8 blend);
+
+ void VANDPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VANDPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VANDNPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VANDNPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VORPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VORPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VXORPS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VXORPD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+
+ void VPAND(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VPANDN(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VPOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VPXOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+
+ // FMA3
+ void VFMADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMADD132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMADD213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMADD231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMADD132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMADD213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMADD231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUB132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUB213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUB231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUB132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUB213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUB231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMADD132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMADD213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMADD231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMADD132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMADD213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMADD231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMSUB132SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMSUB213SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMSUB231SS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMSUB132SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMSUB213SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFNMSUB231SD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMADDSUB132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMADDSUB213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMADDSUB231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMADDSUB132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMADDSUB213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMADDSUB231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUBADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUBADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUBADD231PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+
+#define FMA4(name) \
+ void name(X64Reg dest, X64Reg regOp1, X64Reg regOp2, const OpArg& arg); \
+ void name(X64Reg dest, X64Reg regOp1, const OpArg& arg, X64Reg regOp2);
+
+ FMA4(VFMADDSUBPS)
+ FMA4(VFMADDSUBPD)
+ FMA4(VFMSUBADDPS)
+ FMA4(VFMSUBADDPD)
+ FMA4(VFMADDPS)
+ FMA4(VFMADDPD)
+ FMA4(VFMADDSS)
+ FMA4(VFMADDSD)
+ FMA4(VFMSUBPS)
+ FMA4(VFMSUBPD)
+ FMA4(VFMSUBSS)
+ FMA4(VFMSUBSD)
+ FMA4(VFNMADDPS)
+ FMA4(VFNMADDPD)
+ FMA4(VFNMADDSS)
+ FMA4(VFNMADDSD)
+ FMA4(VFNMSUBPS)
+ FMA4(VFNMSUBPD)
+ FMA4(VFNMSUBSS)
+ FMA4(VFNMSUBSD)
+#undef FMA4
+
+ // VEX GPR instructions
+ void SARX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2);
+ void SHLX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2);
+ void SHRX(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2);
+ void RORX(int bits, X64Reg regOp, const OpArg& arg, u8 rotate);
+ void PEXT(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void PDEP(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void MULX(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+ void BZHI(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2);
+ void BLSR(int bits, X64Reg regOp, const OpArg& arg);
+ void BLSMSK(int bits, X64Reg regOp, const OpArg& arg);
+ void BLSI(int bits, X64Reg regOp, const OpArg& arg);
+ void BEXTR(int bits, X64Reg regOp1, const OpArg& arg, X64Reg regOp2);
+ void ANDN(int bits, X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
+
+ void RDTSC();
+
+ // Utility functions
+ // The difference between this and CALL is that this aligns the stack
+ // where appropriate.
+ template <typename FunctionPointer>
+ void ABI_CallFunction(FunctionPointer func)
+ {
+ static_assert(std::is_pointer<FunctionPointer>() &&
+ std::is_function<std::remove_pointer_t<FunctionPointer>>(),
+ "Supplied type must be a function pointer.");
+
+ const void* ptr = reinterpret_cast<const void*>(func);
+ const u64 address = reinterpret_cast<u64>(ptr);
+ const u64 distance = address - (reinterpret_cast<u64>(code) + 5);
+
+ if (distance >= 0x0000000080000000ULL && distance < 0xFFFFFFFF80000000ULL)
+ {
+ // Far call
+ MOV(64, R(RAX), Imm64(address));
+ CALLptr(R(RAX));
+ }
+ else
+ {
+ CALL(ptr);
+ }
+ }
+
+ template <typename FunctionPointer>
+ void ABI_CallFunctionC16(FunctionPointer func, u16 param1)
+ {
+ MOV(32, R(ABI_PARAM1), Imm32(param1));
+ ABI_CallFunction(func);
+ }
+
+ template <typename FunctionPointer>
+ void ABI_CallFunctionCC16(FunctionPointer func, u32 param1, u16 param2)
+ {
+ MOV(32, R(ABI_PARAM1), Imm32(param1));
+ MOV(32, R(ABI_PARAM2), Imm32(param2));
+ ABI_CallFunction(func);
+ }
+
+ template <typename FunctionPointer>
+ void ABI_CallFunctionC(FunctionPointer func, u32 param1)
+ {
+ MOV(32, R(ABI_PARAM1), Imm32(param1));
+ ABI_CallFunction(func);
+ }
+
+ template <typename FunctionPointer>
+ void ABI_CallFunctionCC(FunctionPointer func, u32 param1, u32 param2)
+ {
+ MOV(32, R(ABI_PARAM1), Imm32(param1));
+ MOV(32, R(ABI_PARAM2), Imm32(param2));
+ ABI_CallFunction(func);
+ }
+
+ template <typename FunctionPointer>
+ void ABI_CallFunctionCP(FunctionPointer func, u32 param1, const void* param2)
+ {
+ MOV(32, R(ABI_PARAM1), Imm32(param1));
+ MOV(64, R(ABI_PARAM2), Imm64(reinterpret_cast<u64>(param2)));
+ ABI_CallFunction(func);
+ }
+
+ template <typename FunctionPointer>
+ void ABI_CallFunctionCCC(FunctionPointer func, u32 param1, u32 param2, u32 param3)
+ {
+ MOV(32, R(ABI_PARAM1), Imm32(param1));
+ MOV(32, R(ABI_PARAM2), Imm32(param2));
+ MOV(32, R(ABI_PARAM3), Imm32(param3));
+ ABI_CallFunction(func);
+ }
+
+ template <typename FunctionPointer>
+ void ABI_CallFunctionCCP(FunctionPointer func, u32 param1, u32 param2, const void* param3)
+ {
+ MOV(32, R(ABI_PARAM1), Imm32(param1));
+ MOV(32, R(ABI_PARAM2), Imm32(param2));
+ MOV(64, R(ABI_PARAM3), Imm64(reinterpret_cast<u64>(param3)));
+ ABI_CallFunction(func);
+ }
+
+ template <typename FunctionPointer>
+ void ABI_CallFunctionCCCP(FunctionPointer func, u32 param1, u32 param2, u32 param3,
+ const void* param4)
+ {
+ MOV(32, R(ABI_PARAM1), Imm32(param1));
+ MOV(32, R(ABI_PARAM2), Imm32(param2));
+ MOV(32, R(ABI_PARAM3), Imm32(param3));
+ MOV(64, R(ABI_PARAM4), Imm64(reinterpret_cast<u64>(param4)));
+ ABI_CallFunction(func);
+ }
+
+ template <typename FunctionPointer>
+ void ABI_CallFunctionPC(FunctionPointer func, const void* param1, u32 param2)
+ {
+ MOV(64, R(ABI_PARAM1), Imm64(reinterpret_cast<u64>(param1)));
+ MOV(32, R(ABI_PARAM2), Imm32(param2));
+ ABI_CallFunction(func);
+ }
+
+ template <typename FunctionPointer>
+ void ABI_CallFunctionPPC(FunctionPointer func, const void* param1, const void* param2, u32 param3)
+ {
+ MOV(64, R(ABI_PARAM1), Imm64(reinterpret_cast<u64>(param1)));
+ MOV(64, R(ABI_PARAM2), Imm64(reinterpret_cast<u64>(param2)));
+ MOV(32, R(ABI_PARAM3), Imm32(param3));
+ ABI_CallFunction(func);
+ }
+
+ // Pass a register as a parameter.
+ template <typename FunctionPointer>
+ void ABI_CallFunctionR(FunctionPointer func, X64Reg reg1)
+ {
+ if (reg1 != ABI_PARAM1)
+ MOV(32, R(ABI_PARAM1), R(reg1));
+ ABI_CallFunction(func);
+ }
+
+ // Pass two registers as parameters.
+ template <typename FunctionPointer>
+ void ABI_CallFunctionRR(FunctionPointer func, X64Reg reg1, X64Reg reg2)
+ {
+ MOVTwo(64, ABI_PARAM1, reg1, 0, ABI_PARAM2, reg2);
+ ABI_CallFunction(func);
+ }
+
+ template <typename FunctionPointer>
+ void ABI_CallFunctionAC(int bits, FunctionPointer func, const Gen::OpArg& arg1, u32 param2)
+ {
+ if (!arg1.IsSimpleReg(ABI_PARAM1))
+ MOV(bits, R(ABI_PARAM1), arg1);
+ MOV(32, R(ABI_PARAM2), Imm32(param2));
+ ABI_CallFunction(func);
+ }
+
+ template <typename FunctionPointer>
+ void ABI_CallFunctionA(int bits, FunctionPointer func, const Gen::OpArg& arg1)
+ {
+ if (!arg1.IsSimpleReg(ABI_PARAM1))
+ MOV(bits, R(ABI_PARAM1), arg1);
+ ABI_CallFunction(func);
+ }
+
+ // Helper method for ABI functions related to calling functions. May be used by itself as well.
+ void MOVTwo(int bits, X64Reg dst1, X64Reg src1, s32 offset, X64Reg dst2, X64Reg src2);
+
+ // Saves/restores the registers and adjusts the stack to be aligned as
+ // required by the ABI, where the previous alignment was as specified.
+ // Push returns the size of the shadow space, i.e. the offset of the frame.
+ size_t ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment,
+ size_t needed_frame_size = 0);
+ void ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment,
+ size_t needed_frame_size = 0);
+
+ // Utility to generate a call to a std::function object.
+ //
+ // Unfortunately, calling operator() directly is undefined behavior in C++
+ // (this method might be a thunk in the case of multi-inheritance) so we
+ // have to go through a trampoline function.
+ template <typename T, typename... Args>
+ static T CallLambdaTrampoline(const std::function<T(Args...)>* f, Args... args)
+ {
+ return (*f)(args...);
+ }
+
+ template <typename T, typename... Args>
+ void ABI_CallLambdaC(const std::function<T(Args...)>* f, u32 p1)
+ {
+ auto trampoline = &XEmitter::CallLambdaTrampoline<T, Args...>;
+ ABI_CallFunctionPC(trampoline, reinterpret_cast<const void*>(f), p1);
+ }
+}; // class XEmitter
+
+} // namespace
diff --git a/src/dolphin/x64Reg.h b/src/dolphin/x64Reg.h
new file mode 100644
index 0000000..a92e024
--- /dev/null
+++ b/src/dolphin/x64Reg.h
@@ -0,0 +1,96 @@
+// Copyright 2016 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license_dolphin.txt file included.
+
+#pragma once
+
+namespace Gen
+{
+enum X64Reg
+{
+ EAX = 0,
+ EBX = 3,
+ ECX = 1,
+ EDX = 2,
+ ESI = 6,
+ EDI = 7,
+ EBP = 5,
+ ESP = 4,
+
+ RAX = 0,
+ RBX = 3,
+ RCX = 1,
+ RDX = 2,
+ RSI = 6,
+ RDI = 7,
+ RBP = 5,
+ RSP = 4,
+ R8 = 8,
+ R9 = 9,
+ R10 = 10,
+ R11 = 11,
+ R12 = 12,
+ R13 = 13,
+ R14 = 14,
+ R15 = 15,
+
+ AL = 0,
+ BL = 3,
+ CL = 1,
+ DL = 2,
+ SIL = 6,
+ DIL = 7,
+ BPL = 5,
+ SPL = 4,
+ AH = 0x104,
+ BH = 0x107,
+ CH = 0x105,
+ DH = 0x106,
+
+ AX = 0,
+ BX = 3,
+ CX = 1,
+ DX = 2,
+ SI = 6,
+ DI = 7,
+ BP = 5,
+ SP = 4,
+
+ XMM0 = 0,
+ XMM1,
+ XMM2,
+ XMM3,
+ XMM4,
+ XMM5,
+ XMM6,
+ XMM7,
+ XMM8,
+ XMM9,
+ XMM10,
+ XMM11,
+ XMM12,
+ XMM13,
+ XMM14,
+ XMM15,
+
+ YMM0 = 0,
+ YMM1,
+ YMM2,
+ YMM3,
+ YMM4,
+ YMM5,
+ YMM6,
+ YMM7,
+ YMM8,
+ YMM9,
+ YMM10,
+ YMM11,
+ YMM12,
+ YMM13,
+ YMM14,
+ YMM15,
+
+ INVALID_REG = 0xFFFFFFFF
+};
+
+} // namespace Gen