#include "ARMJIT_Compiler.h"

#include "../ARMInterpreter.h"
#include "../Config.h"

#include <assert.h>

#include "../dolphin/CommonFuncs.h"

#ifdef _WIN32
#include <windows.h>
#else
#include <sys/mman.h>
#include <unistd.h>
#endif

using namespace Gen;

extern "C" void ARM_Ret();

namespace ARMJIT
{
template <>
const X64Reg RegisterCache<Compiler, X64Reg>::NativeRegAllocOrder[] =
{
#ifdef _WIN32
    RBX, RSI, RDI, R12, R13, R14, // callee saved
    R10, R11, // caller saved
#else
    RBX, R12, R13, R14, // callee saved, this is sad
    R9, R10, R11, // caller saved
#endif
};
template <>
const int RegisterCache<Compiler, X64Reg>::NativeRegsAvailable =
#ifdef _WIN32
    8
#else
    7
#endif
;

#ifdef _WIN32
const BitSet32 CallerSavedPushRegs({R10, R11});
#else
const BitSet32 CallerSavedPushRegs({R9, R10, R11});
#endif

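// Write cached guest registers living in caller-saved host registers back to the
// ARM state before calling into C code. With saveHiRegs, the loaded hi registers
// r8-r14 (mask 0x7F00) are written back too, since a mode switch may bank them.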
void Compiler::PushRegs(bool saveHiRegs)
{
    BitSet32 loadedRegs(RegCache.LoadedRegs);

    if (saveHiRegs)
    {
        BitSet32 hiRegsLoaded(RegCache.LoadedRegs & 0x7F00);
        for (int reg : hiRegsLoaded)
        {
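            // an unconditional instruction will definitely switch modes, so the
            // register can be written back and dropped; a conditional one might
            // not, so only write it back and keep the mapping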
            if (Thumb || CurInstr.Cond() == 0xE)
                RegCache.UnloadRegister(reg);
            else
                SaveReg(reg, RegCache.Mapping[reg]);
            // prevent saving the register twice
            loadedRegs[reg] = false;
        }
    }

    for (int reg : loadedRegs)
        if (BitSet32(1 << RegCache.Mapping[reg]) & ABI_ALL_CALLER_SAVED)
            SaveReg(reg, RegCache.Mapping[reg]);
}

void Compiler::PopRegs(bool saveHiRegs)
{
    BitSet32 loadedRegs(RegCache.LoadedRegs);
    for (int reg : loadedRegs)
    {
        if ((saveHiRegs && reg >= 8 && reg < 15)
            || BitSet32(1 << RegCache.Mapping[reg]) & ABI_ALL_CALLER_SAVED)
        {
            LoadReg(reg, RegCache.Mapping[reg]);
        }
    }
}

void Compiler::A_Comp_MRS()
{
    Comp_AddCycles_C();

    OpArg rd = MapReg(CurInstr.A_Reg(12));

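    // bit 22: read the SPSR (banked slot 15 - 8 = 7) instead of the CPSR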
    if (CurInstr.Instr & (1 << 22))
    {
        MOV(32, R(RSCRATCH), R(RCPSR));
        AND(32, R(RSCRATCH), Imm8(0x1F));
        XOR(32, R(RSCRATCH3), R(RSCRATCH3));
        MOV(32, R(RSCRATCH2), Imm32(15 - 8));
        CALL(ReadBanked);
        MOV(32, rd, R(RSCRATCH3));
    }
    else
    {
        MOV(32, rd, R(RCPSR));
    }
}

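// a plain function gives the generated code a stable address to CALL ARM::UpdateMode through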
void UpdateModeTrampoline(ARM* arm, u32 oldmode, u32 newmode)
{
    arm->UpdateMode(oldmode, newmode);
}

void Compiler::A_Comp_MSR()
{
    Comp_AddCycles_C();

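    // bit 25: immediate operand, an 8-bit value rotated right by twice the 4-bit rotate field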
    OpArg val = CurInstr.Instr & (1 << 25)
        ? Imm32(::ROR((CurInstr.Instr & 0xFF), ((CurInstr.Instr >> 7) & 0x1E)))
        : MapReg(CurInstr.A_Reg(0));

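    // field mask: bits 16-19 of the instruction select the c/x/s/f bytes of the PSR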
    u32 mask = 0;
    if (CurInstr.Instr & (1<<16)) mask |= 0x000000FF;
    if (CurInstr.Instr & (1<<17)) mask |= 0x0000FF00;
    if (CurInstr.Instr & (1<<18)) mask |= 0x00FF0000;
    if (CurInstr.Instr & (1<<19)) mask |= 0xFF000000;

    if (CurInstr.Instr & (1 << 22))
    {
        MOV(32, R(RSCRATCH), R(RCPSR));
        AND(32, R(RSCRATCH), Imm8(0x1F));
        XOR(32, R(RSCRATCH3), R(RSCRATCH3));
        MOV(32, R(RSCRATCH2), Imm32(15 - 8));
        CALL(ReadBanked);

        MOV(32, R(RSCRATCH2), Imm32(mask));
        MOV(32, R(RSCRATCH4), R(RSCRATCH2));
        AND(32, R(RSCRATCH4), Imm32(0xFFFFFF00));
        MOV(32, R(RSCRATCH), R(RCPSR));
        AND(32, R(RSCRATCH), Imm8(0x1F));
        CMP(32, R(RSCRATCH), Imm8(0x10));
        CMOVcc(32, RSCRATCH2, R(RSCRATCH4), CC_E);

        MOV(32, R(RSCRATCH4), R(RSCRATCH2));
        NOT(32, R(RSCRATCH4));
        AND(32, R(RSCRATCH3), R(RSCRATCH4));

        AND(32, R(RSCRATCH2), val);
        OR(32, R(RSCRATCH3), R(RSCRATCH2));

        MOV(32, R(RSCRATCH2), Imm32(15 - 8));
        CALL(WriteBanked);
    }
    else
    {
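        // bit 5 is the Thumb bit, which MSR must not change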
        mask &= 0xFFFFFFDF;
        CPSRDirty = true;

        if ((mask & 0xFF) == 0)
        {
            AND(32, R(RCPSR), Imm32(~mask));
            if (!val.IsImm())
            {
                MOV(32, R(RSCRATCH), val);
                AND(32, R(RSCRATCH), Imm32(mask));
                OR(32, R(RCPSR), R(RSCRATCH));
            }
            else
            {
                OR(32, R(RCPSR), Imm32(val.Imm32() & mask));
            }
        }
        else
        {
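            // in user mode (0x10) the control byte (bits 0-7) is read-only,
            // so restrict the mask to the upper bits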
            MOV(32, R(RSCRATCH2), Imm32(mask));
            MOV(32, R(RSCRATCH3), R(RSCRATCH2));
            AND(32, R(RSCRATCH3), Imm32(0xFFFFFF00));
            MOV(32, R(RSCRATCH), R(RCPSR));
            AND(32, R(RSCRATCH), Imm8(0x1F));
            CMP(32, R(RSCRATCH), Imm8(0x10));
            CMOVcc(32, RSCRATCH2, R(RSCRATCH3), CC_E);

            MOV(32, R(RSCRATCH3), R(RCPSR));

            // I need you ANDN
            MOV(32, R(RSCRATCH), R(RSCRATCH2));
            NOT(32, R(RSCRATCH));
            AND(32, R(RCPSR), R(RSCRATCH));

            AND(32, R(RSCRATCH2), val);
            OR(32, R(RCPSR), R(RSCRATCH2));

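            // the mode bits may have changed, so let the CPU swap in the banked registers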
            PushRegs(true);

            MOV(32, R(ABI_PARAM3), R(RCPSR));
            MOV(32, R(ABI_PARAM2), R(RSCRATCH3));
            MOV(64, R(ABI_PARAM1), R(RCPU));
            CALL((void*)&UpdateModeTrampoline);

            PopRegs(true);
        }
    }
}

// we'll repurpose this .bss memory as the JIT code buffer
u8 CodeMemory[1024 * 1024 * 32];

Compiler::Compiler()
{
    {
    #ifdef _WIN32
        SYSTEM_INFO sysInfo;
        GetSystemInfo(&sysInfo);

        u64 pageSize = (u64)sysInfo.dwPageSize;
    #else
        u64 pageSize = sysconf(_SC_PAGE_SIZE);
    #endif

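        // advance the buffer start to the next page boundary and pull the end back
        // to one, so the RWX protection only ever covers whole pages inside CodeMemory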
        u8* pageAligned = (u8*)(((u64)CodeMemory & ~(pageSize - 1)) + pageSize);
        u64 alignedSize = (((u64)CodeMemory + sizeof(CodeMemory)) & ~(pageSize - 1)) - (u64)pageAligned;

    #ifdef _WIN32
        DWORD dummy;
        VirtualProtect(pageAligned, alignedSize, PAGE_EXECUTE_READWRITE, &dummy);
    #elif defined(__APPLE__)
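        // changing the protection of static storage tends to be refused on macOS,
        // so map a fresh RWX region instead (CodeMemory goes unused there)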
        pageAligned = (u8*)mmap(NULL, 1024*1024*32, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    #else
        mprotect(pageAligned, alignedSize, PROT_EXEC | PROT_READ | PROT_WRITE);
    #endif

        ResetStart = pageAligned;
        CodeMemSize = alignedSize;
    }

    Reset();

    {
        // in:  RSCRATCH  - mode
        //      RSCRATCH2 - reg number (minus 8)
        //      RSCRATCH3 - the register's value in the current mode
        // out: RSCRATCH3 - the banked value (left unchanged if the reg isn't banked)
        ReadBanked = (void*)GetWritableCodePtr();
        CMP(32, R(RSCRATCH), Imm8(0x11));
        FixupBranch fiq = J_CC(CC_E);
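        // modes other than FIQ only bank r13/r14 (+ SPSR); re-base the index to r13,
        // a negative result means the register isn't banked at all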
        SUB(32, R(RSCRATCH2), Imm8(13 - 8));
        FixupBranch notEverything = J_CC(CC_L);
        CMP(32, R(RSCRATCH), Imm8(0x12));
        FixupBranch irq = J_CC(CC_E);
        CMP(32, R(RSCRATCH), Imm8(0x13));
        FixupBranch svc = J_CC(CC_E);
        CMP(32, R(RSCRATCH), Imm8(0x17));
        FixupBranch abt = J_CC(CC_E);
        CMP(32, R(RSCRATCH), Imm8(0x1B));
        FixupBranch und = J_CC(CC_E);
        SetJumpTarget(notEverything);
        RET();

        SetJumpTarget(fiq);
        MOV(32, R(RSCRATCH3), MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_FIQ)));
        RET();
        SetJumpTarget(irq);
        MOV(32, R(RSCRATCH3), MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_IRQ)));
        RET();
        SetJumpTarget(svc);
        MOV(32, R(RSCRATCH3), MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_SVC)));
        RET();
        SetJumpTarget(abt);
        MOV(32, R(RSCRATCH3), MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_ABT)));
        RET();
        SetJumpTarget(und);
        MOV(32, R(RSCRATCH3), MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_UND)));
        RET();
    }
    {
        // in:  RSCRATCH  - mode
        //      RSCRATCH2 - reg number (minus 8)
        //      RSCRATCH3 - value to write
        // out: carry flag set if the register isn't banked (the caller must store it itself)
        WriteBanked = (void*)GetWritableCodePtr();
        CMP(32, R(RSCRATCH), Imm8(0x11));
        FixupBranch fiq = J_CC(CC_E);
        SUB(32, R(RSCRATCH2), Imm8(13 - 8));
        FixupBranch notEverything = J_CC(CC_L);
        CMP(32, R(RSCRATCH), Imm8(0x12));
        FixupBranch irq = J_CC(CC_E);
        CMP(32, R(RSCRATCH), Imm8(0x13));
        FixupBranch svc = J_CC(CC_E);
        CMP(32, R(RSCRATCH), Imm8(0x17));
        FixupBranch abt = J_CC(CC_E);
        CMP(32, R(RSCRATCH), Imm8(0x1B));
        FixupBranch und = J_CC(CC_E);
        SetJumpTarget(notEverything);
        STC();
        RET();

        SetJumpTarget(fiq);
        MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_FIQ)), R(RSCRATCH3));
        CLC();
        RET();
        SetJumpTarget(irq);
        MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_IRQ)), R(RSCRATCH3));
        CLC();
        RET();
        SetJumpTarget(svc);
        MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_SVC)), R(RSCRATCH3));
        CLC();
        RET();
        SetJumpTarget(abt);
        MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_ABT)), R(RSCRATCH3));
        CLC();
        RET();
        SetJumpTarget(und);
        MOV(32, MComplex(RCPU, RSCRATCH2, SCALE_4, offsetof(ARM, R_UND)), R(RSCRATCH3));
        CLC();
        RET();
    }

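    // generate the slow-path memory thunks that patched fast-path loads/stores
    // fall back to, indexed by console type (0 = DS, 1 = DSi), CPU (0 = ARM9,
    // 1 = ARM7), access size (8 << size bits) and the host register holding rd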
    for (int consoleType = 0; consoleType < 2; consoleType++)
    {
        for (int num = 0; num < 2; num++)
        {
            for (int size = 0; size < 3; size++)
            {
                for (int reg = 0; reg < 16; reg++)
                {
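                    // these host registers get clobbered while the call below is
                    // set up, so rd can never live in them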
                    if (reg == RSCRATCH || reg == ABI_PARAM1 || reg == ABI_PARAM2)
                    {
                        PatchedStoreFuncs[consoleType][num][size][reg] = NULL;
                        PatchedLoadFuncs[consoleType][num][size][0][reg] = NULL;
                        PatchedLoadFuncs[consoleType][num][size][1][reg] = NULL;
                        continue;
                    }

                    X64Reg rdMapped = (X64Reg)reg;
                    PatchedStoreFuncs[consoleType][num][size][reg] = GetWritableCodePtr();
                    if (RSCRATCH3 != ABI_PARAM1)
                        MOV(32, R(ABI_PARAM1), R(RSCRATCH3));
                    if (num == 0)
                    {
                        MOV(64, R(ABI_PARAM2), R(RCPU));
                        if (rdMapped != ABI_PARAM3)
                            MOV(32, R(ABI_PARAM3), R(rdMapped));
                    }
                    else
                    {
                        MOV(32, R(ABI_PARAM2), R(rdMapped));
                    }
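                    // rsp is 8 bytes off 16-byte alignment here because the thunk is entered via CALL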
                    ABI_PushRegistersAndAdjustStack(CallerSavedPushRegs, 8);
                    if (consoleType == 0)
                    {
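                        // key: access size in bits | CPU, e.g. 32 = 32-bit ARM9 write, 33 = 32-bit ARM7 write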
                        switch ((8 << size) | num)
                        {
                        case 32: ABI_CallFunction(SlowWrite9<u32, 0>); break;
                        case 33: ABI_CallFunction(SlowWrite7<u32, 0>); break;
                        case 16: ABI_CallFunction(SlowWrite9<u16, 0>); break;
                        case 17: ABI_CallFunction(SlowWrite7<u16, 0>); break;
                        case 8: ABI_CallFunction(SlowWrite9<u8, 0>); break;
                        case 9: ABI_CallFunction(SlowWrite7<u8, 0>); break;
                        }
                    }
                    else
                    {
                        switch ((8 << size) | num)
                        {
                        case 32: ABI_CallFunction(SlowWrite9<u32, 1>); break;
                        case 33: ABI_CallFunction(SlowWrite7<u32, 1>); break;
                        case 16: ABI_CallFunction(SlowWrite9<u16, 1>); break;
                        case 17: ABI_CallFunction(SlowWrite7<u16, 1>); break;
                        case 8: ABI_CallFunction(SlowWrite9<u8, 1>); break;
                        case 9: ABI_CallFunction(SlowWrite7<u8, 1>); break;
                        }
                    }
                    ABI_PopRegistersAndAdjustStack(CallerSavedPushRegs, 8);
                    RET();

                    for (int signextend = 0; signextend < 2; signextend++)
                    {
                        PatchedLoadFuncs[consoleType][num][size][signextend][reg] = GetWritableCodePtr();
                        if (RSCRATCH3 != ABI_PARAM1)
                            MOV(32, R(ABI_PARAM1), R(RSCRATCH3));
                        if (num == 0)
                            MOV(64, R(ABI_PARAM2), R(RCPU));
                        ABI_PushRegistersAndAdjustStack(CallerSavedPushRegs, 8);
                        if (consoleType == 0)
                        {
                            switch ((8 << size) | num)
                            {
                            case 32: ABI_CallFunction(SlowRead9<u32, 0>); break;
                            case 33: ABI_CallFunction(SlowRead7<u32, 0>); break;
                            case 16: ABI_CallFunction(SlowRead9<u16, 0>); break;
                            case 17: ABI_CallFunction(SlowRead7<u16, 0>); break;
                            case 8: ABI_CallFunction(SlowRead9<u8, 0>); break;
                            case 9: ABI_CallFunction(SlowRead7<u8, 0>); break;
                            }
                        }
                        else
                        {
                            switch ((8 << size) | num)
                            {
                            case 32: ABI_CallFunction(SlowRead9<u32, 1>); break;
                            case 33: ABI_CallFunction(SlowRead7<u32, 1>); break;
                            case 16: ABI_CallFunction(SlowRead9<u16, 1>); break;
                            case 17: ABI_CallFunction(SlowRead7<u16, 1>); break;
                            case 8: ABI_CallFunction(SlowRead9<u8, 1>); break;
                            case 9: ABI_CallFunction(SlowRead7<u8, 1>); break;
                            }
                        }
                        ABI_PopRegistersAndAdjustStack(CallerSavedPushRegs, 8);
                        if (signextend)
                            MOVSX(32, 8 << size, rdMapped, R(RSCRATCH));
                        else
                            MOVZX(32, 8 << size, rdMapped, R(RSCRATCH));
                        RET();
                    }
                }
            }
        }
    }

    // move the region forward to prevent overwriting the generated functions
    CodeMemSize -= GetWritableCodePtr() - ResetStart;
    ResetStart = GetWritableCodePtr();

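    // block code is emitted into the near region; rarely executed paths go into
    // the far region to keep the hot code dense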
    NearStart = ResetStart;
    FarStart = ResetStart + 1024*1024*24;

    NearSize = FarStart - ResetStart;
    FarSize = (ResetStart + CodeMemSize) - FarStart;
}

void Compiler::LoadCPSR()
{
    assert(!CPSRDirty);

    MOV(32, R(RCPSR), MDisp(RCPU, offsetof(ARM, CPSR)));
}

void Compiler::SaveCPSR(bool flagClean)
{
    if (CPSRDirty)
    {
        MOV(32, MDisp(RCPU, offsetof(ARM, CPSR)), R(RCPSR));
        if (flagClean)
            CPSRDirty = false;
    }
}

void Compiler::LoadReg(int reg, X64Reg nativeReg)
{
    if (reg != 15)
        MOV(32, R(nativeReg), MDisp(RCPU, offsetof(ARM, R[reg])));
    else
        MOV(32, R(nativeReg), Imm32(R15));
}

void Compiler::SaveReg(int reg, X64Reg nativeReg)
{
    MOV(32, MDisp(RCPU, offsetof(ARM, R[reg])), R(nativeReg));
}

// invalidates RSCRATCH and RSCRATCH3
Gen::FixupBranch Compiler::CheckCondition(u32 cond)
{
    // hack: conditionally skipped ldm/stm bodies can get really big, so force a 32-bit jump displacement. TODO: make this better
    bool ldmStm = !Thumb &&
        (CurInstr.Info.Kind == ARMInstrInfo::ak_LDM || CurInstr.Info.Kind == ARMInstrInfo::ak_STM);
    if (cond >= 0x8)
    {
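        // conditions 8-14 depend on several flags: shift the NZCV nibble out of
        // the CPSR and test it against a 16-entry truth table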
        static_assert(RSCRATCH3 == ECX, "RSCRATCH3 has to be equal to ECX!"); // SHL by a register requires the count in CL
        MOV(32, R(RSCRATCH3), R(RCPSR));
        SHR(32, R(RSCRATCH3), Imm8(28));
        MOV(32, R(RSCRATCH), Imm32(1));
        SHL(32, R(RSCRATCH), R(RSCRATCH3));
        TEST(32, R(RSCRATCH), Imm32(ARM::ConditionTable[cond]));

        return J_CC(CC_Z, ldmStm);
    }
    else
    {
        // could have used a LUT, but then where would be the fun?
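        // conditions 0-7 each test a single flag: EQ/NE -> Z (bit 30), CS/CC -> C (29),
        // MI/PL -> N (31), VS/VC -> V (28); the expression computes that bit index
        // and the low bit of cond selects the negated variant (CC_NZ below)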
        TEST(32, R(RCPSR), Imm32(1 << (28 + ((~(cond >> 1) & 1) << 1 | (cond >> 2 & 1) ^ (cond >> 1 & 1)))));

        return J_CC(cond & 1 ? CC_NZ : CC_Z, ldmStm);
    }
}

#define F(x) &Compiler::x
const Compiler::CompileFunc A_Comp[ARMInstrInfo::ak_Count] =
{
    // AND
    F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith),
    F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith),
    // EOR
    F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith),
    F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith),
    // SUB
    F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith),
    F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith),
    // RSB
    F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith),
    F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith),
    // ADD
    F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith),
    F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith),
    // ADC
    F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith),
    F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith),
    // SBC
    F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith),
    F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith),
    // RSC
    F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith),
    F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith),
    // ORR
    F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith),
    F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith),
    // MOV
    F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp),
    F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp),
    // BIC
    F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith),
    F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith), F(A_Comp_Arith),
    // MVN
    F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp),
    F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp), F(A_Comp_MovOp),
    // TST
    F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp),
    // TEQ
    F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp),
    // CMP
    F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp),
    // CMN
    F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp), F(A_Comp_CmpOp),
    // Mul
    F(A_Comp_MUL_MLA), F(A_Comp_MUL_MLA), F(A_Comp_Mul_Long), F(A_Comp_Mul_Long), F(A_Comp_Mul_Long), F(A_Comp_Mul_Long), NULL, NULL, NULL, NULL, NULL,
    // ARMv5 stuff
    F(A_Comp_CLZ), NULL, NULL, NULL, NULL,
    // STR
    F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB),
    // STRB
    F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB),
    // LDR
    F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB),
    // LDRB
    F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB), F(A_Comp_MemWB),
    // STRH
    F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf),
    // LDRD, STRD never used by anything so they stay interpreted (by anything I mean the 5 games I checked)
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
    // LDRH
    F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf),
    // LDRSB
    F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf),
    // LDRSH
    F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf), F(A_Comp_MemHalf),
    // swap
    NULL, NULL,
    // LDM/STM
    F(A_Comp_LDM_STM), F(A_Comp_LDM_STM),
    // Branch
    F(A_Comp_BranchImm), F(A_Comp_BranchImm), F(A_Comp_BranchImm), F(A_Comp_BranchXchangeReg), F(A_Comp_BranchXchangeReg),
    // system stuff
    NULL, F(A_Comp_MSR), F(A_Comp_MSR), F(A_Comp_MRS), NULL, NULL, NULL,
    F(Nop)
};

const Compiler::CompileFunc T_Comp[ARMInstrInfo::tk_Count] = {
    // Shift imm
    F(T_Comp_ShiftImm), F(T_Comp_ShiftImm), F(T_Comp_ShiftImm),
    // Three operand ADD/SUB
    F(T_Comp_AddSub_), F(T_Comp_AddSub_), F(T_Comp_AddSub_), F(T_Comp_AddSub_),
    // 8 bit imm
    F(T_Comp_ALU_Imm8), F(T_Comp_ALU_Imm8), F(T_Comp_ALU_Imm8), F(T_Comp_ALU_Imm8),
    // general ALU
    F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU),
    F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU),
    F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU), F(T_Comp_ALU),
    F(T_Comp_ALU), F(T_Comp_MUL), F(T_Comp_ALU), F(T_Comp_ALU),
    // hi reg
    F(T_Comp_ALU_HiReg), F(T_Comp_ALU_HiReg), F(T_Comp_ALU_HiReg),
    // pc/sp relative
    F(T_Comp_RelAddr), F(T_Comp_RelAddr), F(T_Comp_AddSP),
    // LDR pcrel
    F(T_Comp_LoadPCRel),
    // LDR/STR reg offset
    F(T_Comp_MemReg), F(T_Comp_MemReg), F(T_Comp_MemReg), F(T_Comp_MemReg),
    // LDR/STR sign extended, half
    F(T_Comp_MemRegHalf), F(T_Comp_MemRegHalf), F(T_Comp_MemRegHalf), F(T_Comp_MemRegHalf),
    // LDR/STR imm offset
    F(T_Comp_MemImm), F(T_Comp_MemImm), F(T_Comp_MemImm), F(T_Comp_MemImm),
    // LDR/STR half imm offset
    F(T_Comp_MemImmHalf), F(T_Comp_MemImmHalf),
    // LDR/STR sp rel
    F(T_Comp_MemSPRel), F(T_Comp_MemSPRel),
    // PUSH/POP
    F(T_Comp_PUSH_POP), F(T_Comp_PUSH_POP), 
    // LDMIA, STMIA
    F(T_Comp_LDMIA_STMIA), F(T_Comp_LDMIA_STMIA), 
    // Branch
    F(T_Comp_BCOND), F(T_Comp_BranchXchangeReg), F(T_Comp_BranchXchangeReg), F(T_Comp_B), F(T_Comp_BL_LONG_1), F(T_Comp_BL_LONG_2), 
    // Unk, SVC
    NULL, NULL,
    F(T_Comp_BL_Merged)
};
#undef F

bool Compiler::CanCompile(bool thumb, u16 kind)
{
    return (thumb ? T_Comp[kind] : A_Comp[kind]) != NULL;
}

void Compiler::Reset()
{
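    // fill with 0xcc (int3) so a stray jump into stale code traps immediately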
    memset(ResetStart, 0xcc, CodeMemSize);
    SetCodePtr(ResetStart);

    NearCode = NearStart;
    FarCode = FarStart;

    LoadStorePatches.clear();
}

bool Compiler::IsJITFault(u8* addr)
{
    return (u64)addr >= (u64)ResetStart && (u64)addr < (u64)ResetStart + CodeMemSize;
}

void Compiler::Comp_SpecialBranchBehaviour(bool taken)
{
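    // a taken idle-loop branch sets the flag so the emulator can skip ahead in time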
    if (taken && CurInstr.BranchFlags & branch_IdleBranch)
        OR(8, MDisp(RCPU, offsetof(ARM, IdleLoop)), Imm8(0x1));

    if ((CurInstr.BranchFlags & branch_FollowCondNotTaken && taken)
        || (CurInstr.BranchFlags & branch_FollowCondTaken && !taken))
    {
        RegCache.PrepareExit();

        ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm32(ConstantCycles));
        JMP((u8*)&ARM_Ret, true);
    }
}

JitBlockEntry Compiler::CompileBlock(ARM* cpu, bool thumb, FetchedInstr instrs[], int instrsCount)
{
    if (NearSize - (GetCodePtr() - NearStart) < 1024 * 32) // guess...
    {
        printf("near reset\n");
        ResetBlockCache();
    }
    if (FarSize - (FarCode - FarStart) < 1024 * 32) // guess...
    {
        printf("far reset\n");
        ResetBlockCache();
    }

    ConstantCycles = 0;
    Thumb = thumb;
    Num = cpu->Num;
    CodeRegion = instrs[0].Addr >> 24;
    CurCPU = cpu;
    // CPSR might have been modified in a previous block
    CPSRDirty = false;

    JitBlockEntry res = (JitBlockEntry)GetWritableCodePtr();

    RegCache = RegisterCache<Compiler, X64Reg>(this, instrs, instrsCount);

    for (int i = 0; i < instrsCount; i++)
    {
        CurInstr = instrs[i];
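        // R15 reads as two instructions ahead of the one executing (pipeline)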
        R15 = CurInstr.Addr + (Thumb ? 4 : 8);
        CodeRegion = R15 >> 24;

        Exit = i == instrsCount - 1 || (CurInstr.BranchFlags & branch_FollowCondNotTaken);

        CompileFunc comp = Thumb
            ? T_Comp[CurInstr.Info.Kind]
            : A_Comp[CurInstr.Info.Kind];

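        // the interpreter fallback and potential block exits need R15 (and CPSR) written back to memory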
        bool isConditional = Thumb ? CurInstr.Info.Kind == ARMInstrInfo::tk_BCOND : CurInstr.Cond() < 0xE;
        if (comp == NULL || (CurInstr.BranchFlags & branch_FollowCondTaken) || (i == instrsCount - 1 && (!CurInstr.Info.Branches() || isConditional)))
        {
            MOV(32, MDisp(RCPU, offsetof(ARM, R[15])), Imm32(R15));
            if (comp == NULL)
            {
                MOV(32, MDisp(RCPU, offsetof(ARM, CodeCycles)), Imm32(CurInstr.CodeCycles));
                MOV(32, MDisp(RCPU, offsetof(ARM, CurInstr)), Imm32(CurInstr.Instr));

                SaveCPSR();
            }
        }

        if (comp != NULL)
            RegCache.Prepare(Thumb, i);
        else
            RegCache.Flush();

        if (Thumb)
        {
            if (comp == NULL)
            {
                MOV(64, R(ABI_PARAM1), R(RCPU));

                ABI_CallFunction(InterpretTHUMB[CurInstr.Info.Kind]);
            }
            else
            {
                (this->*comp)();
            }
        }
        else
        {
            u32 cond = CurInstr.Cond();
            if (CurInstr.Info.Kind == ARMInstrInfo::ak_BLX_IMM)
            {
                if (comp)
                    (this->*comp)();
                else
                {
                    MOV(64, R(ABI_PARAM1), R(RCPU));
                    ABI_CallFunction(ARMInterpreter::A_BLX_IMM);
                }
            }
            else if (cond == 0xF)
            {
                Comp_AddCycles_C();
            }
            else
            {
                IrregularCycles = comp == NULL;

                FixupBranch skipExecute;
                if (cond < 0xE)
                    skipExecute = CheckCondition(cond);

                if (comp == NULL)
                {
                    MOV(64, R(ABI_PARAM1), R(RCPU));

                    ABI_CallFunction(InterpretARM[CurInstr.Info.Kind]);
                }
                else
                {
                    (this->*comp)();
                }

                Comp_SpecialBranchBehaviour(true);

                if (CurInstr.Cond() < 0xE)
                {
                    if (IrregularCycles || (CurInstr.BranchFlags & branch_FollowCondTaken))
                    {
                        FixupBranch skipFailed = J();
                        SetJumpTarget(skipExecute);

                        Comp_AddCycles_C(true);

                        Comp_SpecialBranchBehaviour(false);

                        SetJumpTarget(skipFailed);
                    }
                    else
                    {
                        SetJumpTarget(skipExecute);
                    }
                }
                
            }
        }

        if (comp == NULL)
            LoadCPSR();
    }

    RegCache.Flush();

    ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm32(ConstantCycles));
    JMP((u8*)ARM_Ret, true);

    /*FILE* codeout = fopen("codeout", "a");
    fprintf(codeout, "beginning block argargarg__ %x!!!", instrs[0].Addr);
    fwrite((u8*)res, GetWritableCodePtr() - (u8*)res, 1, codeout);

    fclose(codeout);*/

    return res;
}

void Compiler::Comp_AddCycles_C(bool forceNonConstant)
{
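    // ARM7: code cycles come from the memory timing table; ARM9: the model charges
    // the fetch once per word, so the second halfword of a Thumb pair is free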
    s32 cycles = Num ?
        NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 1 : 3]
        : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles);

    if ((!Thumb && CurInstr.Cond() < 0xE) || forceNonConstant)
        ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles));
    else
        ConstantCycles += cycles;
}

void Compiler::Comp_AddCycles_CI(u32 i)
{
    s32 cycles = (Num ?
        NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]
        : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles)) + i;

    if (!Thumb && CurInstr.Cond() < 0xE)
        ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles));
    else
        ConstantCycles += cycles;
}

void Compiler::Comp_AddCycles_CI(Gen::X64Reg i, int add)
{
    s32 cycles = Num ?
        NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]
        : ((R15 & 0x2) ? 0 : CurInstr.CodeCycles);
    
    if (!Thumb && CurInstr.Cond() < 0xE)
    {
        LEA(32, RSCRATCH, MDisp(i, add + cycles));
        ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(RSCRATCH));
    }
    else
    {
        ConstantCycles += cycles;
        ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(i));
    }
}

void Compiler::Comp_AddCycles_CDI()
{
    if (Num == 0)
        Comp_AddCycles_CD();
    else
    {
        IrregularCycles = true;

        s32 cycles;

        s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2];
        s32 numD = CurInstr.DataCycles;

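        // code and data accesses that both hit main RAM can overlap; the max()
        // formula approximates this, with the constant crediting the overlap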
        if ((CurInstr.DataRegion >> 24) == 0x02) // mainRAM
        {
            if (CodeRegion == 0x02)
                cycles = numC + numD;
            else
            {
                numC++;
                cycles = std::max(numC + numD - 3, std::max(numC, numD));
            }
        }
        else if (CodeRegion == 0x02)
        {
            numD++;
            cycles = std::max(numC + numD - 3, std::max(numC, numD));
        }
        else
        {
            cycles = numC + numD + 1;
        }
        
        if (!Thumb && CurInstr.Cond() < 0xE)
            ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles));
        else
            ConstantCycles += cycles;
    }
}

void Compiler::Comp_AddCycles_CD()
{
    u32 cycles = 0;
    if (Num == 0)
    {
        s32 numC = (R15 & 0x2) ? 0 : CurInstr.CodeCycles;
        s32 numD = CurInstr.DataCycles;

        //if (DataRegion != CodeRegion)
            cycles = std::max(numC + numD - 6, std::max(numC, numD));

        IrregularCycles = cycles != numC;
    }
    else
    {
        s32 numC = NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2];
        s32 numD = CurInstr.DataCycles;

        if ((CurInstr.DataRegion >> 24) == 0x02) // mainRAM
        {
            if (CodeRegion == 0x02)
                cycles += numC + numD;
            else
                cycles += std::max(numC + numD - 3, std::max(numC, numD));
        }
        else if (CodeRegion == 0x02)
        {
            cycles += std::max(numC + numD - 3, std::max(numC, numD));
        }
        else
        {
            cycles += numC + numD;
        }

        IrregularCycles = true;
    }

    if (IrregularCycles && !Thumb && CurInstr.Cond() < 0xE)
        ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles));
    else
        ConstantCycles += cycles;
}
} // namespace ARMJIT