diff options
Diffstat (limited to 'src/common/x64')
| -rw-r--r-- | src/common/x64/abi.cpp | 680 | ||||
| -rw-r--r-- | src/common/x64/abi.h | 78 | ||||
| -rw-r--r-- | src/common/x64/emitter.cpp | 1989 | ||||
| -rw-r--r-- | src/common/x64/emitter.h | 1067 | 
4 files changed, 3814 insertions, 0 deletions
| diff --git a/src/common/x64/abi.cpp b/src/common/x64/abi.cpp new file mode 100644 index 000000000..598e7f335 --- /dev/null +++ b/src/common/x64/abi.cpp @@ -0,0 +1,680 @@ +// Copyright (C) 2003 Dolphin Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official SVN repository and contact information can be found at +// http://code.google.com/p/dolphin-emu/ + +#include "abi.h" +#include "emitter.h" + +using namespace Gen; + +// Shared code between Win64 and Unix64 + +// Sets up a __cdecl function. 
+void XEmitter::ABI_EmitPrologue(int maxCallParams) +{ +#ifdef _M_IX86 +    // Don't really need to do anything +#elif defined(ARCHITECTURE_X64) +#if _WIN32 +    int stacksize = ((maxCallParams + 1) & ~1) * 8 + 8; +    // Set up a stack frame so that we can call functions +    // TODO: use maxCallParams +    SUB(64, R(RSP), Imm8(stacksize)); +#endif +#else +#error Arch not supported +#endif +} + +void XEmitter::ABI_EmitEpilogue(int maxCallParams) +{ +#ifdef _M_IX86 +    RET(); +#elif defined(ARCHITECTURE_X64) +#ifdef _WIN32 +    int stacksize = ((maxCallParams+1)&~1)*8 + 8; +    ADD(64, R(RSP), Imm8(stacksize)); +#endif +    RET(); +#else +#error Arch not supported + + +#endif +} + +#ifdef _M_IX86 // All32 + +// Shared code between Win32 and Unix32 +void XEmitter::ABI_CallFunction(const void *func) { +    ABI_AlignStack(0); +    CALL(func); +    ABI_RestoreStack(0); +} + +void XEmitter::ABI_CallFunctionC16(const void *func, u16 param1) { +    ABI_AlignStack(1 * 2); +    PUSH(16, Imm16(param1)); +    CALL(func); +    ABI_RestoreStack(1 * 2); +} + +void XEmitter::ABI_CallFunctionCC16(const void *func, u32 param1, u16 param2) { +    ABI_AlignStack(1 * 2 + 1 * 4); +    PUSH(16, Imm16(param2)); +    PUSH(32, Imm32(param1)); +    CALL(func); +    ABI_RestoreStack(1 * 2 + 1 * 4); +} + +void XEmitter::ABI_CallFunctionC(const void *func, u32 param1) { +    ABI_AlignStack(1 * 4); +    PUSH(32, Imm32(param1)); +    CALL(func); +    ABI_RestoreStack(1 * 4); +} + +void XEmitter::ABI_CallFunctionCC(const void *func, u32 param1, u32 param2) { +    ABI_AlignStack(2 * 4); +    PUSH(32, Imm32(param2)); +    PUSH(32, Imm32(param1)); +    CALL(func); +    ABI_RestoreStack(2 * 4); +} + +void XEmitter::ABI_CallFunctionCCC(const void *func, u32 param1, u32 param2, u32 param3) { +    ABI_AlignStack(3 * 4); +    PUSH(32, Imm32(param3)); +    PUSH(32, Imm32(param2)); +    PUSH(32, Imm32(param1)); +    CALL(func); +    ABI_RestoreStack(3 * 4); +} + +void XEmitter::ABI_CallFunctionCCP(const 
void *func, u32 param1, u32 param2, void *param3) { +    ABI_AlignStack(3 * 4); +    PUSH(32, ImmPtr(param3)); +    PUSH(32, Imm32(param2)); +    PUSH(32, Imm32(param1)); +    CALL(func); +    ABI_RestoreStack(3 * 4); +} + +void XEmitter::ABI_CallFunctionCCCP(const void *func, u32 param1, u32 param2,u32 param3, void *param4) { +    ABI_AlignStack(4 * 4); +    PUSH(32, ImmPtr(param4)); +    PUSH(32, Imm32(param3)); +    PUSH(32, Imm32(param2)); +    PUSH(32, Imm32(param1)); +    CALL(func); +    ABI_RestoreStack(4 * 4); +} + +void XEmitter::ABI_CallFunctionP(const void *func, void *param1) { +    ABI_AlignStack(1 * 4); +    PUSH(32, ImmPtr(param1)); +    CALL(func); +    ABI_RestoreStack(1 * 4); +} + +void XEmitter::ABI_CallFunctionPA(const void *func, void *param1, const Gen::OpArg &arg2) { +    ABI_AlignStack(2 * 4); +    PUSH(32, arg2); +    PUSH(32, ImmPtr(param1)); +    CALL(func); +    ABI_RestoreStack(2 * 4); +} + +void XEmitter::ABI_CallFunctionPAA(const void *func, void *param1, const Gen::OpArg &arg2, const Gen::OpArg &arg3) { +    ABI_AlignStack(3 * 4); +    PUSH(32, arg3); +    PUSH(32, arg2); +    PUSH(32, ImmPtr(param1)); +    CALL(func); +    ABI_RestoreStack(3 * 4); +} + +void XEmitter::ABI_CallFunctionPPC(const void *func, void *param1, void *param2, u32 param3) { +    ABI_AlignStack(3 * 4); +    PUSH(32, Imm32(param3)); +    PUSH(32, ImmPtr(param2)); +    PUSH(32, ImmPtr(param1)); +    CALL(func); +    ABI_RestoreStack(3 * 4); +} + +// Pass a register as a parameter. +void XEmitter::ABI_CallFunctionR(const void *func, X64Reg reg1) { +    ABI_AlignStack(1 * 4); +    PUSH(32, R(reg1)); +    CALL(func); +    ABI_RestoreStack(1 * 4); +} + +// Pass two registers as parameters. 
+void XEmitter::ABI_CallFunctionRR(const void *func, Gen::X64Reg reg1, Gen::X64Reg reg2) +{ +    ABI_AlignStack(2 * 4); +    PUSH(32, R(reg2)); +    PUSH(32, R(reg1)); +    CALL(func); +    ABI_RestoreStack(2 * 4); +} + +void XEmitter::ABI_CallFunctionAC(const void *func, const Gen::OpArg &arg1, u32 param2) +{ +    ABI_AlignStack(2 * 4); +    PUSH(32, Imm32(param2)); +    PUSH(32, arg1); +    CALL(func); +    ABI_RestoreStack(2 * 4); +} + +void XEmitter::ABI_CallFunctionACC(const void *func, const Gen::OpArg &arg1, u32 param2, u32 param3) +{ +    ABI_AlignStack(3 * 4); +    PUSH(32, Imm32(param3)); +    PUSH(32, Imm32(param2)); +    PUSH(32, arg1); +    CALL(func); +    ABI_RestoreStack(3 * 4); +} + +void XEmitter::ABI_CallFunctionA(const void *func, const Gen::OpArg &arg1) +{ +    ABI_AlignStack(1 * 4); +    PUSH(32, arg1); +    CALL(func); +    ABI_RestoreStack(1 * 4); +} + +void XEmitter::ABI_CallFunctionAA(const void *func, const Gen::OpArg &arg1, const Gen::OpArg &arg2) +{ +    ABI_AlignStack(2 * 4); +    PUSH(32, arg2); +    PUSH(32, arg1); +    CALL(func); +    ABI_RestoreStack(2 * 4); +} + +void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() { +    // Note: 4 * 4 = 16 bytes, so alignment is preserved. +    PUSH(EBP); +    PUSH(EBX); +    PUSH(ESI); +    PUSH(EDI); +} + +void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() { +    POP(EDI); +    POP(ESI); +    POP(EBX); +    POP(EBP); +} + +unsigned int XEmitter::ABI_GetAlignedFrameSize(unsigned int frameSize) { +    frameSize += 4; // reserve space for return address +    unsigned int alignedSize = +#ifdef __GNUC__ +        (frameSize + 15) & -16; +#else +        (frameSize + 3) & -4; +#endif +    return alignedSize; +} + + +void XEmitter::ABI_AlignStack(unsigned int frameSize) { +// Mac OS X requires the stack to be 16-byte aligned before every call. 
+// Linux requires the stack to be 16-byte aligned before calls that put SSE +// vectors on the stack, but since we do not keep track of which calls do that, +// it is effectively every call as well. +// Windows binaries compiled with MSVC do not have such a restriction*, but I +// expect that GCC on Windows acts the same as GCC on Linux in this respect. +// It would be nice if someone could verify this. +// *However, the MSVC optimizing compiler assumes a 4-byte-aligned stack at times. +    unsigned int fillSize = +        ABI_GetAlignedFrameSize(frameSize) - (frameSize + 4); +    if (fillSize != 0) { +        SUB(32, R(ESP), Imm8(fillSize)); +    } +} + +void XEmitter::ABI_RestoreStack(unsigned int frameSize) { +    unsigned int alignedSize = ABI_GetAlignedFrameSize(frameSize); +    alignedSize -= 4; // return address is POPped at end of call +    if (alignedSize != 0) { +        ADD(32, R(ESP), Imm8(alignedSize)); +    } +} + +#else //64bit + +// Common functions +void XEmitter::ABI_CallFunction(const void *func) { +    u64 distance = u64(func) - (u64(code) + 5); +    if (distance >= 0x0000000080000000ULL +     && distance <  0xFFFFFFFF80000000ULL) { +        // Far call +        MOV(64, R(RAX), ImmPtr(func)); +        CALLptr(R(RAX)); +    } else { +        CALL(func); +    } +} + +void XEmitter::ABI_CallFunctionC16(const void *func, u16 param1) { +    MOV(32, R(ABI_PARAM1), Imm32((u32)param1)); +    u64 distance = u64(func) - (u64(code) + 5); +    if (distance >= 0x0000000080000000ULL +     && distance <  0xFFFFFFFF80000000ULL) { +        // Far call +        MOV(64, R(RAX), ImmPtr(func)); +        CALLptr(R(RAX)); +    } else { +        CALL(func); +    } +} + +void XEmitter::ABI_CallFunctionCC16(const void *func, u32 param1, u16 param2) { +    MOV(32, R(ABI_PARAM1), Imm32(param1)); +    MOV(32, R(ABI_PARAM2), Imm32((u32)param2)); +    u64 distance = u64(func) - (u64(code) + 5); +    if (distance >= 0x0000000080000000ULL +        && distance <  
0xFFFFFFFF80000000ULL) { +            // Far call +            MOV(64, R(RAX), ImmPtr(func)); +            CALLptr(R(RAX)); +    } else { +        CALL(func); +    } +} + +void XEmitter::ABI_CallFunctionC(const void *func, u32 param1) { +    MOV(32, R(ABI_PARAM1), Imm32(param1)); +    u64 distance = u64(func) - (u64(code) + 5); +    if (distance >= 0x0000000080000000ULL +     && distance <  0xFFFFFFFF80000000ULL) { +        // Far call +        MOV(64, R(RAX), ImmPtr(func)); +        CALLptr(R(RAX)); +    } else { +        CALL(func); +    } +} + +void XEmitter::ABI_CallFunctionCC(const void *func, u32 param1, u32 param2) { +    MOV(32, R(ABI_PARAM1), Imm32(param1)); +    MOV(32, R(ABI_PARAM2), Imm32(param2)); +    u64 distance = u64(func) - (u64(code) + 5); +    if (distance >= 0x0000000080000000ULL +     && distance <  0xFFFFFFFF80000000ULL) { +        // Far call +        MOV(64, R(RAX), ImmPtr(func)); +        CALLptr(R(RAX)); +    } else { +        CALL(func); +    } +} + +void XEmitter::ABI_CallFunctionCCC(const void *func, u32 param1, u32 param2, u32 param3) { +    MOV(32, R(ABI_PARAM1), Imm32(param1)); +    MOV(32, R(ABI_PARAM2), Imm32(param2)); +    MOV(32, R(ABI_PARAM3), Imm32(param3)); +    u64 distance = u64(func) - (u64(code) + 5); +    if (distance >= 0x0000000080000000ULL +     && distance <  0xFFFFFFFF80000000ULL) { +        // Far call +        MOV(64, R(RAX), ImmPtr(func)); +        CALLptr(R(RAX)); +    } else { +        CALL(func); +    } +} + +void XEmitter::ABI_CallFunctionCCP(const void *func, u32 param1, u32 param2, void *param3) { +    MOV(32, R(ABI_PARAM1), Imm32(param1)); +    MOV(32, R(ABI_PARAM2), Imm32(param2)); +    MOV(64, R(ABI_PARAM3), ImmPtr(param3)); +    u64 distance = u64(func) - (u64(code) + 5); +    if (distance >= 0x0000000080000000ULL +     && distance <  0xFFFFFFFF80000000ULL) { +        // Far call +        MOV(64, R(RAX), ImmPtr(func)); +        CALLptr(R(RAX)); +    } else { +        CALL(func); +    } +} + +void 
XEmitter::ABI_CallFunctionCCCP(const void *func, u32 param1, u32 param2, u32 param3, void *param4) { +    MOV(32, R(ABI_PARAM1), Imm32(param1)); +    MOV(32, R(ABI_PARAM2), Imm32(param2)); +    MOV(32, R(ABI_PARAM3), Imm32(param3)); +    MOV(64, R(ABI_PARAM4), ImmPtr(param4)); +    u64 distance = u64(func) - (u64(code) + 5); +    if (distance >= 0x0000000080000000ULL +     && distance <  0xFFFFFFFF80000000ULL) { +        // Far call +        MOV(64, R(RAX), ImmPtr(func)); +        CALLptr(R(RAX)); +    } else { +        CALL(func); +    } +} + +void XEmitter::ABI_CallFunctionP(const void *func, void *param1) { +    MOV(64, R(ABI_PARAM1), ImmPtr(param1)); +    u64 distance = u64(func) - (u64(code) + 5); +    if (distance >= 0x0000000080000000ULL +     && distance <  0xFFFFFFFF80000000ULL) { +        // Far call +        MOV(64, R(RAX), ImmPtr(func)); +        CALLptr(R(RAX)); +    } else { +        CALL(func); +    } +} + +void XEmitter::ABI_CallFunctionPA(const void *func, void *param1, const Gen::OpArg &arg2) { +    MOV(64, R(ABI_PARAM1), ImmPtr(param1)); +    if (!arg2.IsSimpleReg(ABI_PARAM2)) +        MOV(32, R(ABI_PARAM2), arg2); +    u64 distance = u64(func) - (u64(code) + 5); +    if (distance >= 0x0000000080000000ULL +     && distance <  0xFFFFFFFF80000000ULL) { +        // Far call +        MOV(64, R(RAX), ImmPtr(func)); +        CALLptr(R(RAX)); +    } else { +        CALL(func); +    } +} + +void XEmitter::ABI_CallFunctionPAA(const void *func, void *param1, const Gen::OpArg &arg2, const Gen::OpArg &arg3) { +    MOV(64, R(ABI_PARAM1), ImmPtr(param1)); +    if (!arg2.IsSimpleReg(ABI_PARAM2)) +        MOV(32, R(ABI_PARAM2), arg2); +    if (!arg3.IsSimpleReg(ABI_PARAM3)) +        MOV(32, R(ABI_PARAM3), arg3); +    u64 distance = u64(func) - (u64(code) + 5); +    if (distance >= 0x0000000080000000ULL +     && distance <  0xFFFFFFFF80000000ULL) { +        // Far call +        MOV(64, R(RAX), ImmPtr(func)); +        CALLptr(R(RAX)); +    } else { +        
CALL(func); +    } +} + +void XEmitter::ABI_CallFunctionPPC(const void *func, void *param1, void *param2, u32 param3) { +    MOV(64, R(ABI_PARAM1), ImmPtr(param1)); +    MOV(64, R(ABI_PARAM2), ImmPtr(param2)); +    MOV(32, R(ABI_PARAM3), Imm32(param3)); +    u64 distance = u64(func) - (u64(code) + 5); +    if (distance >= 0x0000000080000000ULL +     && distance <  0xFFFFFFFF80000000ULL) { +        // Far call +        MOV(64, R(RAX), ImmPtr(func)); +        CALLptr(R(RAX)); +    } else { +        CALL(func); +    } +} + +// Pass a register as a parameter. +void XEmitter::ABI_CallFunctionR(const void *func, X64Reg reg1) { +    if (reg1 != ABI_PARAM1) +        MOV(32, R(ABI_PARAM1), R(reg1)); +    u64 distance = u64(func) - (u64(code) + 5); +    if (distance >= 0x0000000080000000ULL +     && distance <  0xFFFFFFFF80000000ULL) { +        // Far call +        MOV(64, R(RAX), ImmPtr(func)); +        CALLptr(R(RAX)); +    } else { +        CALL(func); +    } +} + +// Pass two registers as parameters. 
+void XEmitter::ABI_CallFunctionRR(const void *func, X64Reg reg1, X64Reg reg2) { +    if (reg2 != ABI_PARAM1) { +        if (reg1 != ABI_PARAM1) +            MOV(64, R(ABI_PARAM1), R(reg1)); +        if (reg2 != ABI_PARAM2) +            MOV(64, R(ABI_PARAM2), R(reg2)); +    } else { +        if (reg2 != ABI_PARAM2) +            MOV(64, R(ABI_PARAM2), R(reg2)); +        if (reg1 != ABI_PARAM1) +            MOV(64, R(ABI_PARAM1), R(reg1)); +    } +    u64 distance = u64(func) - (u64(code) + 5); +    if (distance >= 0x0000000080000000ULL +     && distance <  0xFFFFFFFF80000000ULL) { +        // Far call +        MOV(64, R(RAX), ImmPtr(func)); +        CALLptr(R(RAX)); +    } else { +        CALL(func); +    } +} + +void XEmitter::ABI_CallFunctionAC(const void *func, const Gen::OpArg &arg1, u32 param2) +{ +    if (!arg1.IsSimpleReg(ABI_PARAM1)) +        MOV(32, R(ABI_PARAM1), arg1); +    MOV(32, R(ABI_PARAM2), Imm32(param2)); +    u64 distance = u64(func) - (u64(code) + 5); +    if (distance >= 0x0000000080000000ULL +     && distance <  0xFFFFFFFF80000000ULL) { +        // Far call +        MOV(64, R(RAX), ImmPtr(func)); +        CALLptr(R(RAX)); +    } else { +        CALL(func); +    } +} + +void XEmitter::ABI_CallFunctionACC(const void *func, const Gen::OpArg &arg1, u32 param2, u32 param3) +{ +    if (!arg1.IsSimpleReg(ABI_PARAM1)) +        MOV(32, R(ABI_PARAM1), arg1); +    MOV(32, R(ABI_PARAM2), Imm32(param2)); +    MOV(64, R(ABI_PARAM3), Imm64(param3)); +    u64 distance = u64(func) - (u64(code) + 5); +    if (distance >= 0x0000000080000000ULL +     && distance <  0xFFFFFFFF80000000ULL) { +        // Far call +        MOV(64, R(RAX), ImmPtr(func)); +        CALLptr(R(RAX)); +    } else { +        CALL(func); +    } +} + +void XEmitter::ABI_CallFunctionA(const void *func, const Gen::OpArg &arg1) +{ +    if (!arg1.IsSimpleReg(ABI_PARAM1)) +        MOV(32, R(ABI_PARAM1), arg1); +    u64 distance = u64(func) - (u64(code) + 5); +    if (distance >= 
0x0000000080000000ULL +     && distance <  0xFFFFFFFF80000000ULL) { +        // Far call +        MOV(64, R(RAX), ImmPtr(func)); +        CALLptr(R(RAX)); +    } else { +        CALL(func); +    } +} + +void XEmitter::ABI_CallFunctionAA(const void *func, const Gen::OpArg &arg1, const Gen::OpArg &arg2) +{ +    if (!arg1.IsSimpleReg(ABI_PARAM1)) +        MOV(32, R(ABI_PARAM1), arg1); +    if (!arg2.IsSimpleReg(ABI_PARAM2)) +        MOV(32, R(ABI_PARAM2), arg2); +    u64 distance = u64(func) - (u64(code) + 5); +    if (distance >= 0x0000000080000000ULL +     && distance <  0xFFFFFFFF80000000ULL) { +        // Far call +        MOV(64, R(RAX), ImmPtr(func)); +        CALLptr(R(RAX)); +    } else { +        CALL(func); +    } +} + +unsigned int XEmitter::ABI_GetAlignedFrameSize(unsigned int frameSize) { +    return frameSize; +} + +#ifdef _WIN32 + +// The Windows x64 ABI requires XMM6 - XMM15 to be callee saved.  10 regs. +// But, not saving XMM4 and XMM5 breaks things in VS 2010, even though they are volatile regs. +// Let's just save all 16. +const int XMM_STACK_SPACE = 16 * 16; + +// Win64 Specific Code +void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() { +    //we only want to do this once +    PUSH(RBX); +    PUSH(RSI); +    PUSH(RDI); +    PUSH(RBP); +    PUSH(R12); +    PUSH(R13); +    PUSH(R14); +    PUSH(R15); +    ABI_AlignStack(0); + +    // Do this after aligning, because before it's offset by 8. 
+    SUB(64, R(RSP), Imm32(XMM_STACK_SPACE)); +    for (int i = 0; i < 16; ++i) +        MOVAPS(MDisp(RSP, i * 16), (X64Reg)(XMM0 + i)); +} + +void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() { +    for (int i = 0; i < 16; ++i) +        MOVAPS((X64Reg)(XMM0 + i), MDisp(RSP, i * 16)); +    ADD(64, R(RSP), Imm32(XMM_STACK_SPACE)); + +    ABI_RestoreStack(0); +    POP(R15); +    POP(R14); +    POP(R13); +    POP(R12); +    POP(RBP); +    POP(RDI); +    POP(RSI); +    POP(RBX); +} + +// Win64 Specific Code +void XEmitter::ABI_PushAllCallerSavedRegsAndAdjustStack() { +    PUSH(RCX); +    PUSH(RDX); +    PUSH(RSI); +    PUSH(RDI); +    PUSH(R8); +    PUSH(R9); +    PUSH(R10); +    PUSH(R11); +    // TODO: Callers preserve XMM4-5 (XMM0-3 are args.) +    ABI_AlignStack(0); +} + +void XEmitter::ABI_PopAllCallerSavedRegsAndAdjustStack() { +    ABI_RestoreStack(0); +    POP(R11); +    POP(R10); +    POP(R9); +    POP(R8); +    POP(RDI); +    POP(RSI); +    POP(RDX); +    POP(RCX); +} + +void XEmitter::ABI_AlignStack(unsigned int /*frameSize*/) { +    SUB(64, R(RSP), Imm8(0x28)); +} + +void XEmitter::ABI_RestoreStack(unsigned int /*frameSize*/) { +    ADD(64, R(RSP), Imm8(0x28)); +} + +#else +// Unix64 Specific Code +void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() { +    PUSH(RBX); +    PUSH(RBP); +    PUSH(R12); +    PUSH(R13); +    PUSH(R14); +    PUSH(R15); +    PUSH(R15); //just to align stack. duped push/pop doesn't hurt. +    // TODO: XMM? 
+} + +void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() { +    POP(R15); +    POP(R15); +    POP(R14); +    POP(R13); +    POP(R12); +    POP(RBP); +    POP(RBX); +} + +void XEmitter::ABI_PushAllCallerSavedRegsAndAdjustStack() { +    PUSH(RCX); +    PUSH(RDX); +    PUSH(RSI); +    PUSH(RDI); +    PUSH(R8); +    PUSH(R9); +    PUSH(R10); +    PUSH(R11); +    PUSH(R11); +} + +void XEmitter::ABI_PopAllCallerSavedRegsAndAdjustStack() { +    POP(R11); +    POP(R11); +    POP(R10); +    POP(R9); +    POP(R8); +    POP(RDI); +    POP(RSI); +    POP(RDX); +    POP(RCX); +} + +void XEmitter::ABI_AlignStack(unsigned int /*frameSize*/) { +    SUB(64, R(RSP), Imm8(0x08)); +} + +void XEmitter::ABI_RestoreStack(unsigned int /*frameSize*/) { +    ADD(64, R(RSP), Imm8(0x08)); +} + +#endif // WIN32 + +#endif // 32bit diff --git a/src/common/x64/abi.h b/src/common/x64/abi.h new file mode 100644 index 000000000..0ee189d45 --- /dev/null +++ b/src/common/x64/abi.h @@ -0,0 +1,78 @@ +// Copyright (C) 2003 Dolphin Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official SVN repository and contact information can be found at +// http://code.google.com/p/dolphin-emu/ + +#pragma once + +#include "common/common_types.h" + +// x86/x64 ABI:s, and helpers to help follow them when JIT-ing code. +// All convensions return values in EAX (+ possibly EDX). 
+ +// Linux 32-bit, Windows 32-bit (cdecl, System V): +// * Caller pushes right to left +// * Caller fixes stack after call +// * Function subtracts from stack for local storage only. +// Scratch:      EAX ECX EDX +// Callee-save:  EBX ESI EDI EBP +// Parameters:   - + +// Windows 64-bit +// * 4-reg "fastcall" variant, very new-skool stack handling +// * Callee moves stack pointer, to make room for shadow regs for the biggest function _it itself calls_ +// * Parameters passed in RCX, RDX, ... further parameters are MOVed into the allocated stack space. +// Scratch:      RAX RCX RDX R8 R9 R10 R11 +// Callee-save:  RBX RSI RDI RBP R12 R13 R14 R15 +// Parameters:   RCX RDX R8 R9, further MOV-ed + +// Linux 64-bit +// * 6-reg "fastcall" variant, old skool stack handling (parameters are pushed) +// Scratch:      RAX RCX RDX RSI RDI R8 R9 R10 R11 +// Callee-save:  RBX RBP R12 R13 R14 R15 +// Parameters:   RDI RSI RDX RCX R8 R9 + +#ifdef _M_IX86 // 32 bit calling convention, shared by all + +// 32-bit doesn't pass parameters in regs, but these are convenient to have anyway when we have to +// choose regs to put stuff in. +#define ABI_PARAM1 RCX +#define ABI_PARAM2 RDX + +// There are no further ABI_PARAM* here, since args are pushed. +// 32-bit bog standard cdecl, shared between linux and windows +// MacOSX 32-bit is same as System V with a few exceptions that we probably don't care much about. 
+ +#elif ARCHITECTURE_X64 // 64 bit calling convention + +#ifdef _WIN32 // 64-bit Windows - the really exotic calling convention + +#define ABI_PARAM1 RCX +#define ABI_PARAM2 RDX +#define ABI_PARAM3 R8 +#define ABI_PARAM4 R9 + +#else  //64-bit Unix (hopefully MacOSX too) + +#define ABI_PARAM1 RDI +#define ABI_PARAM2 RSI +#define ABI_PARAM3 RDX +#define ABI_PARAM4 RCX +#define ABI_PARAM5 R8 +#define ABI_PARAM6 R9 + +#endif // WIN32 + +#endif // X86 diff --git a/src/common/x64/emitter.cpp b/src/common/x64/emitter.cpp new file mode 100644 index 000000000..4e1c43d6c --- /dev/null +++ b/src/common/x64/emitter.cpp @@ -0,0 +1,1989 @@ +// Copyright (C) 2003 Dolphin Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. 
+// If not, see http://www.gnu.org/licenses/ + +// Official SVN repository and contact information can be found at +// http://code.google.com/p/dolphin-emu/ + +#include <cstring> + +#include "common/assert.h" +#include "common/cpu_detect.h" +#include "common/logging/log.h" +#include "common/memory_util.h" + +#include "abi.h" +#include "emitter.h" + +#define PRIx64 "llx" + +// Minimize the diff against Dolphin +#define DYNA_REC JIT + +namespace Gen +{ + +struct NormalOpDef +{ +    u8 toRm8, toRm32, fromRm8, fromRm32, imm8, imm32, simm8, eaximm8, eaximm32, ext; +}; + +// 0xCC is code for invalid combination of immediates +static const NormalOpDef normalops[11] = +{ +    {0x00, 0x01, 0x02, 0x03, 0x80, 0x81, 0x83, 0x04, 0x05, 0}, //ADD +    {0x10, 0x11, 0x12, 0x13, 0x80, 0x81, 0x83, 0x14, 0x15, 2}, //ADC + +    {0x28, 0x29, 0x2A, 0x2B, 0x80, 0x81, 0x83, 0x2C, 0x2D, 5}, //SUB +    {0x18, 0x19, 0x1A, 0x1B, 0x80, 0x81, 0x83, 0x1C, 0x1D, 3}, //SBB + +    {0x20, 0x21, 0x22, 0x23, 0x80, 0x81, 0x83, 0x24, 0x25, 4}, //AND +    {0x08, 0x09, 0x0A, 0x0B, 0x80, 0x81, 0x83, 0x0C, 0x0D, 1}, //OR + +    {0x30, 0x31, 0x32, 0x33, 0x80, 0x81, 0x83, 0x34, 0x35, 6}, //XOR +    {0x88, 0x89, 0x8A, 0x8B, 0xC6, 0xC7, 0xCC, 0xCC, 0xCC, 0}, //MOV + +    {0x84, 0x85, 0x84, 0x85, 0xF6, 0xF7, 0xCC, 0xA8, 0xA9, 0}, //TEST (to == from) +    {0x38, 0x39, 0x3A, 0x3B, 0x80, 0x81, 0x83, 0x3C, 0x3D, 7}, //CMP + +    {0x86, 0x87, 0x86, 0x87, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 7}, //XCHG +}; + +enum NormalSSEOps +{ +    sseCMP         = 0xC2, +    sseADD         = 0x58, //ADD +    sseSUB         = 0x5C, //SUB +    sseAND         = 0x54, //AND +    sseANDN        = 0x55, //ANDN +    sseOR          = 0x56, +    sseXOR         = 0x57, +    sseMUL         = 0x59, //MUL +    sseDIV         = 0x5E, //DIV +    sseMIN         = 0x5D, //MIN +    sseMAX         = 0x5F, //MAX +    sseCOMIS       = 0x2F, //COMIS +    sseUCOMIS      = 0x2E, //UCOMIS +    sseSQRT        = 0x51, //SQRT +    sseRSQRT       = 0x52, //RSQRT (NO 
DOUBLE PRECISION!!!) +    sseRCP         = 0x53, //RCP +    sseMOVAPfromRM = 0x28, //MOVAP from RM +    sseMOVAPtoRM   = 0x29, //MOVAP to RM +    sseMOVUPfromRM = 0x10, //MOVUP from RM +    sseMOVUPtoRM   = 0x11, //MOVUP to RM +    sseMOVLPfromRM= 0x12, +    sseMOVLPtoRM  = 0x13, +    sseMOVHPfromRM= 0x16, +    sseMOVHPtoRM  = 0x17, +    sseMOVHLPS     = 0x12, +    sseMOVLHPS     = 0x16, +    sseMOVDQfromRM = 0x6F, +    sseMOVDQtoRM   = 0x7F, +    sseMASKMOVDQU  = 0xF7, +    sseLDDQU       = 0xF0, +    sseSHUF        = 0xC6, +    sseMOVNTDQ     = 0xE7, +    sseMOVNTP      = 0x2B, +    sseHADD        = 0x7C, +}; + + +void XEmitter::SetCodePtr(u8 *ptr) +{ +    code = ptr; +} + +const u8 *XEmitter::GetCodePtr() const +{ +    return code; +} + +u8 *XEmitter::GetWritableCodePtr() +{ +    return code; +} + +void XEmitter::ReserveCodeSpace(int bytes) +{ +    for (int i = 0; i < bytes; i++) +        *code++ = 0xCC; +} + +const u8 *XEmitter::AlignCode4() +{ +    int c = int((u64)code & 3); +    if (c) +        ReserveCodeSpace(4-c); +    return code; +} + +const u8 *XEmitter::AlignCode16() +{ +    int c = int((u64)code & 15); +    if (c) +        ReserveCodeSpace(16-c); +    return code; +} + +const u8 *XEmitter::AlignCodePage() +{ +    int c = int((u64)code & 4095); +    if (c) +        ReserveCodeSpace(4096-c); +    return code; +} + +// This operation modifies flags; check to see the flags are locked. +// If the flags are locked, we should immediately and loudly fail before +// causing a subtle JIT bug. 
+void XEmitter::CheckFlags() +{ +    ASSERT_MSG(!flags_locked, "Attempt to modify flags while flags locked!"); +} + +void XEmitter::WriteModRM(int mod, int reg, int rm) +{ +    Write8((u8)((mod << 6) | ((reg & 7) << 3) | (rm & 7))); +} + +void XEmitter::WriteSIB(int scale, int index, int base) +{ +    Write8((u8)((scale << 6) | ((index & 7) << 3) | (base & 7))); +} + +void OpArg::WriteRex(XEmitter *emit, int opBits, int bits, int customOp) const +{ +    if (customOp == -1)       customOp = operandReg; +#ifdef ARCHITECTURE_X64 +    u8 op = 0x40; +    // REX.W (whether operation is a 64-bit operation) +    if (opBits == 64)         op |= 8; +    // REX.R (whether ModR/M reg field refers to R8-R15. +    if (customOp & 8)         op |= 4; +    // REX.X (whether ModR/M SIB index field refers to R8-R15) +    if (indexReg & 8)         op |= 2; +    // REX.B (whether ModR/M rm or SIB base or opcode reg field refers to R8-R15) +    if (offsetOrBaseReg & 8)  op |= 1; +    // Write REX if wr have REX bits to write, or if the operation accesses +    // SIL, DIL, BPL, or SPL. +    if (op != 0x40 || +        (scale == SCALE_NONE && bits == 8 && (offsetOrBaseReg & 0x10c) == 4) || +        (opBits == 8 && (customOp & 0x10c) == 4)) +    { +        emit->Write8(op); +        // Check the operation doesn't access AH, BH, CH, or DH. 
+        DEBUG_ASSERT((offsetOrBaseReg & 0x100) == 0); +        DEBUG_ASSERT((customOp & 0x100) == 0); +    } +#else +    DEBUG_ASSERT(opBits != 64); +    DEBUG_ASSERT((customOp & 8) == 0 || customOp == -1); +    DEBUG_ASSERT((indexReg & 8) == 0); +    DEBUG_ASSERT((offsetOrBaseReg & 8) == 0); +    DEBUG_ASSERT(opBits != 8 || (customOp & 0x10c) != 4 || customOp == -1); +    DEBUG_ASSERT(scale == SCALE_ATREG || bits != 8 || (offsetOrBaseReg & 0x10c) != 4); +#endif +} + +void OpArg::WriteVex(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm, int W) const +{ +    int R = !(regOp1 & 8); +    int X = !(indexReg & 8); +    int B = !(offsetOrBaseReg & 8); + +    int vvvv = (regOp2 == X64Reg::INVALID_REG) ? 0xf : (regOp2 ^ 0xf); + +    // do we need any VEX fields that only appear in the three-byte form? +    if (X == 1 && B == 1 && W == 0 && mmmmm == 1) +    { +        u8 RvvvvLpp = (R << 7) | (vvvv << 3) | (L << 1) | pp; +        emit->Write8(0xC5); +        emit->Write8(RvvvvLpp); +    } +    else +    { +        u8 RXBmmmmm = (R << 7) | (X << 6) | (B << 5) | mmmmm; +        u8 WvvvvLpp = (W << 7) | (vvvv << 3) | (L << 1) | pp; +        emit->Write8(0xC4); +        emit->Write8(RXBmmmmm); +        emit->Write8(WvvvvLpp); +    } +} + +void OpArg::WriteRest(XEmitter *emit, int extraBytes, X64Reg _operandReg, +    bool warn_64bit_offset) const +{ +    if (_operandReg == INVALID_REG) +        _operandReg = (X64Reg)this->operandReg; +    int mod = 0; +    int ireg = indexReg; +    bool SIB = false; +    int _offsetOrBaseReg = this->offsetOrBaseReg; + +    if (scale == SCALE_RIP) //Also, on 32-bit, just an immediate address +    { +        // Oh, RIP addressing. 
+        _offsetOrBaseReg = 5; +        emit->WriteModRM(0, _operandReg, _offsetOrBaseReg); +        //TODO : add some checks +#ifdef ARCHITECTURE_X64 +        u64 ripAddr = (u64)emit->GetCodePtr() + 4 + extraBytes; +        s64 distance = (s64)offset - (s64)ripAddr; +        ASSERT_MSG( +                     (distance < 0x80000000LL && +                      distance >=  -0x80000000LL) || +                     !warn_64bit_offset, +                     "WriteRest: op out of range (0x%" PRIx64 " uses 0x%" PRIx64 ")", +                     ripAddr, offset); +        s32 offs = (s32)distance; +        emit->Write32((u32)offs); +#else +        emit->Write32((u32)offset); +#endif +        return; +    } + +    if (scale == 0) +    { +        // Oh, no memory, Just a reg. +        mod = 3; //11 +    } +    else if (scale >= 1) +    { +        //Ah good, no scaling. +        if (scale == SCALE_ATREG && !((_offsetOrBaseReg & 7) == 4 || (_offsetOrBaseReg & 7) == 5)) +        { +            //Okay, we're good. No SIB necessary. 
+            int ioff = (int)offset; +            if (ioff == 0) +            { +                mod = 0; +            } +            else if (ioff<-128 || ioff>127) +            { +                mod = 2; //32-bit displacement +            } +            else +            { +                mod = 1; //8-bit displacement +            } +        } +        else if (scale >= SCALE_NOBASE_2 && scale <= SCALE_NOBASE_8) +        { +            SIB = true; +            mod = 0; +            _offsetOrBaseReg = 5; +        } +        else //if (scale != SCALE_ATREG) +        { +            if ((_offsetOrBaseReg & 7) == 4) //this would occupy the SIB encoding :( +            { +                //So we have to fake it with SIB encoding :( +                SIB = true; +            } + +            if (scale >= SCALE_1 && scale < SCALE_ATREG) +            { +                SIB = true; +            } + +            if (scale == SCALE_ATREG && ((_offsetOrBaseReg & 7) == 4)) +            { +                SIB = true; +                ireg = _offsetOrBaseReg; +            } + +            //Okay, we're fine. Just disp encoding. +            //We need displacement. Which size? +            int ioff = (int)(s64)offset; +            if (ioff < -128 || ioff > 127) +            { +                mod = 2; //32-bit displacement +            } +            else +            { +                mod = 1; //8-bit displacement +            } +        } +    } + +    // Okay. Time to do the actual writing +    // ModRM byte: +    int oreg = _offsetOrBaseReg; +    if (SIB) +        oreg = 4; + +    // TODO(ector): WTF is this if about? 
I don't remember writing it :-) +    //if (RIP) +    //    oreg = 5; + +    emit->WriteModRM(mod, _operandReg&7, oreg&7); + +    if (SIB) +    { +        //SIB byte +        int ss; +        switch (scale) +        { +        case SCALE_NONE: _offsetOrBaseReg = 4; ss = 0; break; //RSP +        case SCALE_1: ss = 0; break; +        case SCALE_2: ss = 1; break; +        case SCALE_4: ss = 2; break; +        case SCALE_8: ss = 3; break; +        case SCALE_NOBASE_2: ss = 1; break; +        case SCALE_NOBASE_4: ss = 2; break; +        case SCALE_NOBASE_8: ss = 3; break; +        case SCALE_ATREG: ss = 0; break; +        default: ASSERT_MSG(0, "Invalid scale for SIB byte"); ss = 0; break; +        } +        emit->Write8((u8)((ss << 6) | ((ireg&7)<<3) | (_offsetOrBaseReg&7))); +    } + +    if (mod == 1) //8-bit disp +    { +        emit->Write8((u8)(s8)(s32)offset); +    } +    else if (mod == 2 || (scale >= SCALE_NOBASE_2 && scale <= SCALE_NOBASE_8)) //32-bit disp +    { +        emit->Write32((u32)offset); +    } +} + +// W = operand extended width (1 if 64-bit) +// R = register# upper bit +// X = scale amnt upper bit +// B = base register# upper bit +void XEmitter::Rex(int w, int r, int x, int b) +{ +    w = w ? 1 : 0; +    r = r ? 1 : 0; +    x = x ? 1 : 0; +    b = b ? 
1 : 0; +    u8 rx = (u8)(0x40 | (w << 3) | (r << 2) | (x << 1) | (b)); +    if (rx != 0x40) +        Write8(rx); +} + +void XEmitter::JMP(const u8 *addr, bool force5Bytes) +{ +    u64 fn = (u64)addr; +    if (!force5Bytes) +    { +        s64 distance = (s64)(fn - ((u64)code + 2)); +        ASSERT_MSG(distance >= -0x80 && distance < 0x80, +                 "Jump target too far away, needs force5Bytes = true"); +        //8 bits will do +        Write8(0xEB); +        Write8((u8)(s8)distance); +    } +    else +    { +        s64 distance = (s64)(fn - ((u64)code + 5)); + +        ASSERT_MSG( +                     distance >= -0x80000000LL && distance < 0x80000000LL, +                     "Jump target too far away, needs indirect register"); +        Write8(0xE9); +        Write32((u32)(s32)distance); +    } +} + +void XEmitter::JMPptr(const OpArg &arg2) +{ +    OpArg arg = arg2; +    if (arg.IsImm()) ASSERT_MSG(0, "JMPptr - Imm argument"); +    arg.operandReg = 4; +    arg.WriteRex(this, 0, 0); +    Write8(0xFF); +    arg.WriteRest(this); +} + +//Can be used to trap other processors, before overwriting their code +// not used in dolphin +void XEmitter::JMPself() +{ +    Write8(0xEB); +    Write8(0xFE); +} + +void XEmitter::CALLptr(OpArg arg) +{ +    if (arg.IsImm()) ASSERT_MSG(0, "CALLptr - Imm argument"); +    arg.operandReg = 2; +    arg.WriteRex(this, 0, 0); +    Write8(0xFF); +    arg.WriteRest(this); +} + +void XEmitter::CALL(const void *fnptr) +{ +    u64 distance = u64(fnptr) - (u64(code) + 5); +    ASSERT_MSG( +                 distance < 0x0000000080000000ULL || +                 distance >=  0xFFFFFFFF80000000ULL, +                 "CALL out of range (%p calls %p)", code, fnptr); +    Write8(0xE8); +    Write32(u32(distance)); +} + +FixupBranch XEmitter::J(bool force5bytes) +{ +    FixupBranch branch; +    branch.type = force5bytes ? 1 : 0; +    branch.ptr = code + (force5bytes ? 
5 : 2); +    if (!force5bytes) +    { +        //8 bits will do +        Write8(0xEB); +        Write8(0); +    } +    else +    { +        Write8(0xE9); +        Write32(0); +    } +    return branch; +} + +FixupBranch XEmitter::J_CC(CCFlags conditionCode, bool force5bytes) +{ +    FixupBranch branch; +    branch.type = force5bytes ? 1 : 0; +    branch.ptr = code + (force5bytes ? 6 : 2); +    if (!force5bytes) +    { +        //8 bits will do +        Write8(0x70 + conditionCode); +        Write8(0); +    } +    else +    { +        Write8(0x0F); +        Write8(0x80 + conditionCode); +        Write32(0); +    } +    return branch; +} + +void XEmitter::J_CC(CCFlags conditionCode, const u8* addr, bool force5bytes) +{ +    u64 fn = (u64)addr; +    s64 distance = (s64)(fn - ((u64)code + 2)); +    if (distance < -0x80 || distance >= 0x80 || force5bytes) +    { +        distance = (s64)(fn - ((u64)code + 6)); +        ASSERT_MSG( +                     distance >= -0x80000000LL && distance < 0x80000000LL, +                     "Jump target too far away, needs indirect register"); +        Write8(0x0F); +        Write8(0x80 + conditionCode); +        Write32((u32)(s32)distance); +    } +    else +    { +        Write8(0x70 + conditionCode); +        Write8((u8)(s8)distance); +    } +} + +void XEmitter::SetJumpTarget(const FixupBranch &branch) +{ +    if (branch.type == 0) +    { +        s64 distance = (s64)(code - branch.ptr); +        ASSERT_MSG(distance >= -0x80 && distance < 0x80, "Jump target too far away, needs force5Bytes = true"); +        branch.ptr[-1] = (u8)(s8)distance; +    } +    else if (branch.type == 1) +    { +        s64 distance = (s64)(code - branch.ptr); +        ASSERT_MSG(distance >= -0x80000000LL && distance < 0x80000000LL, "Jump target too far away, needs indirect register"); +        ((s32*)branch.ptr)[-1] = (s32)distance; +    } +} + +// INC/DEC considered harmful on newer CPUs due to partial flag set. +// Use ADD, SUB instead. 
+ +/* +void XEmitter::INC(int bits, OpArg arg) +{ +    if (arg.IsImm()) ASSERT_MSG(0, "INC - Imm argument"); +    arg.operandReg = 0; +    if (bits == 16) {Write8(0x66);} +    arg.WriteRex(this, bits, bits); +    Write8(bits == 8 ? 0xFE : 0xFF); +    arg.WriteRest(this); +} +void XEmitter::DEC(int bits, OpArg arg) +{ +    if (arg.IsImm()) ASSERT_MSG(0, "DEC - Imm argument"); +    arg.operandReg = 1; +    if (bits == 16) {Write8(0x66);} +    arg.WriteRex(this, bits, bits); +    Write8(bits == 8 ? 0xFE : 0xFF); +    arg.WriteRest(this); +} +*/ + +//Single byte opcodes +//There is no PUSHAD/POPAD in 64-bit mode. +void XEmitter::INT3() {Write8(0xCC);} +void XEmitter::RET()  {Write8(0xC3);} +void XEmitter::RET_FAST()  {Write8(0xF3); Write8(0xC3);} //two-byte return (rep ret) - recommended by AMD optimization manual for the case of jumping to a ret + +// The first sign of decadence: optimized NOPs. +void XEmitter::NOP(size_t size) +{ +    DEBUG_ASSERT((int)size > 0); +    while (true) +    { +        switch (size) +        { +        case 0: +            return; +        case 1: +            Write8(0x90); +            return; +        case 2: +            Write8(0x66); Write8(0x90); +            return; +        case 3: +            Write8(0x0F); Write8(0x1F); Write8(0x00); +            return; +        case 4: +            Write8(0x0F); Write8(0x1F); Write8(0x40); Write8(0x00); +            return; +        case 5: +            Write8(0x0F); Write8(0x1F); Write8(0x44); Write8(0x00); +            Write8(0x00); +            return; +        case 6: +            Write8(0x66); Write8(0x0F); Write8(0x1F); Write8(0x44); +            Write8(0x00); Write8(0x00); +            return; +        case 7: +            Write8(0x0F); Write8(0x1F); Write8(0x80); Write8(0x00); +            Write8(0x00); Write8(0x00); Write8(0x00); +            return; +        case 8: +            Write8(0x0F); Write8(0x1F); Write8(0x84); Write8(0x00); +            Write8(0x00); Write8(0x00); 
Write8(0x00); Write8(0x00); +            return; +        case 9: +            Write8(0x66); Write8(0x0F); Write8(0x1F); Write8(0x84); +            Write8(0x00); Write8(0x00); Write8(0x00); Write8(0x00); +            Write8(0x00); +            return; +        case 10: +            Write8(0x66); Write8(0x66); Write8(0x0F); Write8(0x1F); +            Write8(0x84); Write8(0x00); Write8(0x00); Write8(0x00); +            Write8(0x00); Write8(0x00); +            return; +        default: +            // Even though x86 instructions are allowed to be up to 15 bytes long, +            // AMD advises against using NOPs longer than 11 bytes because they +            // carry a performance penalty on CPUs older than AMD family 16h. +            Write8(0x66); Write8(0x66); Write8(0x66); Write8(0x0F); +            Write8(0x1F); Write8(0x84); Write8(0x00); Write8(0x00); +            Write8(0x00); Write8(0x00); Write8(0x00); +            size -= 11; +            continue; +        } +    } +} + +void XEmitter::PAUSE() {Write8(0xF3); NOP();} //use in tight spinloops for energy saving on some cpu +void XEmitter::CLC()  {CheckFlags(); Write8(0xF8);} //clear carry +void XEmitter::CMC()  {CheckFlags(); Write8(0xF5);} //flip carry +void XEmitter::STC()  {CheckFlags(); Write8(0xF9);} //set carry + +//TODO: xchg ah, al ??? +void XEmitter::XCHG_AHAL() +{ +    Write8(0x86); +    Write8(0xe0); +    // alt. 86 c4 +} + +//These two can not be executed on early Intel 64-bit CPU:s, only on AMD! 
+void XEmitter::LAHF() {Write8(0x9F);} +void XEmitter::SAHF() {CheckFlags(); Write8(0x9E);} + +void XEmitter::PUSHF() {Write8(0x9C);} +void XEmitter::POPF()  {CheckFlags(); Write8(0x9D);} + +void XEmitter::LFENCE() {Write8(0x0F); Write8(0xAE); Write8(0xE8);} +void XEmitter::MFENCE() {Write8(0x0F); Write8(0xAE); Write8(0xF0);} +void XEmitter::SFENCE() {Write8(0x0F); Write8(0xAE); Write8(0xF8);} + +void XEmitter::WriteSimple1Byte(int bits, u8 byte, X64Reg reg) +{ +    if (bits == 16) +        Write8(0x66); +    Rex(bits == 64, 0, 0, (int)reg >> 3); +    Write8(byte + ((int)reg & 7)); +} + +void XEmitter::WriteSimple2Byte(int bits, u8 byte1, u8 byte2, X64Reg reg) +{ +    if (bits == 16) +        Write8(0x66); +    Rex(bits==64, 0, 0, (int)reg >> 3); +    Write8(byte1); +    Write8(byte2 + ((int)reg & 7)); +} + +void XEmitter::CWD(int bits) +{ +    if (bits == 16) +        Write8(0x66); +    Rex(bits == 64, 0, 0, 0); +    Write8(0x99); +} + +void XEmitter::CBW(int bits) +{ +    if (bits == 8) +        Write8(0x66); +    Rex(bits == 32, 0, 0, 0); +    Write8(0x98); +} + +//Simple opcodes + + +//push/pop do not need wide to be 64-bit +void XEmitter::PUSH(X64Reg reg) {WriteSimple1Byte(32, 0x50, reg);} +void XEmitter::POP(X64Reg reg)  {WriteSimple1Byte(32, 0x58, reg);} + +void XEmitter::PUSH(int bits, const OpArg ®) +{ +    if (reg.IsSimpleReg()) +        PUSH(reg.GetSimpleReg()); +    else if (reg.IsImm()) +    { +        switch (reg.GetImmBits()) +        { +        case 8: +            Write8(0x6A); +            Write8((u8)(s8)reg.offset); +            break; +        case 16: +            Write8(0x66); +            Write8(0x68); +            Write16((u16)(s16)(s32)reg.offset); +            break; +        case 32: +            Write8(0x68); +            Write32((u32)reg.offset); +            break; +        default: +            ASSERT_MSG(0, "PUSH - Bad imm bits"); +            break; +        } +    } +    else +    { +        if (bits == 16) +            
Write8(0x66); +        reg.WriteRex(this, bits, bits); +        Write8(0xFF); +        reg.WriteRest(this, 0, (X64Reg)6); +    } +} + +void XEmitter::POP(int /*bits*/, const OpArg ®) +{ +    if (reg.IsSimpleReg()) +        POP(reg.GetSimpleReg()); +    else +        ASSERT_MSG(0, "POP - Unsupported encoding"); +} + +void XEmitter::BSWAP(int bits, X64Reg reg) +{ +    if (bits >= 32) +    { +        WriteSimple2Byte(bits, 0x0F, 0xC8, reg); +    } +    else if (bits == 16) +    { +        ROL(16, R(reg), Imm8(8)); +    } +    else if (bits == 8) +    { +        // Do nothing - can't bswap a single byte... +    } +    else +    { +        ASSERT_MSG(0, "BSWAP - Wrong number of bits"); +    } +} + +// Undefined opcode - reserved +// If we ever need a way to always cause a non-breakpoint hard exception... +void XEmitter::UD2() +{ +    Write8(0x0F); +    Write8(0x0B); +} + +void XEmitter::PREFETCH(PrefetchLevel level, OpArg arg) +{ +    ASSERT_MSG(!arg.IsImm(), "PREFETCH - Imm argument"); +    arg.operandReg = (u8)level; +    arg.WriteRex(this, 0, 0); +    Write8(0x0F); +    Write8(0x18); +    arg.WriteRest(this); +} + +void XEmitter::SETcc(CCFlags flag, OpArg dest) +{ +    ASSERT_MSG(!dest.IsImm(), "SETcc - Imm argument"); +    dest.operandReg = 0; +    dest.WriteRex(this, 0, 8); +    Write8(0x0F); +    Write8(0x90 + (u8)flag); +    dest.WriteRest(this); +} + +void XEmitter::CMOVcc(int bits, X64Reg dest, OpArg src, CCFlags flag) +{ +    ASSERT_MSG(!src.IsImm(), "CMOVcc - Imm argument"); +    ASSERT_MSG(bits != 8, "CMOVcc - 8 bits unsupported"); +    if (bits == 16) +        Write8(0x66); +    src.operandReg = dest; +    src.WriteRex(this, bits, bits); +    Write8(0x0F); +    Write8(0x40 + (u8)flag); +    src.WriteRest(this); +} + +void XEmitter::WriteMulDivType(int bits, OpArg src, int ext) +{ +    ASSERT_MSG(!src.IsImm(), "WriteMulDivType - Imm argument"); +    CheckFlags(); +    src.operandReg = ext; +    if (bits == 16) +        Write8(0x66); +    src.WriteRex(this, 
bits, bits, 0); +    if (bits == 8) +    { +        Write8(0xF6); +    } +    else +    { +        Write8(0xF7); +    } +    src.WriteRest(this); +} + +void XEmitter::MUL(int bits, OpArg src)  {WriteMulDivType(bits, src, 4);} +void XEmitter::DIV(int bits, OpArg src)  {WriteMulDivType(bits, src, 6);} +void XEmitter::IMUL(int bits, OpArg src) {WriteMulDivType(bits, src, 5);} +void XEmitter::IDIV(int bits, OpArg src) {WriteMulDivType(bits, src, 7);} +void XEmitter::NEG(int bits, OpArg src)  {WriteMulDivType(bits, src, 3);} +void XEmitter::NOT(int bits, OpArg src)  {WriteMulDivType(bits, src, 2);} + +void XEmitter::WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2, bool rep) +{ +    ASSERT_MSG(!src.IsImm(), "WriteBitSearchType - Imm argument"); +    CheckFlags(); +    src.operandReg = (u8)dest; +    if (bits == 16) +        Write8(0x66); +    if (rep) +        Write8(0xF3); +    src.WriteRex(this, bits, bits); +    Write8(0x0F); +    Write8(byte2); +    src.WriteRest(this); +} + +void XEmitter::MOVNTI(int bits, OpArg dest, X64Reg src) +{ +    if (bits <= 16) +        ASSERT_MSG(0, "MOVNTI - bits<=16"); +    WriteBitSearchType(bits, src, dest, 0xC3); +} + +void XEmitter::BSF(int bits, X64Reg dest, OpArg src) {WriteBitSearchType(bits,dest,src,0xBC);} //bottom bit to top bit +void XEmitter::BSR(int bits, X64Reg dest, OpArg src) {WriteBitSearchType(bits,dest,src,0xBD);} //top bit to bottom bit + +void XEmitter::TZCNT(int bits, X64Reg dest, OpArg src) +{ +    CheckFlags(); +    if (!Common::cpu_info.bBMI1) +        ASSERT_MSG(0, "Trying to use BMI1 on a system that doesn't support it. Bad programmer."); +    WriteBitSearchType(bits, dest, src, 0xBC, true); +} +void XEmitter::LZCNT(int bits, X64Reg dest, OpArg src) +{ +    CheckFlags(); +    if (!Common::cpu_info.bLZCNT) +        ASSERT_MSG(0, "Trying to use LZCNT on a system that doesn't support it. 
Bad programmer."); +    WriteBitSearchType(bits, dest, src, 0xBD, true); +} + +void XEmitter::MOVSX(int dbits, int sbits, X64Reg dest, OpArg src) +{ +    ASSERT_MSG(!src.IsImm(), "MOVSX - Imm argument"); +    if (dbits == sbits) +    { +        MOV(dbits, R(dest), src); +        return; +    } +    src.operandReg = (u8)dest; +    if (dbits == 16) +        Write8(0x66); +    src.WriteRex(this, dbits, sbits); +    if (sbits == 8) +    { +        Write8(0x0F); +        Write8(0xBE); +    } +    else if (sbits == 16) +    { +        Write8(0x0F); +        Write8(0xBF); +    } +    else if (sbits == 32 && dbits == 64) +    { +        Write8(0x63); +    } +    else +    { +        Crash(); +    } +    src.WriteRest(this); +} + +void XEmitter::MOVZX(int dbits, int sbits, X64Reg dest, OpArg src) +{ +    ASSERT_MSG(!src.IsImm(), "MOVZX - Imm argument"); +    if (dbits == sbits) +    { +        MOV(dbits, R(dest), src); +        return; +    } +    src.operandReg = (u8)dest; +    if (dbits == 16) +        Write8(0x66); +    //the 32bit result is automatically zero extended to 64bit +    src.WriteRex(this, dbits == 64 ? 
32 : dbits, sbits); +    if (sbits == 8) +    { +        Write8(0x0F); +        Write8(0xB6); +    } +    else if (sbits == 16) +    { +        Write8(0x0F); +        Write8(0xB7); +    } +    else if (sbits == 32 && dbits == 64) +    { +        Write8(0x8B); +    } +    else +    { +        ASSERT_MSG(0, "MOVZX - Invalid size"); +    } +    src.WriteRest(this); +} + +void XEmitter::MOVBE(int bits, const OpArg& dest, const OpArg& src) +{ +    ASSERT_MSG(Common::cpu_info.bMOVBE, "Generating MOVBE on a system that does not support it."); +    if (bits == 8) +    { +        MOV(bits, dest, src); +        return; +    } + +    if (bits == 16) +        Write8(0x66); + +    if (dest.IsSimpleReg()) +    { +        ASSERT_MSG(!src.IsSimpleReg() && !src.IsImm(), "MOVBE: Loading from !mem"); +        src.WriteRex(this, bits, bits, dest.GetSimpleReg()); +        Write8(0x0F); Write8(0x38); Write8(0xF0); +        src.WriteRest(this, 0, dest.GetSimpleReg()); +    } +    else if (src.IsSimpleReg()) +    { +        ASSERT_MSG(!dest.IsSimpleReg() && !dest.IsImm(), "MOVBE: Storing to !mem"); +        dest.WriteRex(this, bits, bits, src.GetSimpleReg()); +        Write8(0x0F); Write8(0x38); Write8(0xF1); +        dest.WriteRest(this, 0, src.GetSimpleReg()); +    } +    else +    { +        ASSERT_MSG(0, "MOVBE: Not loading or storing to mem"); +    } +} + + +void XEmitter::LEA(int bits, X64Reg dest, OpArg src) +{ +    ASSERT_MSG(!src.IsImm(), "LEA - Imm argument"); +    src.operandReg = (u8)dest; +    if (bits == 16) +        Write8(0x66); //TODO: performance warning +    src.WriteRex(this, bits, bits); +    Write8(0x8D); +    src.WriteRest(this, 0, INVALID_REG, bits == 64); +} + +//shift can be either imm8 or cl +void XEmitter::WriteShift(int bits, OpArg dest, OpArg &shift, int ext) +{ +    CheckFlags(); +    bool writeImm = false; +    if (dest.IsImm()) +    { +        ASSERT_MSG(0, "WriteShift - can't shift imms"); +    } +    if ((shift.IsSimpleReg() && shift.GetSimpleReg() != 
ECX) || (shift.IsImm() && shift.GetImmBits() != 8)) +    { +        ASSERT_MSG(0, "WriteShift - illegal argument"); +    } +    dest.operandReg = ext; +    if (bits == 16) +        Write8(0x66); +    dest.WriteRex(this, bits, bits, 0); +    if (shift.GetImmBits() == 8) +    { +        //ok an imm +        u8 imm = (u8)shift.offset; +        if (imm == 1) +        { +            Write8(bits == 8 ? 0xD0 : 0xD1); +        } +        else +        { +            writeImm = true; +            Write8(bits == 8 ? 0xC0 : 0xC1); +        } +    } +    else +    { +        Write8(bits == 8 ? 0xD2 : 0xD3); +    } +    dest.WriteRest(this, writeImm ? 1 : 0); +    if (writeImm) +        Write8((u8)shift.offset); +} + +// large rotates and shift are slower on intel than amd +// intel likes to rotate by 1, and the op is smaller too +void XEmitter::ROL(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 0);} +void XEmitter::ROR(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 1);} +void XEmitter::RCL(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 2);} +void XEmitter::RCR(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 3);} +void XEmitter::SHL(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 4);} +void XEmitter::SHR(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 5);} +void XEmitter::SAR(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 7);} + +// index can be either imm8 or register, don't use memory destination because it's slow +void XEmitter::WriteBitTest(int bits, OpArg &dest, OpArg &index, int ext) +{ +    CheckFlags(); +    if (dest.IsImm()) +    { +        ASSERT_MSG(0, "WriteBitTest - can't test imms"); +    } +    if ((index.IsImm() && index.GetImmBits() != 8)) +    { +        ASSERT_MSG(0, "WriteBitTest - illegal argument"); +    } +    if (bits == 16) +        Write8(0x66); +    if (index.IsImm()) +    { +        dest.WriteRex(this, bits, bits); +       
 Write8(0x0F); Write8(0xBA); +        dest.WriteRest(this, 1, (X64Reg)ext); +        Write8((u8)index.offset); +    } +    else +    { +        X64Reg operand = index.GetSimpleReg(); +        dest.WriteRex(this, bits, bits, operand); +        Write8(0x0F); Write8(0x83 + 8*ext); +        dest.WriteRest(this, 1, operand); +    } +} + +void XEmitter::BT(int bits, OpArg dest, OpArg index)  {WriteBitTest(bits, dest, index, 4);} +void XEmitter::BTS(int bits, OpArg dest, OpArg index) {WriteBitTest(bits, dest, index, 5);} +void XEmitter::BTR(int bits, OpArg dest, OpArg index) {WriteBitTest(bits, dest, index, 6);} +void XEmitter::BTC(int bits, OpArg dest, OpArg index) {WriteBitTest(bits, dest, index, 7);} + +//shift can be either imm8 or cl +void XEmitter::SHRD(int bits, OpArg dest, OpArg src, OpArg shift) +{ +    CheckFlags(); +    if (dest.IsImm()) +    { +        ASSERT_MSG(0, "SHRD - can't use imms as destination"); +    } +    if (!src.IsSimpleReg()) +    { +        ASSERT_MSG(0, "SHRD - must use simple register as source"); +    } +    if ((shift.IsSimpleReg() && shift.GetSimpleReg() != ECX) || (shift.IsImm() && shift.GetImmBits() != 8)) +    { +        ASSERT_MSG(0, "SHRD - illegal shift"); +    } +    if (bits == 16) +        Write8(0x66); +    X64Reg operand = src.GetSimpleReg(); +    dest.WriteRex(this, bits, bits, operand); +    if (shift.GetImmBits() == 8) +    { +        Write8(0x0F); Write8(0xAC); +        dest.WriteRest(this, 1, operand); +        Write8((u8)shift.offset); +    } +    else +    { +        Write8(0x0F); Write8(0xAD); +        dest.WriteRest(this, 0, operand); +    } +} + +void XEmitter::SHLD(int bits, OpArg dest, OpArg src, OpArg shift) +{ +    CheckFlags(); +    if (dest.IsImm()) +    { +        ASSERT_MSG(0, "SHLD - can't use imms as destination"); +    } +    if (!src.IsSimpleReg()) +    { +        ASSERT_MSG(0, "SHLD - must use simple register as source"); +    } +    if ((shift.IsSimpleReg() && shift.GetSimpleReg() != ECX) || 
(shift.IsImm() && shift.GetImmBits() != 8)) +    { +        ASSERT_MSG(0, "SHLD - illegal shift"); +    } +    if (bits == 16) +        Write8(0x66); +    X64Reg operand = src.GetSimpleReg(); +    dest.WriteRex(this, bits, bits, operand); +    if (shift.GetImmBits() == 8) +    { +        Write8(0x0F); Write8(0xA4); +        dest.WriteRest(this, 1, operand); +        Write8((u8)shift.offset); +    } +    else +    { +        Write8(0x0F); Write8(0xA5); +        dest.WriteRest(this, 0, operand); +    } +} + +void OpArg::WriteSingleByteOp(XEmitter *emit, u8 op, X64Reg _operandReg, int bits) +{ +    if (bits == 16) +        emit->Write8(0x66); + +    this->operandReg = (u8)_operandReg; +    WriteRex(emit, bits, bits); +    emit->Write8(op); +    WriteRest(emit); +} + +//operand can either be immediate or register +void OpArg::WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg &operand, int bits) const +{ +    X64Reg _operandReg; +    if (IsImm()) +    { +        ASSERT_MSG(0, "WriteNormalOp - Imm argument, wrong order"); +    } + +    if (bits == 16) +        emit->Write8(0x66); + +    int immToWrite = 0; + +    if (operand.IsImm()) +    { +        WriteRex(emit, bits, bits); + +        if (!toRM) +        { +            ASSERT_MSG(0, "WriteNormalOp - Writing to Imm (!toRM)"); +        } + +        if (operand.scale == SCALE_IMM8 && bits == 8) +        { +            // op al, imm8 +            if (!scale && offsetOrBaseReg == AL && normalops[op].eaximm8 != 0xCC) +            { +                emit->Write8(normalops[op].eaximm8); +                emit->Write8((u8)operand.offset); +                return; +            } +            // mov reg, imm8 +            if (!scale && op == nrmMOV) +            { +                emit->Write8(0xB0 + (offsetOrBaseReg & 7)); +                emit->Write8((u8)operand.offset); +                return; +            } +            // op r/m8, imm8 +            emit->Write8(normalops[op].imm8); +            immToWrite = 
8; +        } +        else if ((operand.scale == SCALE_IMM16 && bits == 16) || +                 (operand.scale == SCALE_IMM32 && bits == 32) || +                 (operand.scale == SCALE_IMM32 && bits == 64)) +        { +            // Try to save immediate size if we can, but first check to see +            // if the instruction supports simm8. +            // op r/m, imm8 +            if (normalops[op].simm8 != 0xCC && +                ((operand.scale == SCALE_IMM16 && (s16)operand.offset == (s8)operand.offset) || +                 (operand.scale == SCALE_IMM32 && (s32)operand.offset == (s8)operand.offset))) +            { +                emit->Write8(normalops[op].simm8); +                immToWrite = 8; +            } +            else +            { +                // mov reg, imm +                if (!scale && op == nrmMOV && bits != 64) +                { +                    emit->Write8(0xB8 + (offsetOrBaseReg & 7)); +                    if (bits == 16) +                        emit->Write16((u16)operand.offset); +                    else +                        emit->Write32((u32)operand.offset); +                    return; +                } +                // op eax, imm +                if (!scale && offsetOrBaseReg == EAX && normalops[op].eaximm32 != 0xCC) +                { +                    emit->Write8(normalops[op].eaximm32); +                    if (bits == 16) +                        emit->Write16((u16)operand.offset); +                    else +                        emit->Write32((u32)operand.offset); +                    return; +                } +                // op r/m, imm +                emit->Write8(normalops[op].imm32); +                immToWrite = bits == 16 ? 
16 : 32; +            } +        } +        else if ((operand.scale == SCALE_IMM8 && bits == 16) || +                 (operand.scale == SCALE_IMM8 && bits == 32) || +                 (operand.scale == SCALE_IMM8 && bits == 64)) +        { +            // op r/m, imm8 +            emit->Write8(normalops[op].simm8); +            immToWrite = 8; +        } +        else if (operand.scale == SCALE_IMM64 && bits == 64) +        { +            if (scale) +            { +                ASSERT_MSG(0, "WriteNormalOp - MOV with 64-bit imm requres register destination"); +            } +            // mov reg64, imm64 +            else if (op == nrmMOV) +            { +                emit->Write8(0xB8 + (offsetOrBaseReg & 7)); +                emit->Write64((u64)operand.offset); +                return; +            } +            ASSERT_MSG(0, "WriteNormalOp - Only MOV can take 64-bit imm"); +        } +        else +        { +            ASSERT_MSG(0, "WriteNormalOp - Unhandled case"); +        } +        _operandReg = (X64Reg)normalops[op].ext; //pass extension in REG of ModRM +    } +    else +    { +        _operandReg = (X64Reg)operand.offsetOrBaseReg; +        WriteRex(emit, bits, bits, _operandReg); +        // op r/m, reg +        if (toRM) +        { +            emit->Write8(bits == 8 ? normalops[op].toRm8 : normalops[op].toRm32); +        } +        // op reg, r/m +        else +        { +            emit->Write8(bits == 8 ? 
normalops[op].fromRm8 : normalops[op].fromRm32); +        } +    } +    WriteRest(emit, immToWrite >> 3, _operandReg); +    switch (immToWrite) +    { +    case 0: +        break; +    case 8: +        emit->Write8((u8)operand.offset); +        break; +    case 16: +        emit->Write16((u16)operand.offset); +        break; +    case 32: +        emit->Write32((u32)operand.offset); +        break; +    default: +        ASSERT_MSG(0, "WriteNormalOp - Unhandled case"); +    } +} + +void XEmitter::WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2) +{ +    if (a1.IsImm()) +    { +        //Booh! Can't write to an imm +        ASSERT_MSG(0, "WriteNormalOp - a1 cannot be imm"); +        return; +    } +    if (a2.IsImm()) +    { +        a1.WriteNormalOp(emit, true, op, a2, bits); +    } +    else +    { +        if (a1.IsSimpleReg()) +        { +            a2.WriteNormalOp(emit, false, op, a1, bits); +        } +        else +        { +            ASSERT_MSG(a2.IsSimpleReg() || a2.IsImm(), "WriteNormalOp - a1 and a2 cannot both be memory"); +            a1.WriteNormalOp(emit, true, op, a2, bits); +        } +    } +} + +void XEmitter::ADD (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmADD, a1, a2);} +void XEmitter::ADC (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmADC, a1, a2);} +void XEmitter::SUB (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmSUB, a1, a2);} +void XEmitter::SBB (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmSBB, a1, a2);} +void XEmitter::AND (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmAND, a1, a2);} +void XEmitter::OR  (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmOR , a1, a2);} +void XEmitter::XOR (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); 
WriteNormalOp(this, bits, nrmXOR, a1, a2);} +void XEmitter::MOV (int bits, const OpArg &a1, const OpArg &a2) +{ +    if (a1.IsSimpleReg() && a2.IsSimpleReg() && a1.GetSimpleReg() == a2.GetSimpleReg()) +        LOG_ERROR(Common, "Redundant MOV @ %p - bug in JIT?", code); +    WriteNormalOp(this, bits, nrmMOV, a1, a2); +} +void XEmitter::TEST(int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmTEST, a1, a2);} +void XEmitter::CMP (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmCMP, a1, a2);} +void XEmitter::XCHG(int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(this, bits, nrmXCHG, a1, a2);} + +void XEmitter::IMUL(int bits, X64Reg regOp, OpArg a1, OpArg a2) +{ +    CheckFlags(); +    if (bits == 8) +    { +        ASSERT_MSG(0, "IMUL - illegal bit size!"); +        return; +    } + +    if (a1.IsImm()) +    { +        ASSERT_MSG(0, "IMUL - second arg cannot be imm!"); +        return; +    } + +    if (!a2.IsImm()) +    { +        ASSERT_MSG(0, "IMUL - third arg must be imm!"); +        return; +    } + +    if (bits == 16) +        Write8(0x66); +    a1.WriteRex(this, bits, bits, regOp); + +    if (a2.GetImmBits() == 8 || +        (a2.GetImmBits() == 16 && (s8)a2.offset == (s16)a2.offset) || +        (a2.GetImmBits() == 32 && (s8)a2.offset == (s32)a2.offset)) +    { +        Write8(0x6B); +        a1.WriteRest(this, 1, regOp); +        Write8((u8)a2.offset); +    } +    else +    { +        Write8(0x69); +        if (a2.GetImmBits() == 16 && bits == 16) +        { +            a1.WriteRest(this, 2, regOp); +            Write16((u16)a2.offset); +        } +        else if (a2.GetImmBits() == 32 && (bits == 32 || bits == 64)) +        { +            a1.WriteRest(this, 4, regOp); +            Write32((u32)a2.offset); +        } +        else +        { +            ASSERT_MSG(0, "IMUL - unhandled case!"); +        } +    } +} + +void XEmitter::IMUL(int bits, X64Reg regOp, OpArg a) +{ + 
   CheckFlags(); +    if (bits == 8) +    { +        ASSERT_MSG(0, "IMUL - illegal bit size!"); +        return; +    } + +    if (a.IsImm()) +    { +        IMUL(bits, regOp, R(regOp), a) ; +        return; +    } + +    if (bits == 16) +        Write8(0x66); +    a.WriteRex(this, bits, bits, regOp); +    Write8(0x0F); +    Write8(0xAF); +    a.WriteRest(this, 0, regOp); +} + + +void XEmitter::WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes) +{ +    if (opPrefix) +        Write8(opPrefix); +    arg.operandReg = regOp; +    arg.WriteRex(this, 0, 0); +    Write8(0x0F); +    if (op > 0xFF) +        Write8((op >> 8) & 0xFF); +    Write8(op & 0xFF); +    arg.WriteRest(this, extrabytes); +} + +void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes) +{ +    WriteAVXOp(opPrefix, op, regOp, INVALID_REG, arg, extrabytes); +} + +static int GetVEXmmmmm(u16 op) +{ +    // Currently, only 0x38 and 0x3A are used as secondary escape byte. +    if ((op >> 8) == 0x3A) +        return 3; +    else if ((op >> 8) == 0x38) +        return 2; +    else +        return 1; +} + +static int GetVEXpp(u8 opPrefix) +{ +    if (opPrefix == 0x66) +        return 1; +    else if (opPrefix == 0xF3) +        return 2; +    else if (opPrefix == 0xF2) +        return 3; +    else +        return 0; +} + +void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes) +{ +    if (!Common::cpu_info.bAVX) +        ASSERT_MSG(0, "Trying to use AVX on a system that doesn't support it. 
Bad programmer."); +    int mmmmm = GetVEXmmmmm(op); +    int pp = GetVEXpp(opPrefix); +    // FIXME: we currently don't support 256-bit instructions, and "size" is not the vector size here +    arg.WriteVex(this, regOp1, regOp2, 0, pp, mmmmm); +    Write8(op & 0xFF); +    arg.WriteRest(this, extrabytes, regOp1); +} + +// Like the above, but more general; covers GPR-based VEX operations, like BMI1/2 +void XEmitter::WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes) +{ +    if (size != 32 && size != 64) +        ASSERT_MSG(0, "VEX GPR instructions only support 32-bit and 64-bit modes!"); +    int mmmmm = GetVEXmmmmm(op); +    int pp = GetVEXpp(opPrefix); +    arg.WriteVex(this, regOp1, regOp2, 0, pp, mmmmm, size == 64); +    Write8(op & 0xFF); +    arg.WriteRest(this, extrabytes, regOp1); +} + +void XEmitter::WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes) +{ +    CheckFlags(); +    if (!Common::cpu_info.bBMI1) +        ASSERT_MSG(0, "Trying to use BMI1 on a system that doesn't support it. Bad programmer."); +    WriteVEXOp(size, opPrefix, op, regOp1, regOp2, arg, extrabytes); +} + +void XEmitter::WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes) +{ +    CheckFlags(); +    if (!Common::cpu_info.bBMI2) +        ASSERT_MSG(0, "Trying to use BMI2 on a system that doesn't support it. 
Bad programmer."); +    WriteVEXOp(size, opPrefix, op, regOp1, regOp2, arg, extrabytes); +} + +void XEmitter::MOVD_xmm(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x6E, dest, arg, 0);} +void XEmitter::MOVD_xmm(const OpArg &arg, X64Reg src) {WriteSSEOp(0x66, 0x7E, src, arg, 0);} + +void XEmitter::MOVQ_xmm(X64Reg dest, OpArg arg) +{ +#ifdef ARCHITECTURE_X64 +        // Alternate encoding +        // This does not display correctly in MSVC's debugger, it thinks it's a MOVD +        arg.operandReg = dest; +        Write8(0x66); +        arg.WriteRex(this, 64, 0); +        Write8(0x0f); +        Write8(0x6E); +        arg.WriteRest(this, 0); +#else +        arg.operandReg = dest; +        Write8(0xF3); +        Write8(0x0f); +        Write8(0x7E); +        arg.WriteRest(this, 0); +#endif +} + +void XEmitter::MOVQ_xmm(OpArg arg, X64Reg src) +{ +    if (src > 7 || arg.IsSimpleReg()) +    { +        // Alternate encoding +        // This does not display correctly in MSVC's debugger, it thinks it's a MOVD +        arg.operandReg = src; +        Write8(0x66); +        arg.WriteRex(this, 64, 0); +        Write8(0x0f); +        Write8(0x7E); +        arg.WriteRest(this, 0); +    } +    else +    { +        arg.operandReg = src; +        arg.WriteRex(this, 0, 0); +        Write8(0x66); +        Write8(0x0f); +        Write8(0xD6); +        arg.WriteRest(this, 0); +    } +} + +void XEmitter::WriteMXCSR(OpArg arg, int ext) +{ +    if (arg.IsImm() || arg.IsSimpleReg()) +        ASSERT_MSG(0, "MXCSR - invalid operand"); + +    arg.operandReg = ext; +    arg.WriteRex(this, 0, 0); +    Write8(0x0F); +    Write8(0xAE); +    arg.WriteRest(this); +} + +void XEmitter::STMXCSR(OpArg memloc) {WriteMXCSR(memloc, 3);} +void XEmitter::LDMXCSR(OpArg memloc) {WriteMXCSR(memloc, 2);} + +void XEmitter::MOVNTDQ(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVNTDQ, regOp, arg);} +void XEmitter::MOVNTPS(OpArg arg, X64Reg regOp) {WriteSSEOp(0x00, sseMOVNTP, regOp, arg);} +void 
XEmitter::MOVNTPD(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVNTP, regOp, arg);} + +void XEmitter::ADDSS(X64Reg regOp, OpArg arg)   {WriteSSEOp(0xF3, sseADD, regOp, arg);} +void XEmitter::ADDSD(X64Reg regOp, OpArg arg)   {WriteSSEOp(0xF2, sseADD, regOp, arg);} +void XEmitter::SUBSS(X64Reg regOp, OpArg arg)   {WriteSSEOp(0xF3, sseSUB, regOp, arg);} +void XEmitter::SUBSD(X64Reg regOp, OpArg arg)   {WriteSSEOp(0xF2, sseSUB, regOp, arg);} +void XEmitter::CMPSS(X64Reg regOp, OpArg arg, u8 compare)   {WriteSSEOp(0xF3, sseCMP, regOp, arg, 1); Write8(compare);} +void XEmitter::CMPSD(X64Reg regOp, OpArg arg, u8 compare)   {WriteSSEOp(0xF2, sseCMP, regOp, arg, 1); Write8(compare);} +void XEmitter::MULSS(X64Reg regOp, OpArg arg)   {WriteSSEOp(0xF3, sseMUL, regOp, arg);} +void XEmitter::MULSD(X64Reg regOp, OpArg arg)   {WriteSSEOp(0xF2, sseMUL, regOp, arg);} +void XEmitter::DIVSS(X64Reg regOp, OpArg arg)   {WriteSSEOp(0xF3, sseDIV, regOp, arg);} +void XEmitter::DIVSD(X64Reg regOp, OpArg arg)   {WriteSSEOp(0xF2, sseDIV, regOp, arg);} +void XEmitter::MINSS(X64Reg regOp, OpArg arg)   {WriteSSEOp(0xF3, sseMIN, regOp, arg);} +void XEmitter::MINSD(X64Reg regOp, OpArg arg)   {WriteSSEOp(0xF2, sseMIN, regOp, arg);} +void XEmitter::MAXSS(X64Reg regOp, OpArg arg)   {WriteSSEOp(0xF3, sseMAX, regOp, arg);} +void XEmitter::MAXSD(X64Reg regOp, OpArg arg)   {WriteSSEOp(0xF2, sseMAX, regOp, arg);} +void XEmitter::SQRTSS(X64Reg regOp, OpArg arg)  {WriteSSEOp(0xF3, sseSQRT, regOp, arg);} +void XEmitter::SQRTSD(X64Reg regOp, OpArg arg)  {WriteSSEOp(0xF2, sseSQRT, regOp, arg);} +void XEmitter::RSQRTSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseRSQRT, regOp, arg);} + +void XEmitter::ADDPS(X64Reg regOp, OpArg arg)   {WriteSSEOp(0x00, sseADD, regOp, arg);} +void XEmitter::ADDPD(X64Reg regOp, OpArg arg)   {WriteSSEOp(0x66, sseADD, regOp, arg);} +void XEmitter::SUBPS(X64Reg regOp, OpArg arg)   {WriteSSEOp(0x00, sseSUB, regOp, arg);} +void XEmitter::SUBPD(X64Reg regOp, OpArg arg)   
{WriteSSEOp(0x66, sseSUB, regOp, arg);} +void XEmitter::CMPPS(X64Reg regOp, OpArg arg, u8 compare)   {WriteSSEOp(0x00, sseCMP, regOp, arg, 1); Write8(compare);} +void XEmitter::CMPPD(X64Reg regOp, OpArg arg, u8 compare)   {WriteSSEOp(0x66, sseCMP, regOp, arg, 1); Write8(compare);} +void XEmitter::ANDPS(X64Reg regOp, OpArg arg)   {WriteSSEOp(0x00, sseAND, regOp, arg);} +void XEmitter::ANDPD(X64Reg regOp, OpArg arg)   {WriteSSEOp(0x66, sseAND, regOp, arg);} +void XEmitter::ANDNPS(X64Reg regOp, OpArg arg)  {WriteSSEOp(0x00, sseANDN, regOp, arg);} +void XEmitter::ANDNPD(X64Reg regOp, OpArg arg)  {WriteSSEOp(0x66, sseANDN, regOp, arg);} +void XEmitter::ORPS(X64Reg regOp, OpArg arg)    {WriteSSEOp(0x00, sseOR, regOp, arg);} +void XEmitter::ORPD(X64Reg regOp, OpArg arg)    {WriteSSEOp(0x66, sseOR, regOp, arg);} +void XEmitter::XORPS(X64Reg regOp, OpArg arg)   {WriteSSEOp(0x00, sseXOR, regOp, arg);} +void XEmitter::XORPD(X64Reg regOp, OpArg arg)   {WriteSSEOp(0x66, sseXOR, regOp, arg);} +void XEmitter::MULPS(X64Reg regOp, OpArg arg)   {WriteSSEOp(0x00, sseMUL, regOp, arg);} +void XEmitter::MULPD(X64Reg regOp, OpArg arg)   {WriteSSEOp(0x66, sseMUL, regOp, arg);} +void XEmitter::DIVPS(X64Reg regOp, OpArg arg)   {WriteSSEOp(0x00, sseDIV, regOp, arg);} +void XEmitter::DIVPD(X64Reg regOp, OpArg arg)   {WriteSSEOp(0x66, sseDIV, regOp, arg);} +void XEmitter::MINPS(X64Reg regOp, OpArg arg)   {WriteSSEOp(0x00, sseMIN, regOp, arg);} +void XEmitter::MINPD(X64Reg regOp, OpArg arg)   {WriteSSEOp(0x66, sseMIN, regOp, arg);} +void XEmitter::MAXPS(X64Reg regOp, OpArg arg)   {WriteSSEOp(0x00, sseMAX, regOp, arg);} +void XEmitter::MAXPD(X64Reg regOp, OpArg arg)   {WriteSSEOp(0x66, sseMAX, regOp, arg);} +void XEmitter::SQRTPS(X64Reg regOp, OpArg arg)  {WriteSSEOp(0x00, sseSQRT, regOp, arg);} +void XEmitter::SQRTPD(X64Reg regOp, OpArg arg)  {WriteSSEOp(0x66, sseSQRT, regOp, arg);} +void XEmitter::RCPPS(X64Reg regOp, OpArg arg) { WriteSSEOp(0x00, sseRCP, regOp, arg); } +void 
XEmitter::RSQRTPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseRSQRT, regOp, arg);} +void XEmitter::SHUFPS(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0x00, sseSHUF, regOp, arg,1); Write8(shuffle);} +void XEmitter::SHUFPD(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0x66, sseSHUF, regOp, arg,1); Write8(shuffle);} + +void XEmitter::HADDPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseHADD, regOp, arg);} + +void XEmitter::COMISS(X64Reg regOp, OpArg arg)  {WriteSSEOp(0x00, sseCOMIS, regOp, arg);} //weird that these should be packed +void XEmitter::COMISD(X64Reg regOp, OpArg arg)  {WriteSSEOp(0x66, sseCOMIS, regOp, arg);} //ordered +void XEmitter::UCOMISS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseUCOMIS, regOp, arg);} //unordered +void XEmitter::UCOMISD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseUCOMIS, regOp, arg);} + +void XEmitter::MOVAPS(X64Reg regOp, OpArg arg)  {WriteSSEOp(0x00, sseMOVAPfromRM, regOp, arg);} +void XEmitter::MOVAPD(X64Reg regOp, OpArg arg)  {WriteSSEOp(0x66, sseMOVAPfromRM, regOp, arg);} +void XEmitter::MOVAPS(OpArg arg, X64Reg regOp)  {WriteSSEOp(0x00, sseMOVAPtoRM, regOp, arg);} +void XEmitter::MOVAPD(OpArg arg, X64Reg regOp)  {WriteSSEOp(0x66, sseMOVAPtoRM, regOp, arg);} + +void XEmitter::MOVUPS(X64Reg regOp, OpArg arg)  {WriteSSEOp(0x00, sseMOVUPfromRM, regOp, arg);} +void XEmitter::MOVUPD(X64Reg regOp, OpArg arg)  {WriteSSEOp(0x66, sseMOVUPfromRM, regOp, arg);} +void XEmitter::MOVUPS(OpArg arg, X64Reg regOp)  {WriteSSEOp(0x00, sseMOVUPtoRM, regOp, arg);} +void XEmitter::MOVUPD(OpArg arg, X64Reg regOp)  {WriteSSEOp(0x66, sseMOVUPtoRM, regOp, arg);} + +void XEmitter::MOVDQA(X64Reg regOp, OpArg arg)  {WriteSSEOp(0x66, sseMOVDQfromRM, regOp, arg);} +void XEmitter::MOVDQA(OpArg arg, X64Reg regOp)  {WriteSSEOp(0x66, sseMOVDQtoRM, regOp, arg);} +void XEmitter::MOVDQU(X64Reg regOp, OpArg arg)  {WriteSSEOp(0xF3, sseMOVDQfromRM, regOp, arg);} +void XEmitter::MOVDQU(OpArg arg, X64Reg regOp)  {WriteSSEOp(0xF3, sseMOVDQtoRM, regOp, arg);} 
+ +void XEmitter::MOVSS(X64Reg regOp, OpArg arg)   {WriteSSEOp(0xF3, sseMOVUPfromRM, regOp, arg);} +void XEmitter::MOVSD(X64Reg regOp, OpArg arg)   {WriteSSEOp(0xF2, sseMOVUPfromRM, regOp, arg);} +void XEmitter::MOVSS(OpArg arg, X64Reg regOp)   {WriteSSEOp(0xF3, sseMOVUPtoRM, regOp, arg);} +void XEmitter::MOVSD(OpArg arg, X64Reg regOp)   {WriteSSEOp(0xF2, sseMOVUPtoRM, regOp, arg);} + +void XEmitter::MOVLPS(X64Reg regOp, OpArg arg)  { WriteSSEOp(0x00, sseMOVLPfromRM, regOp, arg); } +void XEmitter::MOVLPD(X64Reg regOp, OpArg arg)  { WriteSSEOp(0x66, sseMOVLPfromRM, regOp, arg); } +void XEmitter::MOVLPS(OpArg arg, X64Reg regOp)  { WriteSSEOp(0x00, sseMOVLPtoRM, regOp, arg); } +void XEmitter::MOVLPD(OpArg arg, X64Reg regOp)  { WriteSSEOp(0x66, sseMOVLPtoRM, regOp, arg); } + +void XEmitter::MOVHPS(X64Reg regOp, OpArg arg)  { WriteSSEOp(0x00, sseMOVHPfromRM, regOp, arg); } +void XEmitter::MOVHPD(X64Reg regOp, OpArg arg)  { WriteSSEOp(0x66, sseMOVHPfromRM, regOp, arg); } +void XEmitter::MOVHPS(OpArg arg, X64Reg regOp)  { WriteSSEOp(0x00, sseMOVHPtoRM, regOp, arg); } +void XEmitter::MOVHPD(OpArg arg, X64Reg regOp)  { WriteSSEOp(0x66, sseMOVHPtoRM, regOp, arg); } + +void XEmitter::MOVHLPS(X64Reg regOp1, X64Reg regOp2) {WriteSSEOp(0x00, sseMOVHLPS, regOp1, R(regOp2));} +void XEmitter::MOVLHPS(X64Reg regOp1, X64Reg regOp2) {WriteSSEOp(0x00, sseMOVLHPS, regOp1, R(regOp2));} + +void XEmitter::CVTPS2PD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, 0x5A, regOp, arg);} +void XEmitter::CVTPD2PS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, 0x5A, regOp, arg);} + +void XEmitter::CVTSD2SS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x5A, regOp, arg);} +void XEmitter::CVTSS2SD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x5A, regOp, arg);} +void XEmitter::CVTSD2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x2D, regOp, arg);} +void XEmitter::CVTSS2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x2D, regOp, arg);} +void XEmitter::CVTSI2SD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x2A, regOp, 
arg);} +void XEmitter::CVTSI2SS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x2A, regOp, arg);} + +void XEmitter::CVTDQ2PD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0xE6, regOp, arg);} +void XEmitter::CVTDQ2PS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, 0x5B, regOp, arg);} +void XEmitter::CVTPD2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0xE6, regOp, arg);} +void XEmitter::CVTPS2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, 0x5B, regOp, arg);} + +void XEmitter::CVTTSD2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x2C, regOp, arg);} +void XEmitter::CVTTSS2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x2C, regOp, arg);} +void XEmitter::CVTTPS2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x5B, regOp, arg);} +void XEmitter::CVTTPD2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, 0xE6, regOp, arg);} + +void XEmitter::MASKMOVDQU(X64Reg dest, X64Reg src)  {WriteSSEOp(0x66, sseMASKMOVDQU, dest, R(src));} + +void XEmitter::MOVMSKPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x50, dest, arg);} +void XEmitter::MOVMSKPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x50, dest, arg);} + +void XEmitter::LDDQU(X64Reg dest, OpArg arg)    {WriteSSEOp(0xF2, sseLDDQU, dest, arg);} // For integer data only + +// THESE TWO ARE UNTESTED. 
+void XEmitter::UNPCKLPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x14, dest, arg);} +void XEmitter::UNPCKHPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x15, dest, arg);} + +void XEmitter::UNPCKLPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x14, dest, arg);} +void XEmitter::UNPCKHPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x15, dest, arg);} + +void XEmitter::MOVDDUP(X64Reg regOp, OpArg arg) +{ +    if (Common::cpu_info.bSSE3) +    { +        WriteSSEOp(0xF2, 0x12, regOp, arg); //SSE3 movddup +    } +    else +    { +        // Simulate this instruction with SSE2 instructions +        if (!arg.IsSimpleReg(regOp)) +            MOVSD(regOp, arg); +        UNPCKLPD(regOp, R(regOp)); +    } +} + +//There are a few more left + +// Also some integer instructions are missing +void XEmitter::PACKSSDW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x6B, dest, arg);} +void XEmitter::PACKSSWB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x63, dest, arg);} +void XEmitter::PACKUSWB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x67, dest, arg);} + +void XEmitter::PUNPCKLBW(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x60, dest, arg);} +void XEmitter::PUNPCKLWD(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x61, dest, arg);} +void XEmitter::PUNPCKLDQ(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x62, dest, arg);} +void XEmitter::PUNPCKLQDQ(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x6C, dest, arg);} + +void XEmitter::PSRLW(X64Reg reg, int shift) +{ +    WriteSSEOp(0x66, 0x71, (X64Reg)2, R(reg)); +    Write8(shift); +} + +void XEmitter::PSRLD(X64Reg reg, int shift) +{ +    WriteSSEOp(0x66, 0x72, (X64Reg)2, R(reg)); +    Write8(shift); +} + +void XEmitter::PSRLQ(X64Reg reg, int shift) +{ +    WriteSSEOp(0x66, 0x73, (X64Reg)2, R(reg)); +    Write8(shift); +} + +void XEmitter::PSRLQ(X64Reg reg, OpArg arg) +{ +    WriteSSEOp(0x66, 0xd3, reg, arg); +} + +void XEmitter::PSRLDQ(X64Reg reg, int shift) { +    WriteSSEOp(0x66, 0x73, (X64Reg)3, R(reg)); +    Write8(shift); +} + 
+void XEmitter::PSLLW(X64Reg reg, int shift) +{ +    WriteSSEOp(0x66, 0x71, (X64Reg)6, R(reg)); +    Write8(shift); +} + +void XEmitter::PSLLD(X64Reg reg, int shift) +{ +    WriteSSEOp(0x66, 0x72, (X64Reg)6, R(reg)); +    Write8(shift); +} + +void XEmitter::PSLLQ(X64Reg reg, int shift) +{ +    WriteSSEOp(0x66, 0x73, (X64Reg)6, R(reg)); +    Write8(shift); +} + +void XEmitter::PSLLDQ(X64Reg reg, int shift) { +    WriteSSEOp(0x66, 0x73, (X64Reg)7, R(reg)); +    Write8(shift); +} + +void XEmitter::PSRAW(X64Reg reg, int shift) +{ +    WriteSSEOp(0x66, 0x71, (X64Reg)4, R(reg)); +    Write8(shift); +} + +void XEmitter::PSRAD(X64Reg reg, int shift) +{ +    WriteSSEOp(0x66, 0x72, (X64Reg)4, R(reg)); +    Write8(shift); +} + +void XEmitter::WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes) +{ +    if (!Common::cpu_info.bSSSE3) +        ASSERT_MSG(0, "Trying to use SSSE3 on a system that doesn't support it. Bad programmer."); +    WriteSSEOp(opPrefix, op, regOp, arg, extrabytes); +} + +void XEmitter::WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes) +{ +    if (!Common::cpu_info.bSSE4_1) +        ASSERT_MSG(0, "Trying to use SSE4.1 on a system that doesn't support it. 
Bad programmer."); +    WriteSSEOp(opPrefix, op, regOp, arg, extrabytes); +} + +void XEmitter::PSHUFB(X64Reg dest, OpArg arg)   {WriteSSSE3Op(0x66, 0x3800, dest, arg);} +void XEmitter::PTEST(X64Reg dest, OpArg arg)    {WriteSSE41Op(0x66, 0x3817, dest, arg);} +void XEmitter::PACKUSDW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x382b, dest, arg);} +void XEmitter::DPPS(X64Reg dest, OpArg arg, u8 mask) {WriteSSE41Op(0x66, 0x3A40, dest, arg, 1); Write8(mask);} + +void XEmitter::PMINSB(X64Reg dest, OpArg arg)   {WriteSSE41Op(0x66, 0x3838, dest, arg);} +void XEmitter::PMINSD(X64Reg dest, OpArg arg)   {WriteSSE41Op(0x66, 0x3839, dest, arg);} +void XEmitter::PMINUW(X64Reg dest, OpArg arg)   {WriteSSE41Op(0x66, 0x383a, dest, arg);} +void XEmitter::PMINUD(X64Reg dest, OpArg arg)   {WriteSSE41Op(0x66, 0x383b, dest, arg);} +void XEmitter::PMAXSB(X64Reg dest, OpArg arg)   {WriteSSE41Op(0x66, 0x383c, dest, arg);} +void XEmitter::PMAXSD(X64Reg dest, OpArg arg)   {WriteSSE41Op(0x66, 0x383d, dest, arg);} +void XEmitter::PMAXUW(X64Reg dest, OpArg arg)   {WriteSSE41Op(0x66, 0x383e, dest, arg);} +void XEmitter::PMAXUD(X64Reg dest, OpArg arg)   {WriteSSE41Op(0x66, 0x383f, dest, arg);} + +void XEmitter::PMOVSXBW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3820, dest, arg);} +void XEmitter::PMOVSXBD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3821, dest, arg);} +void XEmitter::PMOVSXBQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3822, dest, arg);} +void XEmitter::PMOVSXWD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3823, dest, arg);} +void XEmitter::PMOVSXWQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3824, dest, arg);} +void XEmitter::PMOVSXDQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3825, dest, arg);} +void XEmitter::PMOVZXBW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3830, dest, arg);} +void XEmitter::PMOVZXBD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3831, dest, arg);} +void XEmitter::PMOVZXBQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3832, dest, arg);} 
+void XEmitter::PMOVZXWD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3833, dest, arg);} +void XEmitter::PMOVZXWQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3834, dest, arg);} +void XEmitter::PMOVZXDQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3835, dest, arg);} + +void XEmitter::PBLENDVB(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3810, dest, arg);} +void XEmitter::BLENDVPS(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3814, dest, arg);} +void XEmitter::BLENDVPD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3815, dest, arg);} +void XEmitter::BLENDPS(X64Reg dest, const OpArg& arg, u8 blend) { WriteSSE41Op(0x66, 0x3A0C, dest, arg, 1); Write8(blend); } +void XEmitter::BLENDPD(X64Reg dest, const OpArg& arg, u8 blend) { WriteSSE41Op(0x66, 0x3A0D, dest, arg, 1); Write8(blend); } + +void XEmitter::ROUNDSS(X64Reg dest, OpArg arg, u8 mode) {WriteSSE41Op(0x66, 0x3A0A, dest, arg, 1); Write8(mode);} +void XEmitter::ROUNDSD(X64Reg dest, OpArg arg, u8 mode) {WriteSSE41Op(0x66, 0x3A0B, dest, arg, 1); Write8(mode);} +void XEmitter::ROUNDPS(X64Reg dest, OpArg arg, u8 mode) {WriteSSE41Op(0x66, 0x3A08, dest, arg, 1); Write8(mode);} +void XEmitter::ROUNDPD(X64Reg dest, OpArg arg, u8 mode) {WriteSSE41Op(0x66, 0x3A09, dest, arg, 1); Write8(mode);} + +void XEmitter::PAND(X64Reg dest, OpArg arg)     {WriteSSEOp(0x66, 0xDB, dest, arg);} +void XEmitter::PANDN(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xDF, dest, arg);} +void XEmitter::PXOR(X64Reg dest, OpArg arg)     {WriteSSEOp(0x66, 0xEF, dest, arg);} +void XEmitter::POR(X64Reg dest, OpArg arg)      {WriteSSEOp(0x66, 0xEB, dest, arg);} + +void XEmitter::PADDB(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xFC, dest, arg);} +void XEmitter::PADDW(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xFD, dest, arg);} +void XEmitter::PADDD(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xFE, dest, arg);} +void XEmitter::PADDQ(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xD4, dest, arg);} + +void XEmitter::PADDSB(X64Reg dest, OpArg arg)   
{WriteSSEOp(0x66, 0xEC, dest, arg);} +void XEmitter::PADDSW(X64Reg dest, OpArg arg)   {WriteSSEOp(0x66, 0xED, dest, arg);} +void XEmitter::PADDUSB(X64Reg dest, OpArg arg)  {WriteSSEOp(0x66, 0xDC, dest, arg);} +void XEmitter::PADDUSW(X64Reg dest, OpArg arg)  {WriteSSEOp(0x66, 0xDD, dest, arg);} + +void XEmitter::PSUBB(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xF8, dest, arg);} +void XEmitter::PSUBW(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xF9, dest, arg);} +void XEmitter::PSUBD(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xFA, dest, arg);} +void XEmitter::PSUBQ(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xFB, dest, arg);} + +void XEmitter::PSUBSB(X64Reg dest, OpArg arg)   {WriteSSEOp(0x66, 0xE8, dest, arg);} +void XEmitter::PSUBSW(X64Reg dest, OpArg arg)   {WriteSSEOp(0x66, 0xE9, dest, arg);} +void XEmitter::PSUBUSB(X64Reg dest, OpArg arg)  {WriteSSEOp(0x66, 0xD8, dest, arg);} +void XEmitter::PSUBUSW(X64Reg dest, OpArg arg)  {WriteSSEOp(0x66, 0xD9, dest, arg);} + +void XEmitter::PAVGB(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xE0, dest, arg);} +void XEmitter::PAVGW(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xE3, dest, arg);} + +void XEmitter::PCMPEQB(X64Reg dest, OpArg arg)  {WriteSSEOp(0x66, 0x74, dest, arg);} +void XEmitter::PCMPEQW(X64Reg dest, OpArg arg)  {WriteSSEOp(0x66, 0x75, dest, arg);} +void XEmitter::PCMPEQD(X64Reg dest, OpArg arg)  {WriteSSEOp(0x66, 0x76, dest, arg);} + +void XEmitter::PCMPGTB(X64Reg dest, OpArg arg)  {WriteSSEOp(0x66, 0x64, dest, arg);} +void XEmitter::PCMPGTW(X64Reg dest, OpArg arg)  {WriteSSEOp(0x66, 0x65, dest, arg);} +void XEmitter::PCMPGTD(X64Reg dest, OpArg arg)  {WriteSSEOp(0x66, 0x66, dest, arg);} + +void XEmitter::PEXTRW(X64Reg dest, OpArg arg, u8 subreg)    {WriteSSEOp(0x66, 0xC5, dest, arg, 1); Write8(subreg);} +void XEmitter::PINSRW(X64Reg dest, OpArg arg, u8 subreg)    {WriteSSEOp(0x66, 0xC4, dest, arg, 1); Write8(subreg);} + +void XEmitter::PMADDWD(X64Reg dest, OpArg arg)  {WriteSSEOp(0x66, 0xF5, dest, 
arg); } +void XEmitter::PSADBW(X64Reg dest, OpArg arg)   {WriteSSEOp(0x66, 0xF6, dest, arg);} + +void XEmitter::PMAXSW(X64Reg dest, OpArg arg)   {WriteSSEOp(0x66, 0xEE, dest, arg); } +void XEmitter::PMAXUB(X64Reg dest, OpArg arg)   {WriteSSEOp(0x66, 0xDE, dest, arg); } +void XEmitter::PMINSW(X64Reg dest, OpArg arg)   {WriteSSEOp(0x66, 0xEA, dest, arg); } +void XEmitter::PMINUB(X64Reg dest, OpArg arg)   {WriteSSEOp(0x66, 0xDA, dest, arg); } + +void XEmitter::PMOVMSKB(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xD7, dest, arg); } +void XEmitter::PSHUFD(X64Reg regOp, OpArg arg, u8 shuffle)    {WriteSSEOp(0x66, 0x70, regOp, arg, 1); Write8(shuffle);} +void XEmitter::PSHUFLW(X64Reg regOp, OpArg arg, u8 shuffle)   {WriteSSEOp(0xF2, 0x70, regOp, arg, 1); Write8(shuffle);} +void XEmitter::PSHUFHW(X64Reg regOp, OpArg arg, u8 shuffle)   {WriteSSEOp(0xF3, 0x70, regOp, arg, 1); Write8(shuffle);} + +// VEX +void XEmitter::VADDSD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0xF2, sseADD, regOp1, regOp2, arg);} +void XEmitter::VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0xF2, sseSUB, regOp1, regOp2, arg);} +void XEmitter::VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0xF2, sseMUL, regOp1, regOp2, arg);} +void XEmitter::VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0xF2, sseDIV, regOp1, regOp2, arg);} +void XEmitter::VADDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, sseADD, regOp1, regOp2, arg);} +void XEmitter::VSUBPD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, sseSUB, regOp1, regOp2, arg);} +void XEmitter::VMULPD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, sseMUL, regOp1, regOp2, arg);} +void XEmitter::VDIVPD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, sseDIV, regOp1, regOp2, arg);} +void XEmitter::VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg)  {WriteAVXOp(0xF2, sseSQRT, regOp1, regOp2, arg);} +void XEmitter::VSHUFPD(X64Reg regOp1, X64Reg regOp2, 
OpArg arg, u8 shuffle) {WriteAVXOp(0x66, sseSHUF, regOp1, regOp2, arg, 1); Write8(shuffle);} +void XEmitter::VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, OpArg arg){WriteAVXOp(0x66, 0x14, regOp1, regOp2, arg);} +void XEmitter::VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, OpArg arg){WriteAVXOp(0x66, 0x15, regOp1, regOp2, arg);} + +void XEmitter::VANDPS(X64Reg regOp1, X64Reg regOp2, OpArg arg)   { WriteAVXOp(0x00, sseAND, regOp1, regOp2, arg); } +void XEmitter::VANDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   { WriteAVXOp(0x66, sseAND, regOp1, regOp2, arg); } +void XEmitter::VANDNPS(X64Reg regOp1, X64Reg regOp2, OpArg arg)  { WriteAVXOp(0x00, sseANDN, regOp1, regOp2, arg); } +void XEmitter::VANDNPD(X64Reg regOp1, X64Reg regOp2, OpArg arg)  { WriteAVXOp(0x66, sseANDN, regOp1, regOp2, arg); } +void XEmitter::VORPS(X64Reg regOp1, X64Reg regOp2, OpArg arg)    { WriteAVXOp(0x00, sseOR, regOp1, regOp2, arg); } +void XEmitter::VORPD(X64Reg regOp1, X64Reg regOp2, OpArg arg)    { WriteAVXOp(0x66, sseOR, regOp1, regOp2, arg); } +void XEmitter::VXORPS(X64Reg regOp1, X64Reg regOp2, OpArg arg)   { WriteAVXOp(0x00, sseXOR, regOp1, regOp2, arg); } +void XEmitter::VXORPD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   { WriteAVXOp(0x66, sseXOR, regOp1, regOp2, arg); } + +void XEmitter::VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg)    { WriteAVXOp(0x66, 0xDB, regOp1, regOp2, arg); } +void XEmitter::VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg)   { WriteAVXOp(0x66, 0xDF, regOp1, regOp2, arg); } +void XEmitter::VPOR(X64Reg regOp1, X64Reg regOp2, OpArg arg)     { WriteAVXOp(0x66, 0xEB, regOp1, regOp2, arg); } +void XEmitter::VPXOR(X64Reg regOp1, X64Reg regOp2, OpArg arg)    { WriteAVXOp(0x66, 0xEF, regOp1, regOp2, arg); } + +void XEmitter::VFMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg)    { WriteAVXOp(0x66, 0x3898, regOp1, regOp2, arg); } +void XEmitter::VFMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg)    { WriteAVXOp(0x66, 0x38A8, regOp1, regOp2, arg); } +void 
XEmitter::VFMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg)    { WriteAVXOp(0x66, 0x38B8, regOp1, regOp2, arg); } +void XEmitter::VFMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg)    { WriteAVXOp(0x66, 0x3898, regOp1, regOp2, arg, 1); } +void XEmitter::VFMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg)    { WriteAVXOp(0x66, 0x38A8, regOp1, regOp2, arg, 1); } +void XEmitter::VFMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg)    { WriteAVXOp(0x66, 0x38B8, regOp1, regOp2, arg, 1); } +void XEmitter::VFMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg)    { WriteAVXOp(0x66, 0x3899, regOp1, regOp2, arg); } +void XEmitter::VFMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg)    { WriteAVXOp(0x66, 0x38A9, regOp1, regOp2, arg); } +void XEmitter::VFMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg)    { WriteAVXOp(0x66, 0x38B9, regOp1, regOp2, arg); } +void XEmitter::VFMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg)    { WriteAVXOp(0x66, 0x3899, regOp1, regOp2, arg, 1); } +void XEmitter::VFMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg)    { WriteAVXOp(0x66, 0x38A9, regOp1, regOp2, arg, 1); } +void XEmitter::VFMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg)    { WriteAVXOp(0x66, 0x38B9, regOp1, regOp2, arg, 1); } +void XEmitter::VFMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg)    { WriteAVXOp(0x66, 0x389A, regOp1, regOp2, arg); } +void XEmitter::VFMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg)    { WriteAVXOp(0x66, 0x38AA, regOp1, regOp2, arg); } +void XEmitter::VFMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg)    { WriteAVXOp(0x66, 0x38BA, regOp1, regOp2, arg); } +void XEmitter::VFMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg)    { WriteAVXOp(0x66, 0x389A, regOp1, regOp2, arg, 1); } +void XEmitter::VFMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg)    { WriteAVXOp(0x66, 0x38AA, regOp1, regOp2, arg, 1); } +void XEmitter::VFMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg)    { WriteAVXOp(0x66, 0x38BA, regOp1, regOp2, arg, 1); } +void 
XEmitter::VFMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg)    { WriteAVXOp(0x66, 0x389B, regOp1, regOp2, arg); } +void XEmitter::VFMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg)    { WriteAVXOp(0x66, 0x38AB, regOp1, regOp2, arg); } +void XEmitter::VFMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg)    { WriteAVXOp(0x66, 0x38BB, regOp1, regOp2, arg); } +void XEmitter::VFMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg)    { WriteAVXOp(0x66, 0x389B, regOp1, regOp2, arg, 1); } +void XEmitter::VFMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg)    { WriteAVXOp(0x66, 0x38AB, regOp1, regOp2, arg, 1); } +void XEmitter::VFMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg)    { WriteAVXOp(0x66, 0x38BB, regOp1, regOp2, arg, 1); } +void XEmitter::VFNMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg)   { WriteAVXOp(0x66, 0x389C, regOp1, regOp2, arg); } +void XEmitter::VFNMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg)   { WriteAVXOp(0x66, 0x38AC, regOp1, regOp2, arg); } +void XEmitter::VFNMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg)   { WriteAVXOp(0x66, 0x38BC, regOp1, regOp2, arg); } +void XEmitter::VFNMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   { WriteAVXOp(0x66, 0x389C, regOp1, regOp2, arg, 1); } +void XEmitter::VFNMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   { WriteAVXOp(0x66, 0x38AC, regOp1, regOp2, arg, 1); } +void XEmitter::VFNMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   { WriteAVXOp(0x66, 0x38BC, regOp1, regOp2, arg, 1); } +void XEmitter::VFNMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg)   { WriteAVXOp(0x66, 0x389D, regOp1, regOp2, arg); } +void XEmitter::VFNMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg)   { WriteAVXOp(0x66, 0x38AD, regOp1, regOp2, arg); } +void XEmitter::VFNMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg)   { WriteAVXOp(0x66, 0x38BD, regOp1, regOp2, arg); } +void XEmitter::VFNMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   { WriteAVXOp(0x66, 0x389D, regOp1, regOp2, arg, 1); } +void 
XEmitter::VFNMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   { WriteAVXOp(0x66, 0x38AD, regOp1, regOp2, arg, 1); } +void XEmitter::VFNMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   { WriteAVXOp(0x66, 0x38BD, regOp1, regOp2, arg, 1); } +void XEmitter::VFNMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg)   { WriteAVXOp(0x66, 0x389E, regOp1, regOp2, arg); } +void XEmitter::VFNMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg)   { WriteAVXOp(0x66, 0x38AE, regOp1, regOp2, arg); } +void XEmitter::VFNMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg)   { WriteAVXOp(0x66, 0x38BE, regOp1, regOp2, arg); } +void XEmitter::VFNMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   { WriteAVXOp(0x66, 0x389E, regOp1, regOp2, arg, 1); } +void XEmitter::VFNMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   { WriteAVXOp(0x66, 0x38AE, regOp1, regOp2, arg, 1); } +void XEmitter::VFNMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   { WriteAVXOp(0x66, 0x38BE, regOp1, regOp2, arg, 1); } +void XEmitter::VFNMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg)   { WriteAVXOp(0x66, 0x389F, regOp1, regOp2, arg); } +void XEmitter::VFNMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg)   { WriteAVXOp(0x66, 0x38AF, regOp1, regOp2, arg); } +void XEmitter::VFNMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg)   { WriteAVXOp(0x66, 0x38BF, regOp1, regOp2, arg); } +void XEmitter::VFNMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   { WriteAVXOp(0x66, 0x389F, regOp1, regOp2, arg, 1); } +void XEmitter::VFNMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   { WriteAVXOp(0x66, 0x38AF, regOp1, regOp2, arg, 1); } +void XEmitter::VFNMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   { WriteAVXOp(0x66, 0x38BF, regOp1, regOp2, arg, 1); } +void XEmitter::VFMADDSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3896, regOp1, regOp2, arg); } +void XEmitter::VFMADDSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A6, regOp1, regOp2, arg); } +void 
XEmitter::VFMADDSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B6, regOp1, regOp2, arg); } +void XEmitter::VFMADDSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3896, regOp1, regOp2, arg, 1); } +void XEmitter::VFMADDSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A6, regOp1, regOp2, arg, 1); } +void XEmitter::VFMADDSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B6, regOp1, regOp2, arg, 1); } +void XEmitter::VFMSUBADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3897, regOp1, regOp2, arg); } +void XEmitter::VFMSUBADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A7, regOp1, regOp2, arg); } +void XEmitter::VFMSUBADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B7, regOp1, regOp2, arg); } +void XEmitter::VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x3897, regOp1, regOp2, arg, 1); } +void XEmitter::VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38A7, regOp1, regOp2, arg, 1); } +void XEmitter::VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) { WriteAVXOp(0x66, 0x38B7, regOp1, regOp2, arg, 1); } + +void XEmitter::SARX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0xF3, 0x38F7, regOp1, regOp2, arg);} +void XEmitter::SHLX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0x66, 0x38F7, regOp1, regOp2, arg);} +void XEmitter::SHRX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0xF2, 0x38F7, regOp1, regOp2, arg);} +void XEmitter::RORX(int bits, X64Reg regOp, OpArg arg, u8 rotate)      {WriteBMI2Op(bits, 0xF2, 0x3AF0, regOp, INVALID_REG, arg, 1); Write8(rotate);} +void XEmitter::PEXT(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteBMI2Op(bits, 0xF3, 0x38F5, regOp1, regOp2, arg);} +void XEmitter::PDEP(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteBMI2Op(bits, 
0xF2, 0x38F5, regOp1, regOp2, arg);} +void XEmitter::MULX(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteBMI2Op(bits, 0xF2, 0x38F6, regOp2, regOp1, arg);} +void XEmitter::BZHI(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0x00, 0x38F5, regOp1, regOp2, arg);} +void XEmitter::BLSR(int bits, X64Reg regOp, OpArg arg)                 {WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x1, regOp, arg);} +void XEmitter::BLSMSK(int bits, X64Reg regOp, OpArg arg)               {WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x2, regOp, arg);} +void XEmitter::BLSI(int bits, X64Reg regOp, OpArg arg)                 {WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x3, regOp, arg);} +void XEmitter::BEXTR(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2){WriteBMI1Op(bits, 0x00, 0x38F7, regOp1, regOp2, arg);} +void XEmitter::ANDN(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteBMI1Op(bits, 0x00, 0x38F2, regOp1, regOp2, arg);} + +// Prefixes + +void XEmitter::LOCK()  { Write8(0xF0); } +void XEmitter::REP()   { Write8(0xF3); } +void XEmitter::REPNE() { Write8(0xF2); } +void XEmitter::FSOverride() { Write8(0x64); } +void XEmitter::GSOverride() { Write8(0x65); } + +void XEmitter::FWAIT() +{ +    Write8(0x9B); +} + +// TODO: make this more generic +void XEmitter::WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, OpArg arg) +{ +    int mf = 0; +    ASSERT_MSG(!(bits == 80 && op_80b == floatINVALID), "WriteFloatLoadStore: 80 bits not supported for this instruction"); +    switch (bits) +    { +    case 32: mf = 0; break; +    case 64: mf = 4; break; +    case 80: mf = 2; break; +    default: ASSERT_MSG(0, "WriteFloatLoadStore: invalid bits (should be 32/64/80)"); +    } +    Write8(0xd9 | mf); +    // x87 instructions use the reg field of the ModR/M byte as opcode: +    if (bits == 80) +        op = op_80b; +    arg.WriteRest(this, 0, (X64Reg) op); +} + +void XEmitter::FLD(int bits, OpArg src) {WriteFloatLoadStore(bits, floatLD, floatLD80, src);} +void 
XEmitter::FST(int bits, OpArg dest) {WriteFloatLoadStore(bits, floatST, floatINVALID, dest);} +void XEmitter::FSTP(int bits, OpArg dest) {WriteFloatLoadStore(bits, floatSTP, floatSTP80, dest);} +void XEmitter::FNSTSW_AX() { Write8(0xDF); Write8(0xE0); } + +void XEmitter::RDTSC() { Write8(0x0F); Write8(0x31); } + +void XCodeBlock::PoisonMemory() { +    // x86/64: 0xCC = breakpoint +    memset(region, 0xCC, region_size); +} + +} diff --git a/src/common/x64/emitter.h b/src/common/x64/emitter.h new file mode 100644 index 000000000..312e9dc19 --- /dev/null +++ b/src/common/x64/emitter.h @@ -0,0 +1,1067 @@ +// Copyright (C) 2003 Dolphin Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. 
// Register identifiers used by the emitter.
// The 8/16/32/64-bit names of one general-purpose register all share the same
// number, and XMMn / YMMn share number n, so the enumerator alone does not say
// which register file or width is meant; that comes from the instruction.
enum X64Reg
{
    // 32-bit general-purpose names
    EAX = 0, EBX = 3, ECX = 1, EDX = 2,
    ESI = 6, EDI = 7, EBP = 5, ESP = 4,

    // 64-bit general-purpose names (R8-R15 have numbers above 7)
    RAX = 0, RBX = 3, RCX = 1, RDX = 2,
    RSI = 6, RDI = 7, RBP = 5, RSP = 4,
    R8  = 8, R9  = 9, R10 = 10,R11 = 11,
    R12 = 12,R13 = 13,R14 = 14,R15 = 15,

    // 8-bit names
    AL = 0, BL = 3, CL = 1, DL = 2,
    SIL = 6, DIL = 7, BPL = 5, SPL = 4,
    // High-byte registers: low bits are 4-7 (the same numbers as SPL..DIL) with
    // 0x100 added to keep them distinct.
    // NOTE(review): presumably the encoder checks the 0x100 bit to pick the
    // legacy high-byte encoding - confirm in emitter.cpp.
    AH = 0x104, BH = 0x107, CH = 0x105, DH = 0x106,

    // 16-bit names
    AX = 0, BX = 3, CX = 1, DX = 2,
    SI = 6, DI = 7, BP = 5, SP = 4,

    // SSE registers, numbered 0-15
    XMM0=0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
    XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15,

    // AVX registers, numbered 0-15
    YMM0=0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
    YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15,

    // Sentinel for "no register"; OpArg::GetSimpleReg() returns it when the
    // operand is not a plain register.
    INVALID_REG = 0xFFFFFFFF
};
SCALE_NOBASE_8 = 40, +    SCALE_RIP = 0xFF, +    SCALE_IMM8  = 0xF0, +    SCALE_IMM16 = 0xF1, +    SCALE_IMM32 = 0xF2, +    SCALE_IMM64 = 0xF3, +}; + +enum NormalOp { +    nrmADD, +    nrmADC, +    nrmSUB, +    nrmSBB, +    nrmAND, +    nrmOR , +    nrmXOR, +    nrmMOV, +    nrmTEST, +    nrmCMP, +    nrmXCHG, +}; + +enum { +    CMP_EQ = 0, +    CMP_LT = 1, +    CMP_LE = 2, +    CMP_UNORD = 3, +    CMP_NEQ = 4, +    CMP_NLT = 5, +    CMP_NLE = 6, +    CMP_ORD = 7, +}; + +enum FloatOp { +    floatLD = 0, +    floatST = 2, +    floatSTP = 3, +    floatLD80 = 5, +    floatSTP80 = 7, + +    floatINVALID = -1, +}; + +enum FloatRound { +    FROUND_NEAREST = 0, +    FROUND_FLOOR = 1, +    FROUND_CEIL = 2, +    FROUND_ZERO = 3, +    FROUND_MXCSR = 4, + +    FROUND_RAISE_PRECISION = 0, +    FROUND_IGNORE_PRECISION = 8, +}; + +class XEmitter; + +// RIP addressing does not benefit from micro op fusion on Core arch +struct OpArg +{ +    OpArg() {}  // dummy op arg, used for storage +    OpArg(u64 _offset, int _scale, X64Reg rmReg = RAX, X64Reg scaledReg = RAX) +    { +        operandReg = 0; +        scale = (u8)_scale; +        offsetOrBaseReg = (u16)rmReg; +        indexReg = (u16)scaledReg; +        //if scale == 0 never mind offsetting +        offset = _offset; +    } +    bool operator==(const OpArg &b) const +    { +        return operandReg == b.operandReg && scale == b.scale && offsetOrBaseReg == b.offsetOrBaseReg && +               indexReg == b.indexReg && offset == b.offset; +    } +    void WriteRex(XEmitter *emit, int opBits, int bits, int customOp = -1) const; +    void WriteVex(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm, int W = 0) const; +    void WriteRest(XEmitter *emit, int extraBytes=0, X64Reg operandReg=INVALID_REG, bool warn_64bit_offset = true) const; +    void WriteFloatModRM(XEmitter *emit, FloatOp op); +    void WriteSingleByteOp(XEmitter *emit, u8 op, X64Reg operandReg, int bits); +    // This one is public - must be 
written to +    u64 offset;  // use RIP-relative as much as possible - 64-bit immediates are not available. +    u16 operandReg; + +    void WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg &operand, int bits) const; +    bool IsImm() const {return scale == SCALE_IMM8 || scale == SCALE_IMM16 || scale == SCALE_IMM32 || scale == SCALE_IMM64;} +    bool IsSimpleReg() const {return scale == SCALE_NONE;} +    bool IsSimpleReg(X64Reg reg) const +    { +        if (!IsSimpleReg()) +            return false; +        return GetSimpleReg() == reg; +    } + +    bool CanDoOpWith(const OpArg &other) const +    { +        if (IsSimpleReg()) return true; +        if (!IsSimpleReg() && !other.IsSimpleReg() && !other.IsImm()) return false; +        return true; +    } + +    int GetImmBits() const +    { +        switch (scale) +        { +        case SCALE_IMM8: return 8; +        case SCALE_IMM16: return 16; +        case SCALE_IMM32: return 32; +        case SCALE_IMM64: return 64; +        default: return -1; +        } +    } + +    void SetImmBits(int bits) { +        switch (bits) +        { +            case 8: scale = SCALE_IMM8; break; +            case 16: scale = SCALE_IMM16; break; +            case 32: scale = SCALE_IMM32; break; +            case 64: scale = SCALE_IMM64; break; +        } +    } + +    X64Reg GetSimpleReg() const +    { +        if (scale == SCALE_NONE) +            return (X64Reg)offsetOrBaseReg; +        else +            return INVALID_REG; +    } + +    u32 GetImmValue() const { +        return (u32)offset; +    } + +    // For loops. 
+    void IncreaseOffset(int sz) { +        offset += sz; +    } + +private: +    u8 scale; +    u16 offsetOrBaseReg; +    u16 indexReg; +}; + +inline OpArg M(const void *ptr) {return OpArg((u64)ptr, (int)SCALE_RIP);} +template <typename T> +inline OpArg M(const T *ptr)    {return OpArg((u64)(const void *)ptr, (int)SCALE_RIP);} +inline OpArg R(X64Reg value)    {return OpArg(0, SCALE_NONE, value);} +inline OpArg MatR(X64Reg value) {return OpArg(0, SCALE_ATREG, value);} + +inline OpArg MDisp(X64Reg value, int offset) +{ +    return OpArg((u32)offset, SCALE_ATREG, value); +} + +inline OpArg MComplex(X64Reg base, X64Reg scaled, int scale, int offset) +{ +    return OpArg(offset, scale, base, scaled); +} + +inline OpArg MScaled(X64Reg scaled, int scale, int offset) +{ +    if (scale == SCALE_1) +        return OpArg(offset, SCALE_ATREG, scaled); +    else +        return OpArg(offset, scale | 0x20, RAX, scaled); +} + +inline OpArg MRegSum(X64Reg base, X64Reg offset) +{ +    return MComplex(base, offset, 1, 0); +} + +inline OpArg Imm8 (u8 imm)  {return OpArg(imm, SCALE_IMM8);} +inline OpArg Imm16(u16 imm) {return OpArg(imm, SCALE_IMM16);} //rarely used +inline OpArg Imm32(u32 imm) {return OpArg(imm, SCALE_IMM32);} +inline OpArg Imm64(u64 imm) {return OpArg(imm, SCALE_IMM64);} +inline OpArg UImmAuto(u32 imm) { +    return OpArg(imm, imm >= 128 ? SCALE_IMM32 : SCALE_IMM8); +} +inline OpArg SImmAuto(s32 imm) { +    return OpArg(imm, (imm >= 128 || imm < -128) ? 
SCALE_IMM32 : SCALE_IMM8); +} + +#ifdef _ARCH_64 +inline OpArg ImmPtr(const void* imm) {return Imm64((u64)imm);} +#else +inline OpArg ImmPtr(const void* imm) {return Imm32((u32)imm);} +#endif + +inline u32 PtrOffset(const void* ptr, const void* base) +{ +#ifdef _ARCH_64 +    s64 distance = (s64)ptr-(s64)base; +    if (distance >= 0x80000000LL || +        distance < -0x80000000LL) +    { +        ASSERT_MSG(0, "pointer offset out of range"); +        return 0; +    } + +    return (u32)distance; +#else +    return (u32)ptr-(u32)base; +#endif +} + +//usage: int a[]; ARRAY_OFFSET(a,10) +#define ARRAY_OFFSET(array,index) ((u32)((u64)&(array)[index]-(u64)&(array)[0])) +//usage: struct {int e;} s; STRUCT_OFFSET(s,e) +#define STRUCT_OFFSET(str,elem) ((u32)((u64)&(str).elem-(u64)&(str))) + +struct FixupBranch +{ +    u8 *ptr; +    int type; //0 = 8bit 1 = 32bit +}; + +enum SSECompare +{ +    EQ = 0, +    LT, +    LE, +    UNORD, +    NEQ, +    NLT, +    NLE, +    ORD, +}; + +typedef const u8* JumpTarget; + +class XEmitter +{ +    friend struct OpArg;  // for Write8 etc +private: +    u8 *code; +    bool flags_locked; + +    void CheckFlags(); + +    void Rex(int w, int r, int x, int b); +    void WriteSimple1Byte(int bits, u8 byte, X64Reg reg); +    void WriteSimple2Byte(int bits, u8 byte1, u8 byte2, X64Reg reg); +    void WriteMulDivType(int bits, OpArg src, int ext); +    void WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2, bool rep = false); +    void WriteShift(int bits, OpArg dest, OpArg &shift, int ext); +    void WriteBitTest(int bits, OpArg &dest, OpArg &index, int ext); +    void WriteMXCSR(OpArg arg, int ext); +    void WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0); +    void WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0); +    void WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0); +    void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int 
extrabytes = 0); +    void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0); +    void WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0); +    void WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0); +    void WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0); +    void WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, OpArg arg); +    void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2); + +    void ABI_CalculateFrameSize(u32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp); + +protected: +    inline void Write8(u8 value)   {*code++ = value;} +    inline void Write16(u16 value) {*(u16*)code = (value); code += 2;} +    inline void Write32(u32 value) {*(u32*)code = (value); code += 4;} +    inline void Write64(u64 value) {*(u64*)code = (value); code += 8;} + +public: +    XEmitter() { code = nullptr; flags_locked = false; } +    XEmitter(u8 *code_ptr) { code = code_ptr; flags_locked = false; } +    virtual ~XEmitter() {} + +    void WriteModRM(int mod, int rm, int reg); +    void WriteSIB(int scale, int index, int base); + +    void SetCodePtr(u8 *ptr); +    void ReserveCodeSpace(int bytes); +    const u8 *AlignCode4(); +    const u8 *AlignCode16(); +    const u8 *AlignCodePage(); +    const u8 *GetCodePtr() const; +    u8 *GetWritableCodePtr(); + +    void LockFlags() { flags_locked = true; } +    void UnlockFlags() { flags_locked = false; } + +    // Looking for one of these? It's BANNED!! Some instructions are slow on modern CPU +    // INC, DEC, LOOP, LOOPNE, LOOPE, ENTER, LEAVE, XCHG, XLAT, REP MOVSB/MOVSD, REP SCASD + other string instr., +    // INC and DEC are slow on Intel Core, but not on AMD. 
They create a +    // false flag dependency because they only update a subset of the flags. +    // XCHG is SLOW and should be avoided. + +    // Debug breakpoint +    void INT3(); + +    // Do nothing +    void NOP(size_t count = 1); + +    // Save energy in wait-loops on P4 only. Probably not too useful. +    void PAUSE(); + +    // Flag control +    void STC(); +    void CLC(); +    void CMC(); + +    // These two can not be executed in 64-bit mode on early Intel 64-bit CPU:s, only on Core2 and AMD! +    void LAHF(); // 3 cycle vector path +    void SAHF(); // direct path fast + + +    // Stack control +    void PUSH(X64Reg reg); +    void POP(X64Reg reg); +    void PUSH(int bits, const OpArg ®); +    void POP(int bits, const OpArg ®); +    void PUSHF(); +    void POPF(); + +    // Flow control +    void RET(); +    void RET_FAST(); +    void UD2(); +    FixupBranch J(bool force5bytes = false); + +    void JMP(const u8 * addr, bool force5Bytes = false); +    void JMP(OpArg arg); +    void JMPptr(const OpArg &arg); +    void JMPself(); //infinite loop! +#ifdef CALL +#undef CALL +#endif +    void CALL(const void *fnptr); +    void CALLptr(OpArg arg); + +    FixupBranch J_CC(CCFlags conditionCode, bool force5bytes = false); +    //void J_CC(CCFlags conditionCode, JumpTarget target); +    void J_CC(CCFlags conditionCode, const u8 * addr, bool force5Bytes = false); + +    void SetJumpTarget(const FixupBranch &branch); + +    void SETcc(CCFlags flag, OpArg dest); +    // Note: CMOV brings small if any benefit on current cpus. 
+    void CMOVcc(int bits, X64Reg dest, OpArg src, CCFlags flag); + +    // Fences +    void LFENCE(); +    void MFENCE(); +    void SFENCE(); + +    // Bit scan +    void BSF(int bits, X64Reg dest, OpArg src); //bottom bit to top bit +    void BSR(int bits, X64Reg dest, OpArg src); //top bit to bottom bit + +    // Cache control +    enum PrefetchLevel +    { +        PF_NTA, //Non-temporal (data used once and only once) +        PF_T0,  //All cache levels +        PF_T1,  //Levels 2+ (aliased to T0 on AMD) +        PF_T2,  //Levels 3+ (aliased to T0 on AMD) +    }; +    void PREFETCH(PrefetchLevel level, OpArg arg); +    void MOVNTI(int bits, OpArg dest, X64Reg src); +    void MOVNTDQ(OpArg arg, X64Reg regOp); +    void MOVNTPS(OpArg arg, X64Reg regOp); +    void MOVNTPD(OpArg arg, X64Reg regOp); + +    // Multiplication / division +    void MUL(int bits, OpArg src); //UNSIGNED +    void IMUL(int bits, OpArg src); //SIGNED +    void IMUL(int bits, X64Reg regOp, OpArg src); +    void IMUL(int bits, X64Reg regOp, OpArg src, OpArg imm); +    void DIV(int bits, OpArg src); +    void IDIV(int bits, OpArg src); + +    // Shift +    void ROL(int bits, OpArg dest, OpArg shift); +    void ROR(int bits, OpArg dest, OpArg shift); +    void RCL(int bits, OpArg dest, OpArg shift); +    void RCR(int bits, OpArg dest, OpArg shift); +    void SHL(int bits, OpArg dest, OpArg shift); +    void SHR(int bits, OpArg dest, OpArg shift); +    void SAR(int bits, OpArg dest, OpArg shift); + +    // Bit Test +    void BT(int bits, OpArg dest, OpArg index); +    void BTS(int bits, OpArg dest, OpArg index); +    void BTR(int bits, OpArg dest, OpArg index); +    void BTC(int bits, OpArg dest, OpArg index); + +    // Double-Precision Shift +    void SHRD(int bits, OpArg dest, OpArg src, OpArg shift); +    void SHLD(int bits, OpArg dest, OpArg src, OpArg shift); + +    // Extend EAX into EDX in various ways +    void CWD(int bits = 16); +    inline void CDQ() {CWD(32);} +    inline void CQO() 
{CWD(64);} +    void CBW(int bits = 8); +    inline void CWDE() {CBW(16);} +    inline void CDQE() {CBW(32);} + +    // Load effective address +    void LEA(int bits, X64Reg dest, OpArg src); + +    // Integer arithmetic +    void NEG (int bits, OpArg src); +    void ADD (int bits, const OpArg &a1, const OpArg &a2); +    void ADC (int bits, const OpArg &a1, const OpArg &a2); +    void SUB (int bits, const OpArg &a1, const OpArg &a2); +    void SBB (int bits, const OpArg &a1, const OpArg &a2); +    void AND (int bits, const OpArg &a1, const OpArg &a2); +    void CMP (int bits, const OpArg &a1, const OpArg &a2); + +    // Bit operations +    void NOT (int bits, OpArg src); +    void OR  (int bits, const OpArg &a1, const OpArg &a2); +    void XOR (int bits, const OpArg &a1, const OpArg &a2); +    void MOV (int bits, const OpArg &a1, const OpArg &a2); +    void TEST(int bits, const OpArg &a1, const OpArg &a2); + +    // Are these useful at all? Consider removing. +    void XCHG(int bits, const OpArg &a1, const OpArg &a2); +    void XCHG_AHAL(); + +    // Byte swapping (32 and 64-bit only). +    void BSWAP(int bits, X64Reg reg); + +    // Sign/zero extension +    void MOVSX(int dbits, int sbits, X64Reg dest, OpArg src); //automatically uses MOVSXD if necessary +    void MOVZX(int dbits, int sbits, X64Reg dest, OpArg src); + +    // Available only on Atom or >= Haswell so far. Test with cpu_info.bMOVBE. +    void MOVBE(int dbits, const OpArg& dest, const OpArg& src); + +    // Available only on AMD >= Phenom or Intel >= Haswell +    void LZCNT(int bits, X64Reg dest, OpArg src); +    // Note: this one is actually part of BMI1 +    void TZCNT(int bits, X64Reg dest, OpArg src); + +    // WARNING - These two take 11-13 cycles and are VectorPath! 
// x87
// Bit masks for the 16-bit x87 status word. Every enumerator is a single bit
// except x87_TopOfStack, which is a three-bit field.
// NOTE(review): presumably meant for testing the value FNSTSW_AX leaves in
// AX - confirm against callers.
enum x87StatusWordBits {
    // Bits 0-5: exception flags
    x87_InvalidOperation = 0x1,
    x87_DenormalizedOperand = 0x2,
    x87_DivisionByZero = 0x4,
    x87_Overflow = 0x8,
    x87_Underflow = 0x10,
    x87_Precision = 0x20,
    // Bits 6-7
    x87_StackFault = 0x40,
    x87_ErrorSummary = 0x80,
    // Bits 8-10: condition codes C0-C2 (C3 is further up, at bit 14)
    x87_C0 = 0x100,
    x87_C1 = 0x200,
    x87_C2 = 0x400,
    // Bits 11-13: three-bit field, so mask and shift right by 11 to get its value
    x87_TopOfStack = 0x2000 | 0x1000 | 0x800,
    x87_C3 = 0x4000,
    x87_FPUBusy = 0x8000,
};
CMPSS(regOp, arg, CMP_UNORD); } +    inline void CMPNEQSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_NEQ); } +    inline void CMPNLTSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_NLT); } +    inline void CMPORDSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_ORD); } + +    // SSE/SSE2: Floating point packed arithmetic (x4 for float, x2 for double) +    void ADDPS(X64Reg regOp, OpArg arg); +    void ADDPD(X64Reg regOp, OpArg arg); +    void SUBPS(X64Reg regOp, OpArg arg); +    void SUBPD(X64Reg regOp, OpArg arg); +    void CMPPS(X64Reg regOp, OpArg arg, u8 compare); +    void CMPPD(X64Reg regOp, OpArg arg, u8 compare); +    void MULPS(X64Reg regOp, OpArg arg); +    void MULPD(X64Reg regOp, OpArg arg); +    void DIVPS(X64Reg regOp, OpArg arg); +    void DIVPD(X64Reg regOp, OpArg arg); +    void MINPS(X64Reg regOp, OpArg arg); +    void MINPD(X64Reg regOp, OpArg arg); +    void MAXPS(X64Reg regOp, OpArg arg); +    void MAXPD(X64Reg regOp, OpArg arg); +    void SQRTPS(X64Reg regOp, OpArg arg); +    void SQRTPD(X64Reg regOp, OpArg arg); +    void RCPPS(X64Reg regOp, OpArg arg); +    void RSQRTPS(X64Reg regOp, OpArg arg); + +    // SSE/SSE2: Floating point packed bitwise (x4 for float, x2 for double) +    void ANDPS(X64Reg regOp, OpArg arg); +    void ANDPD(X64Reg regOp, OpArg arg); +    void ANDNPS(X64Reg regOp, OpArg arg); +    void ANDNPD(X64Reg regOp, OpArg arg); +    void ORPS(X64Reg regOp, OpArg arg); +    void ORPD(X64Reg regOp, OpArg arg); +    void XORPS(X64Reg regOp, OpArg arg); +    void XORPD(X64Reg regOp, OpArg arg); + +    // SSE/SSE2: Shuffle components. These are tricky - see Intel documentation. +    void SHUFPS(X64Reg regOp, OpArg arg, u8 shuffle); +    void SHUFPD(X64Reg regOp, OpArg arg, u8 shuffle); + +    // SSE/SSE2: Useful alternative to shuffle in some cases. +    void MOVDDUP(X64Reg regOp, OpArg arg); + +    // TODO: Actually implement +#if 0 +    // SSE3: Horizontal operations in SIMD registers. 
Could be useful for various VFPU things like dot products... +    void ADDSUBPS(X64Reg dest, OpArg src); +    void ADDSUBPD(X64Reg dest, OpArg src); +    void HADDPD(X64Reg dest, OpArg src); +    void HSUBPS(X64Reg dest, OpArg src); +    void HSUBPD(X64Reg dest, OpArg src); + +    // SSE4: Further horizontal operations - dot products. These are weirdly flexible, the arg contains both a read mask and a write "mask". +    void DPPD(X64Reg dest, OpArg src, u8 arg); + +    // These are probably useful for VFPU emulation. +    void INSERTPS(X64Reg dest, OpArg src, u8 arg); +    void EXTRACTPS(OpArg dest, X64Reg src, u8 arg); +#endif + +    // SSE3: Horizontal operations in SIMD registers. Very slow! shufps-based code beats it handily on Ivy. +    void HADDPS(X64Reg dest, OpArg src); + +    // SSE4: Further horizontal operations - dot products. These are weirdly flexible, the arg contains both a read mask and a write "mask". +    void DPPS(X64Reg dest, OpArg src, u8 arg); + +    void UNPCKLPS(X64Reg dest, OpArg src); +    void UNPCKHPS(X64Reg dest, OpArg src); +    void UNPCKLPD(X64Reg dest, OpArg src); +    void UNPCKHPD(X64Reg dest, OpArg src); + +    // SSE/SSE2: Compares. +    void COMISS(X64Reg regOp, OpArg arg); +    void COMISD(X64Reg regOp, OpArg arg); +    void UCOMISS(X64Reg regOp, OpArg arg); +    void UCOMISD(X64Reg regOp, OpArg arg); + +    // SSE/SSE2: Moves. Use the right data type for your data, in most cases. 
+    void MOVAPS(X64Reg regOp, OpArg arg); +    void MOVAPD(X64Reg regOp, OpArg arg); +    void MOVAPS(OpArg arg, X64Reg regOp); +    void MOVAPD(OpArg arg, X64Reg regOp); + +    void MOVUPS(X64Reg regOp, OpArg arg); +    void MOVUPD(X64Reg regOp, OpArg arg); +    void MOVUPS(OpArg arg, X64Reg regOp); +    void MOVUPD(OpArg arg, X64Reg regOp); + +    void MOVDQA(X64Reg regOp, OpArg arg); +    void MOVDQA(OpArg arg, X64Reg regOp); +    void MOVDQU(X64Reg regOp, OpArg arg); +    void MOVDQU(OpArg arg, X64Reg regOp); + +    void MOVSS(X64Reg regOp, OpArg arg); +    void MOVSD(X64Reg regOp, OpArg arg); +    void MOVSS(OpArg arg, X64Reg regOp); +    void MOVSD(OpArg arg, X64Reg regOp); + +    void MOVLPS(X64Reg regOp, OpArg arg); +    void MOVLPD(X64Reg regOp, OpArg arg); +    void MOVLPS(OpArg arg, X64Reg regOp); +    void MOVLPD(OpArg arg, X64Reg regOp); + +    void MOVHPS(X64Reg regOp, OpArg arg); +    void MOVHPD(X64Reg regOp, OpArg arg); +    void MOVHPS(OpArg arg, X64Reg regOp); +    void MOVHPD(OpArg arg, X64Reg regOp); + +    void MOVHLPS(X64Reg regOp1, X64Reg regOp2); +    void MOVLHPS(X64Reg regOp1, X64Reg regOp2); + +    void MOVD_xmm(X64Reg dest, const OpArg &arg); +    void MOVQ_xmm(X64Reg dest, OpArg arg); +    void MOVD_xmm(const OpArg &arg, X64Reg src); +    void MOVQ_xmm(OpArg arg, X64Reg src); + +    // SSE/SSE2: Generates a mask from the high bits of the components of the packed register in question. +    void MOVMSKPS(X64Reg dest, OpArg arg); +    void MOVMSKPD(X64Reg dest, OpArg arg); + +    // SSE2: Selective byte store, mask in src register. EDI/RDI specifies store address. This is a weird one. +    void MASKMOVDQU(X64Reg dest, X64Reg src); +    void LDDQU(X64Reg dest, OpArg src); + +    // SSE/SSE2: Data type conversions. 
+    void CVTPS2PD(X64Reg dest, OpArg src); +    void CVTPD2PS(X64Reg dest, OpArg src); +    void CVTSS2SD(X64Reg dest, OpArg src); +    void CVTSI2SS(X64Reg dest, OpArg src); +    void CVTSD2SS(X64Reg dest, OpArg src); +    void CVTSI2SD(X64Reg dest, OpArg src); +    void CVTDQ2PD(X64Reg regOp, OpArg arg); +    void CVTPD2DQ(X64Reg regOp, OpArg arg); +    void CVTDQ2PS(X64Reg regOp, OpArg arg); +    void CVTPS2DQ(X64Reg regOp, OpArg arg); + +    void CVTTPS2DQ(X64Reg regOp, OpArg arg); +    void CVTTPD2DQ(X64Reg regOp, OpArg arg); + +    // Destinations are X64 regs (rax, rbx, ...) for these instructions. +    void CVTSS2SI(X64Reg xregdest, OpArg src); +    void CVTSD2SI(X64Reg xregdest, OpArg src); +    void CVTTSS2SI(X64Reg xregdest, OpArg arg); +    void CVTTSD2SI(X64Reg xregdest, OpArg arg); + +    // SSE2: Packed integer instructions +    void PACKSSDW(X64Reg dest, OpArg arg); +    void PACKSSWB(X64Reg dest, OpArg arg); +    void PACKUSDW(X64Reg dest, OpArg arg); +    void PACKUSWB(X64Reg dest, OpArg arg); + +    void PUNPCKLBW(X64Reg dest, const OpArg &arg); +    void PUNPCKLWD(X64Reg dest, const OpArg &arg); +    void PUNPCKLDQ(X64Reg dest, const OpArg &arg); +    void PUNPCKLQDQ(X64Reg dest, const OpArg &arg); + +    void PTEST(X64Reg dest, OpArg arg); +    void PAND(X64Reg dest, OpArg arg); +    void PANDN(X64Reg dest, OpArg arg); +    void PXOR(X64Reg dest, OpArg arg); +    void POR(X64Reg dest, OpArg arg); + +    void PADDB(X64Reg dest, OpArg arg); +    void PADDW(X64Reg dest, OpArg arg); +    void PADDD(X64Reg dest, OpArg arg); +    void PADDQ(X64Reg dest, OpArg arg); + +    void PADDSB(X64Reg dest, OpArg arg); +    void PADDSW(X64Reg dest, OpArg arg); +    void PADDUSB(X64Reg dest, OpArg arg); +    void PADDUSW(X64Reg dest, OpArg arg); + +    void PSUBB(X64Reg dest, OpArg arg); +    void PSUBW(X64Reg dest, OpArg arg); +    void PSUBD(X64Reg dest, OpArg arg); +    void PSUBQ(X64Reg dest, OpArg arg); + +    void PSUBSB(X64Reg dest, OpArg arg); +    void 
PSUBSW(X64Reg dest, OpArg arg); +    void PSUBUSB(X64Reg dest, OpArg arg); +    void PSUBUSW(X64Reg dest, OpArg arg); + +    void PAVGB(X64Reg dest, OpArg arg); +    void PAVGW(X64Reg dest, OpArg arg); + +    void PCMPEQB(X64Reg dest, OpArg arg); +    void PCMPEQW(X64Reg dest, OpArg arg); +    void PCMPEQD(X64Reg dest, OpArg arg); + +    void PCMPGTB(X64Reg dest, OpArg arg); +    void PCMPGTW(X64Reg dest, OpArg arg); +    void PCMPGTD(X64Reg dest, OpArg arg); + +    void PEXTRW(X64Reg dest, OpArg arg, u8 subreg); +    void PINSRW(X64Reg dest, OpArg arg, u8 subreg); + +    void PMADDWD(X64Reg dest, OpArg arg); +    void PSADBW(X64Reg dest, OpArg arg); + +    void PMAXSW(X64Reg dest, OpArg arg); +    void PMAXUB(X64Reg dest, OpArg arg); +    void PMINSW(X64Reg dest, OpArg arg); +    void PMINUB(X64Reg dest, OpArg arg); +    // SSE4: More MAX/MIN instructions. +    void PMINSB(X64Reg dest, OpArg arg); +    void PMINSD(X64Reg dest, OpArg arg); +    void PMINUW(X64Reg dest, OpArg arg); +    void PMINUD(X64Reg dest, OpArg arg); +    void PMAXSB(X64Reg dest, OpArg arg); +    void PMAXSD(X64Reg dest, OpArg arg); +    void PMAXUW(X64Reg dest, OpArg arg); +    void PMAXUD(X64Reg dest, OpArg arg); + +    void PMOVMSKB(X64Reg dest, OpArg arg); +    void PSHUFD(X64Reg dest, OpArg arg, u8 shuffle); +    void PSHUFB(X64Reg dest, OpArg arg); + +    void PSHUFLW(X64Reg dest, OpArg arg, u8 shuffle); +    void PSHUFHW(X64Reg dest, OpArg arg, u8 shuffle); + +    void PSRLW(X64Reg reg, int shift); +    void PSRLD(X64Reg reg, int shift); +    void PSRLQ(X64Reg reg, int shift); +    void PSRLQ(X64Reg reg, OpArg arg); +    void PSRLDQ(X64Reg reg, int shift); + +    void PSLLW(X64Reg reg, int shift); +    void PSLLD(X64Reg reg, int shift); +    void PSLLQ(X64Reg reg, int shift); +    void PSLLDQ(X64Reg reg, int shift); + +    void PSRAW(X64Reg reg, int shift); +    void PSRAD(X64Reg reg, int shift); + +    // SSE4: data type conversions +    void PMOVSXBW(X64Reg dest, OpArg arg); +    void 
PMOVSXBD(X64Reg dest, OpArg arg); +    void PMOVSXBQ(X64Reg dest, OpArg arg); +    void PMOVSXWD(X64Reg dest, OpArg arg); +    void PMOVSXWQ(X64Reg dest, OpArg arg); +    void PMOVSXDQ(X64Reg dest, OpArg arg); +    void PMOVZXBW(X64Reg dest, OpArg arg); +    void PMOVZXBD(X64Reg dest, OpArg arg); +    void PMOVZXBQ(X64Reg dest, OpArg arg); +    void PMOVZXWD(X64Reg dest, OpArg arg); +    void PMOVZXWQ(X64Reg dest, OpArg arg); +    void PMOVZXDQ(X64Reg dest, OpArg arg); + +    // SSE4: variable blend instructions (xmm0 implicit argument) +    void PBLENDVB(X64Reg dest, OpArg arg); +    void BLENDVPS(X64Reg dest, OpArg arg); +    void BLENDVPD(X64Reg dest, OpArg arg); +    void BLENDPS(X64Reg dest, const OpArg& arg, u8 blend); +    void BLENDPD(X64Reg dest, const OpArg& arg, u8 blend); + +    // SSE4: rounding (see FloatRound for mode or use ROUNDNEARSS, etc. helpers.) +    void ROUNDSS(X64Reg dest, OpArg arg, u8 mode); +    void ROUNDSD(X64Reg dest, OpArg arg, u8 mode); +    void ROUNDPS(X64Reg dest, OpArg arg, u8 mode); +    void ROUNDPD(X64Reg dest, OpArg arg, u8 mode); + +    inline void ROUNDNEARSS(X64Reg dest, OpArg arg) { ROUNDSS(dest, arg, FROUND_NEAREST); } +    inline void ROUNDFLOORSS(X64Reg dest, OpArg arg) { ROUNDSS(dest, arg, FROUND_FLOOR); } +    inline void ROUNDCEILSS(X64Reg dest, OpArg arg) { ROUNDSS(dest, arg, FROUND_CEIL); } +    inline void ROUNDZEROSS(X64Reg dest, OpArg arg) { ROUNDSS(dest, arg, FROUND_ZERO); } + +    inline void ROUNDNEARSD(X64Reg dest, OpArg arg) { ROUNDSD(dest, arg, FROUND_NEAREST); } +    inline void ROUNDFLOORSD(X64Reg dest, OpArg arg) { ROUNDSD(dest, arg, FROUND_FLOOR); } +    inline void ROUNDCEILSD(X64Reg dest, OpArg arg) { ROUNDSD(dest, arg, FROUND_CEIL); } +    inline void ROUNDZEROSD(X64Reg dest, OpArg arg) { ROUNDSD(dest, arg, FROUND_ZERO); } + +    inline void ROUNDNEARPS(X64Reg dest, OpArg arg) { ROUNDPS(dest, arg, FROUND_NEAREST); } +    inline void ROUNDFLOORPS(X64Reg dest, OpArg arg) { ROUNDPS(dest, arg, 
FROUND_FLOOR); } +    inline void ROUNDCEILPS(X64Reg dest, OpArg arg) { ROUNDPS(dest, arg, FROUND_CEIL); } +    inline void ROUNDZEROPS(X64Reg dest, OpArg arg) { ROUNDPS(dest, arg, FROUND_ZERO); } + +    inline void ROUNDNEARPD(X64Reg dest, OpArg arg) { ROUNDPD(dest, arg, FROUND_NEAREST); } +    inline void ROUNDFLOORPD(X64Reg dest, OpArg arg) { ROUNDPD(dest, arg, FROUND_FLOOR); } +    inline void ROUNDCEILPD(X64Reg dest, OpArg arg) { ROUNDPD(dest, arg, FROUND_CEIL); } +    inline void ROUNDZEROPD(X64Reg dest, OpArg arg) { ROUNDPD(dest, arg, FROUND_ZERO); } + +    // AVX +    void VADDSD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VADDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VSUBPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VMULPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VDIVPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VSHUFPD(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 shuffle); +    void VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + +    void VANDPS(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VANDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VANDNPS(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VANDNPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VORPS(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VORPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VXORPS(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VXORPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + +    void VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VPOR(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VPXOR(X64Reg regOp1, X64Reg 
regOp2, OpArg arg); + +    // FMA3 +    void VFMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFNMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFNMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFNMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFNMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFNMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFNMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFNMADD132SS(X64Reg regOp1, 
X64Reg regOp2, OpArg arg); +    void VFNMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFNMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFNMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFNMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFNMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFNMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFNMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFNMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFNMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFNMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFNMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFNMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFNMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFNMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFNMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFNMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFNMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMADDSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMADDSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMADDSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMADDSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMADDSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMADDSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMSUBADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMSUBADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMSUBADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + +    // VEX GPR instructions +    void SARX(int bits, X64Reg 
regOp1, OpArg arg, X64Reg regOp2); +    void SHLX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2); +    void SHRX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2); +    void RORX(int bits, X64Reg regOp, OpArg arg, u8 rotate); +    void PEXT(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void PDEP(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void MULX(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg); +    void BZHI(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2); +    void BLSR(int bits, X64Reg regOp, OpArg arg); +    void BLSMSK(int bits, X64Reg regOp, OpArg arg); +    void BLSI(int bits, X64Reg regOp, OpArg arg); +    void BEXTR(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2); +    void ANDN(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg); + +    void RDTSC(); + +    // Utility functions +    // The difference between this and CALL is that this aligns the stack +    // where appropriate. +    void ABI_CallFunction(const void *func); +    template <typename T> +    void ABI_CallFunction(T (*func)()) { +        ABI_CallFunction((const void *)func); +    } + +    void ABI_CallFunction(const u8 *func) { +        ABI_CallFunction((const void *)func); +    } +    void ABI_CallFunctionC16(const void *func, u16 param1); +    void ABI_CallFunctionCC16(const void *func, u32 param1, u16 param2); + + +    // These only support u32 parameters, but that's enough for a lot of uses. +    // These will destroy the 1 or 2 first "parameter regs". 
+    void ABI_CallFunctionC(const void *func, u32 param1); +    void ABI_CallFunctionCC(const void *func, u32 param1, u32 param2); +    void ABI_CallFunctionCCC(const void *func, u32 param1, u32 param2, u32 param3); +    void ABI_CallFunctionCCP(const void *func, u32 param1, u32 param2, void *param3); +    void ABI_CallFunctionCCCP(const void *func, u32 param1, u32 param2, u32 param3, void *param4); +    void ABI_CallFunctionP(const void *func, void *param1); +    void ABI_CallFunctionPA(const void *func, void *param1, const Gen::OpArg &arg2); +    void ABI_CallFunctionPAA(const void *func, void *param1, const Gen::OpArg &arg2, const Gen::OpArg &arg3); +    void ABI_CallFunctionPPC(const void *func, void *param1, void *param2, u32 param3); +    void ABI_CallFunctionAC(const void *func, const Gen::OpArg &arg1, u32 param2); +    void ABI_CallFunctionACC(const void *func, const Gen::OpArg &arg1, u32 param2, u32 param3); +    void ABI_CallFunctionA(const void *func, const Gen::OpArg &arg1); +    void ABI_CallFunctionAA(const void *func, const Gen::OpArg &arg1, const Gen::OpArg &arg2); + +    // Pass a register as a parameter. +    void ABI_CallFunctionR(const void *func, X64Reg reg1); +    void ABI_CallFunctionRR(const void *func, X64Reg reg1, X64Reg reg2); + +    // Typed convenience overload: casts func to const void* and forwards to the +    // untyped ABI_CallFunctionC(const void *, u32). +    template <typename Tr, typename T1> +    void ABI_CallFunctionC(Tr (*func)(T1), u32 param1) { +        ABI_CallFunctionC((const void *)func, param1); +    } + +    // A function that doesn't have any control over what it will do to regs, +    // such as the dispatcher, should be surrounded by these. +    void ABI_PushAllCalleeSavedRegsAndAdjustStack(); +    void ABI_PopAllCalleeSavedRegsAndAdjustStack(); + +    // A function that doesn't know anything about its surroundings should +    // be surrounded by these to establish a safe environment, where it can roam free. +    // An example is a backpatch-injected function. 
+    void ABI_PushAllCallerSavedRegsAndAdjustStack(); +    void ABI_PopAllCallerSavedRegsAndAdjustStack(); + +    unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize); +    void ABI_AlignStack(unsigned int frameSize); +    void ABI_RestoreStack(unsigned int frameSize); + +    // Emit the prologue / epilogue of a __cdecl function. +    // Only x64 really needs the parameter count. +    void ABI_EmitPrologue(int maxCallParams); +    void ABI_EmitEpilogue(int maxCallParams); + +    // Returns 8 when built with _M_IX86 defined, 16 otherwise. +    #ifdef _M_IX86 +    inline int ABI_GetNumXMMRegs() { return 8; } +    #else +    inline int ABI_GetNumXMMRegs() { return 16; } +    #endif +};  // class XEmitter + + +// Everything that needs to generate x86 code should inherit from this. +// You get memory management for free, plus you can use all the MOV etc. functions without +// having to prefix them with gen-> or something similar. + +class XCodeBlock : public CodeBlock<XEmitter> { +public: +    // Overrides a virtual PoisonMemory inherited via CodeBlock<XEmitter>; +    // the implementation is not visible in this header. +    void PoisonMemory() override; +}; + +}  // namespace | 
