diff options
Diffstat (limited to 'src/common/x64')
| -rw-r--r-- | src/common/x64/cpu_detect.cpp | 38 | ||||
| -rw-r--r-- | src/common/x64/cpu_detect.h | 13 | ||||
| -rw-r--r-- | src/common/x64/native_clock.cpp | 103 | ||||
| -rw-r--r-- | src/common/x64/native_clock.h | 48 | ||||
| -rw-r--r-- | src/common/x64/xbyak_abi.h | 229 | ||||
| -rw-r--r-- | src/common/x64/xbyak_util.h | 47 |
6 files changed, 478 insertions, 0 deletions
diff --git a/src/common/x64/cpu_detect.cpp b/src/common/x64/cpu_detect.cpp index c9349a6b4..fccd2eee5 100644 --- a/src/common/x64/cpu_detect.cpp +++ b/src/common/x64/cpu_detect.cpp @@ -62,6 +62,17 @@ static CPUCaps Detect() { std::memcpy(&caps.brand_string[0], &cpu_id[1], sizeof(int)); std::memcpy(&caps.brand_string[4], &cpu_id[3], sizeof(int)); std::memcpy(&caps.brand_string[8], &cpu_id[2], sizeof(int)); + if (cpu_id[1] == 0x756e6547 && cpu_id[2] == 0x6c65746e && cpu_id[3] == 0x49656e69) + caps.manufacturer = Manufacturer::Intel; + else if (cpu_id[1] == 0x68747541 && cpu_id[2] == 0x444d4163 && cpu_id[3] == 0x69746e65) + caps.manufacturer = Manufacturer::AMD; + else if (cpu_id[1] == 0x6f677948 && cpu_id[2] == 0x656e6975 && cpu_id[3] == 0x6e65476e) + caps.manufacturer = Manufacturer::Hygon; + else + caps.manufacturer = Manufacturer::Unknown; + + u32 family = {}; + u32 model = {}; __cpuid(cpu_id, 0x80000000); @@ -73,6 +84,14 @@ static CPUCaps Detect() { // Detect family and other miscellaneous features if (max_std_fn >= 1) { __cpuid(cpu_id, 0x00000001); + family = (cpu_id[0] >> 8) & 0xf; + model = (cpu_id[0] >> 4) & 0xf; + if (family == 0xf) { + family += (cpu_id[0] >> 20) & 0xff; + } + if (family >= 6) { + model += ((cpu_id[0] >> 16) & 0xf) << 4; + } if ((cpu_id[3] >> 25) & 1) caps.sse = true; @@ -110,6 +129,11 @@ static CPUCaps Detect() { caps.bmi1 = true; if ((cpu_id[1] >> 8) & 1) caps.bmi2 = true; + // Checks for AVX512F, AVX512CD, AVX512VL, AVX512DQ, AVX512BW (Intel Skylake-X/SP) + if ((cpu_id[1] >> 16) & 1 && (cpu_id[1] >> 28) & 1 && (cpu_id[1] >> 31) & 1 && + (cpu_id[1] >> 17) & 1 && (cpu_id[1] >> 30) & 1) { + caps.avx512 = caps.avx2; + } } } @@ -130,6 +154,20 @@ static CPUCaps Detect() { caps.fma4 = true; } + if (max_ex_fn >= 0x80000007) { + __cpuid(cpu_id, 0x80000007); + if (cpu_id[3] & (1 << 8)) { + caps.invariant_tsc = true; + } + } + + if (max_std_fn >= 0x16) { + __cpuid(cpu_id, 0x16); + caps.base_frequency = cpu_id[0]; + caps.max_frequency = cpu_id[1]; + caps.bus_frequency = cpu_id[2]; + } + return caps; } diff --git a/src/common/x64/cpu_detect.h b/src/common/x64/cpu_detect.h index 20f2ba234..e3b63302e 100644 --- a/src/common/x64/cpu_detect.h +++ b/src/common/x64/cpu_detect.h @@ -6,8 +6,16 @@ namespace Common { +enum class Manufacturer : u32 { + Intel = 0, + AMD = 1, + Hygon = 2, + Unknown = 3, +}; + /// x86/x64 CPU capabilities that may be detected by this module struct CPUCaps { + Manufacturer manufacturer; char cpu_string[0x21]; char brand_string[0x41]; bool sse; @@ -19,11 +27,16 @@ struct CPUCaps { bool lzcnt; bool avx; bool avx2; + bool avx512; bool bmi1; bool bmi2; bool fma; bool fma4; bool aes; + bool invariant_tsc; + u32 base_frequency; + u32 max_frequency; + u32 bus_frequency; }; /** diff --git a/src/common/x64/native_clock.cpp b/src/common/x64/native_clock.cpp new file mode 100644 index 000000000..424b39b1f --- /dev/null +++ b/src/common/x64/native_clock.cpp @@ -0,0 +1,103 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <chrono> +#include <mutex> +#include <thread> + +#ifdef _MSC_VER +#include <intrin.h> +#else +#include <x86intrin.h> +#endif + +#include "common/uint128.h" +#include "common/x64/native_clock.h" + +namespace Common { + +u64 EstimateRDTSCFrequency() { + const auto milli_10 = std::chrono::milliseconds{10}; + // get current time + _mm_mfence(); + const u64 tscStart = __rdtsc(); + const auto startTime = std::chrono::high_resolution_clock::now(); + // wait roughly 3 seconds + while (true) { + auto milli = std::chrono::duration_cast<std::chrono::milliseconds>( + std::chrono::high_resolution_clock::now() - startTime); + if (milli.count() >= 3000) + break; + std::this_thread::sleep_for(milli_10); + } + const auto endTime = std::chrono::high_resolution_clock::now(); + _mm_mfence(); + const u64 tscEnd = __rdtsc(); + // calculate difference + const u64 timer_diff = + std::chrono::duration_cast<std::chrono::nanoseconds>(endTime - startTime).count(); + const u64 tsc_diff = tscEnd - tscStart; + const u64 tsc_freq = MultiplyAndDivide64(tsc_diff, 1000000000ULL, timer_diff); + return tsc_freq; +} + +namespace X64 { +NativeClock::NativeClock(u64 emulated_cpu_frequency, u64 emulated_clock_frequency, + u64 rtsc_frequency) + : WallClock(emulated_cpu_frequency, emulated_clock_frequency, true), rtsc_frequency{ + rtsc_frequency} { + _mm_mfence(); + last_measure = __rdtsc(); + accumulated_ticks = 0U; +} + +u64 NativeClock::GetRTSC() { + std::scoped_lock scope{rtsc_serialize}; + _mm_mfence(); + const u64 current_measure = __rdtsc(); + u64 diff = current_measure - last_measure; + diff = diff & ~static_cast<u64>(static_cast<s64>(diff) >> 63); // max(diff, 0) + if (current_measure > last_measure) { + last_measure = current_measure; + } + accumulated_ticks += diff; + /// The clock cannot be more precise than the guest timer, remove the lower bits + return accumulated_ticks & inaccuracy_mask; +} + +void NativeClock::Pause(bool is_paused) { + if (!is_paused) { + _mm_mfence(); + last_measure = __rdtsc(); + } +} + +std::chrono::nanoseconds NativeClock::GetTimeNS() { + const u64 rtsc_value = GetRTSC(); + return std::chrono::nanoseconds{MultiplyAndDivide64(rtsc_value, 1000000000, rtsc_frequency)}; +} + +std::chrono::microseconds NativeClock::GetTimeUS() { + const u64 rtsc_value = GetRTSC(); + return std::chrono::microseconds{MultiplyAndDivide64(rtsc_value, 1000000, rtsc_frequency)}; +} + +std::chrono::milliseconds NativeClock::GetTimeMS() { + const u64 rtsc_value = GetRTSC(); + return std::chrono::milliseconds{MultiplyAndDivide64(rtsc_value, 1000, rtsc_frequency)}; +} + +u64 NativeClock::GetClockCycles() { + const u64 rtsc_value = GetRTSC(); + return MultiplyAndDivide64(rtsc_value, emulated_clock_frequency, rtsc_frequency); +} + +u64 NativeClock::GetCPUCycles() { + const u64 rtsc_value = GetRTSC(); + return MultiplyAndDivide64(rtsc_value, emulated_cpu_frequency, rtsc_frequency); +} + +} // namespace X64 + +} // namespace Common diff --git a/src/common/x64/native_clock.h b/src/common/x64/native_clock.h new file mode 100644 index 000000000..97aab6ac9 --- /dev/null +++ b/src/common/x64/native_clock.h @@ -0,0 +1,48 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <optional> + +#include "common/spin_lock.h" +#include "common/wall_clock.h" + +namespace Common { + +namespace X64 { +class NativeClock final : public WallClock { +public: + NativeClock(u64 emulated_cpu_frequency, u64 emulated_clock_frequency, u64 rtsc_frequency); + + std::chrono::nanoseconds GetTimeNS() override; + + std::chrono::microseconds GetTimeUS() override; + + std::chrono::milliseconds GetTimeMS() override; + + u64 GetClockCycles() override; + + u64 GetCPUCycles() override; + + void Pause(bool is_paused) override; + +private: + u64 GetRTSC(); + + /// value used to reduce the native clocks accuracy as some apss rely on + /// undefined behavior where the level of accuracy in the clock shouldn't + /// be higher. + static constexpr u64 inaccuracy_mask = ~(UINT64_C(0x400) - 1); + + SpinLock rtsc_serialize{}; + u64 last_measure{}; + u64 accumulated_ticks{}; + u64 rtsc_frequency; +}; +} // namespace X64 + +u64 EstimateRDTSCFrequency(); + +} // namespace Common diff --git a/src/common/x64/xbyak_abi.h b/src/common/x64/xbyak_abi.h new file mode 100644 index 000000000..26e4bfda5 --- /dev/null +++ b/src/common/x64/xbyak_abi.h @@ -0,0 +1,229 @@ +// Copyright 2016 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <bitset> +#include <initializer_list> +#include <xbyak.h> +#include "common/assert.h" + +namespace Common::X64 { + +constexpr std::size_t RegToIndex(const Xbyak::Reg& reg) { + using Kind = Xbyak::Reg::Kind; + ASSERT_MSG((reg.getKind() & (Kind::REG | Kind::XMM)) != 0, + "RegSet only support GPRs and XMM registers."); + ASSERT_MSG(reg.getIdx() < 16, "RegSet only supports XXM0-15."); + return reg.getIdx() + (reg.getKind() == Kind::REG ? 0 : 16); +} + +constexpr Xbyak::Reg64 IndexToReg64(std::size_t reg_index) { + ASSERT(reg_index < 16); + return Xbyak::Reg64(static_cast<int>(reg_index)); +} + +constexpr Xbyak::Xmm IndexToXmm(std::size_t reg_index) { + ASSERT(reg_index >= 16 && reg_index < 32); + return Xbyak::Xmm(static_cast<int>(reg_index - 16)); +} + +constexpr Xbyak::Reg IndexToReg(std::size_t reg_index) { + if (reg_index < 16) { + return IndexToReg64(reg_index); + } else { + return IndexToXmm(reg_index); + } +} + +inline std::bitset<32> BuildRegSet(std::initializer_list<Xbyak::Reg> regs) { + std::bitset<32> bits; + for (const Xbyak::Reg& reg : regs) { + bits[RegToIndex(reg)] = true; + } + return bits; +} + +constexpr inline std::bitset<32> ABI_ALL_GPRS(0x0000FFFF); +constexpr inline std::bitset<32> ABI_ALL_XMMS(0xFFFF0000); + +#ifdef _WIN32 + +// Microsoft x64 ABI +constexpr inline Xbyak::Reg ABI_RETURN = Xbyak::util::rax; +constexpr inline Xbyak::Reg ABI_PARAM1 = Xbyak::util::rcx; +constexpr inline Xbyak::Reg ABI_PARAM2 = Xbyak::util::rdx; +constexpr inline Xbyak::Reg ABI_PARAM3 = Xbyak::util::r8; +constexpr inline Xbyak::Reg ABI_PARAM4 = Xbyak::util::r9; + +const std::bitset<32> ABI_ALL_CALLER_SAVED = BuildRegSet({ + // GPRs + Xbyak::util::rcx, + Xbyak::util::rdx, + Xbyak::util::r8, + Xbyak::util::r9, + Xbyak::util::r10, + Xbyak::util::r11, + // XMMs + Xbyak::util::xmm0, + Xbyak::util::xmm1, + Xbyak::util::xmm2, + Xbyak::util::xmm3, + Xbyak::util::xmm4, + Xbyak::util::xmm5, +}); + +const std::bitset<32> ABI_ALL_CALLEE_SAVED = BuildRegSet({ + // GPRs + Xbyak::util::rbx, + Xbyak::util::rsi, + Xbyak::util::rdi, + Xbyak::util::rbp, + Xbyak::util::r12, + Xbyak::util::r13, + Xbyak::util::r14, + Xbyak::util::r15, + // XMMs + Xbyak::util::xmm6, + Xbyak::util::xmm7, + Xbyak::util::xmm8, + Xbyak::util::xmm9, + Xbyak::util::xmm10, + Xbyak::util::xmm11, + Xbyak::util::xmm12, + Xbyak::util::xmm13, + Xbyak::util::xmm14, + Xbyak::util::xmm15, +}); + +constexpr size_t ABI_SHADOW_SPACE = 0x20; + +#else + +// System V x86-64 ABI +constexpr inline Xbyak::Reg ABI_RETURN = Xbyak::util::rax; +constexpr inline Xbyak::Reg ABI_PARAM1 = Xbyak::util::rdi; +constexpr inline Xbyak::Reg ABI_PARAM2 = Xbyak::util::rsi; +constexpr inline Xbyak::Reg ABI_PARAM3 = Xbyak::util::rdx; +constexpr inline Xbyak::Reg ABI_PARAM4 = Xbyak::util::rcx; + +const std::bitset<32> ABI_ALL_CALLER_SAVED = BuildRegSet({ + // GPRs + Xbyak::util::rcx, + Xbyak::util::rdx, + Xbyak::util::rdi, + Xbyak::util::rsi, + Xbyak::util::r8, + Xbyak::util::r9, + Xbyak::util::r10, + Xbyak::util::r11, + // XMMs + Xbyak::util::xmm0, + Xbyak::util::xmm1, + Xbyak::util::xmm2, + Xbyak::util::xmm3, + Xbyak::util::xmm4, + Xbyak::util::xmm5, + Xbyak::util::xmm6, + Xbyak::util::xmm7, + Xbyak::util::xmm8, + Xbyak::util::xmm9, + Xbyak::util::xmm10, + Xbyak::util::xmm11, + Xbyak::util::xmm12, + Xbyak::util::xmm13, + Xbyak::util::xmm14, + Xbyak::util::xmm15, +}); + +const std::bitset<32> ABI_ALL_CALLEE_SAVED = BuildRegSet({ + // GPRs + Xbyak::util::rbx, + Xbyak::util::rbp, + Xbyak::util::r12, + Xbyak::util::r13, + Xbyak::util::r14, + Xbyak::util::r15, +}); + +constexpr size_t ABI_SHADOW_SPACE = 0; + +#endif + +struct ABIFrameInfo { + s32 subtraction; + s32 xmm_offset; +}; + +inline ABIFrameInfo ABI_CalculateFrameSize(std::bitset<32> regs, size_t rsp_alignment, + size_t needed_frame_size) { + const auto count = (regs & ABI_ALL_GPRS).count(); + rsp_alignment -= count * 8; + size_t subtraction = 0; + const auto xmm_count = (regs & ABI_ALL_XMMS).count(); + if (xmm_count) { + // If we have any XMMs to save, we must align the stack here. + subtraction = rsp_alignment & 0xF; + } + subtraction += 0x10 * xmm_count; + size_t xmm_base_subtraction = subtraction; + subtraction += needed_frame_size; + subtraction += ABI_SHADOW_SPACE; + // Final alignment. + rsp_alignment -= subtraction; + subtraction += rsp_alignment & 0xF; + + return ABIFrameInfo{static_cast<s32>(subtraction), + static_cast<s32>(subtraction - xmm_base_subtraction)}; +} + +inline size_t ABI_PushRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bitset<32> regs, + size_t rsp_alignment, size_t needed_frame_size = 0) { + auto frame_info = ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size); + + for (std::size_t i = 0; i < regs.size(); ++i) { + if (regs[i] && ABI_ALL_GPRS[i]) { + code.push(IndexToReg64(i)); + } + } + + if (frame_info.subtraction != 0) { + code.sub(code.rsp, frame_info.subtraction); + } + + for (std::size_t i = 0; i < regs.size(); ++i) { + if (regs[i] && ABI_ALL_XMMS[i]) { + code.movaps(code.xword[code.rsp + frame_info.xmm_offset], IndexToXmm(i)); + frame_info.xmm_offset += 0x10; + } + } + + return ABI_SHADOW_SPACE; +} + +inline void ABI_PopRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bitset<32> regs, + size_t rsp_alignment, size_t needed_frame_size = 0) { + auto frame_info = ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size); + + for (std::size_t i = 0; i < regs.size(); ++i) { + if (regs[i] && ABI_ALL_XMMS[i]) { + code.movaps(IndexToXmm(i), code.xword[code.rsp + frame_info.xmm_offset]); + frame_info.xmm_offset += 0x10; + } + } + + if (frame_info.subtraction != 0) { + code.add(code.rsp, frame_info.subtraction); + } + + // GPRs need to be popped in reverse order + for (std::size_t j = 0; j < regs.size(); ++j) { + const std::size_t i = regs.size() - j - 1; + if (regs[i] && ABI_ALL_GPRS[i]) { + code.pop(IndexToReg64(i)); + } + } +} + +} // namespace Common::X64 diff --git a/src/common/x64/xbyak_util.h b/src/common/x64/xbyak_util.h new file mode 100644 index 000000000..df17f8cbe --- /dev/null +++ b/src/common/x64/xbyak_util.h @@ -0,0 +1,47 @@ +// Copyright 2016 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <type_traits> +#include <xbyak.h> +#include "common/x64/xbyak_abi.h" + +namespace Common::X64 { + +// Constants for use with cmpps/cmpss +enum { + CMP_EQ = 0, + CMP_LT = 1, + CMP_LE = 2, + CMP_UNORD = 3, + CMP_NEQ = 4, + CMP_NLT = 5, + CMP_NLE = 6, + CMP_ORD = 7, +}; + +constexpr bool IsWithin2G(uintptr_t ref, uintptr_t target) { + const u64 distance = target - (ref + 5); + return !(distance >= 0x8000'0000ULL && distance <= ~0x8000'0000ULL); +} + +inline bool IsWithin2G(const Xbyak::CodeGenerator& code, uintptr_t target) { + return IsWithin2G(reinterpret_cast<uintptr_t>(code.getCurr()), target); +} + +template <typename T> +inline void CallFarFunction(Xbyak::CodeGenerator& code, const T f) { + static_assert(std::is_pointer_v<T>, "Argument must be a (function) pointer."); + size_t addr = reinterpret_cast<size_t>(f); + if (IsWithin2G(code, addr)) { + code.call(f); + } else { + // ABI_RETURN is a safe temp register to use before a call + code.mov(ABI_RETURN, addr); + code.call(ABI_RETURN); + } +} + +} // namespace Common::X64 |
