diff options
191 files changed, 6972 insertions, 5184 deletions
diff --git a/CMakeModules/CopyYuzuQt5Deps.cmake b/CMakeModules/CopyYuzuQt5Deps.cmake index 1e9810bba..2598b9b60 100644 --- a/CMakeModules/CopyYuzuQt5Deps.cmake +++ b/CMakeModules/CopyYuzuQt5Deps.cmake @@ -6,9 +6,9 @@ function(copy_yuzu_Qt5_deps target_dir) set(Qt5_STYLES_DIR "${Qt5_DIR}/../../../plugins/styles/") set(Qt5_IMAGEFORMATS_DIR "${Qt5_DIR}/../../../plugins/imageformats/") set(Qt5_RESOURCES_DIR "${Qt5_DIR}/../../../resources/") - set(PLATFORMS ${DLL_DEST}platforms/) - set(STYLES ${DLL_DEST}styles/) - set(IMAGEFORMATS ${DLL_DEST}imageformats/) + set(PLATFORMS ${DLL_DEST}plugins/platforms/) + set(STYLES ${DLL_DEST}plugins/styles/) + set(IMAGEFORMATS ${DLL_DEST}plugins/imageformats/) windows_copy_files(${target_dir} ${Qt5_DLL_DIR} ${DLL_DEST} icudt*.dll icuin*.dll @@ -42,11 +42,15 @@ function(copy_yuzu_Qt5_deps target_dir) icudtl.dat ) endif () - windows_copy_files(yuzu ${Qt5_PLATFORMS_DIR} ${PLATFORMS} qwindows$<$<CONFIG:Debug>:d>.*) windows_copy_files(yuzu ${Qt5_STYLES_DIR} ${STYLES} qwindowsvistastyle$<$<CONFIG:Debug>:d>.*) windows_copy_files(yuzu ${Qt5_IMAGEFORMATS_DIR} ${IMAGEFORMATS} qjpeg$<$<CONFIG:Debug>:d>.* qgif$<$<CONFIG:Debug>:d>.* ) + # Create an empty qt.conf file. Qt will detect that this file exists, and use the folder that its in as the root folder. 
+ # This way it'll look for plugins in the root/plugins/ folder + add_custom_command(TARGET yuzu POST_BUILD + COMMAND ${CMAKE_COMMAND} -E touch ${DLL_DEST}qt.conf + ) endfunction(copy_yuzu_Qt5_deps) diff --git a/CMakeModules/GenerateSCMRev.cmake b/CMakeModules/GenerateSCMRev.cmake index fa7ae835f..83e4e9df2 100644 --- a/CMakeModules/GenerateSCMRev.cmake +++ b/CMakeModules/GenerateSCMRev.cmake @@ -57,8 +57,6 @@ set(HASH_FILES "${VIDEO_CORE}/renderer_opengl/gl_shader_decompiler.h" "${VIDEO_CORE}/renderer_opengl/gl_shader_disk_cache.cpp" "${VIDEO_CORE}/renderer_opengl/gl_shader_disk_cache.h" - "${VIDEO_CORE}/renderer_opengl/gl_shader_gen.cpp" - "${VIDEO_CORE}/renderer_opengl/gl_shader_gen.h" "${VIDEO_CORE}/shader/decode/arithmetic.cpp" "${VIDEO_CORE}/shader/decode/arithmetic_half.cpp" "${VIDEO_CORE}/shader/decode/arithmetic_half_immediate.cpp" @@ -91,8 +89,6 @@ set(HASH_FILES "${VIDEO_CORE}/shader/ast.h" "${VIDEO_CORE}/shader/compiler_settings.cpp" "${VIDEO_CORE}/shader/compiler_settings.h" - "${VIDEO_CORE}/shader/const_buffer_locker.cpp" - "${VIDEO_CORE}/shader/const_buffer_locker.h" "${VIDEO_CORE}/shader/control_flow.cpp" "${VIDEO_CORE}/shader/control_flow.h" "${VIDEO_CORE}/shader/decode.cpp" @@ -101,9 +97,13 @@ set(HASH_FILES "${VIDEO_CORE}/shader/node.h" "${VIDEO_CORE}/shader/node_helper.cpp" "${VIDEO_CORE}/shader/node_helper.h" + "${VIDEO_CORE}/shader/registry.cpp" + "${VIDEO_CORE}/shader/registry.h" "${VIDEO_CORE}/shader/shader_ir.cpp" "${VIDEO_CORE}/shader/shader_ir.h" "${VIDEO_CORE}/shader/track.cpp" + "${VIDEO_CORE}/shader/transform_feedback.cpp" + "${VIDEO_CORE}/shader/transform_feedback.h" ) set(COMBINED "") foreach (F IN LISTS HASH_FILES) @@ -1,7 +1,8 @@ yuzu emulator ============= -[](https://travis-ci.org/yuzu-emu/yuzu) +[](https://travis-ci.com/yuzu-emu/yuzu) [](https://dev.azure.com/yuzu-emu/yuzu/) +[](https://discord.gg/XQV6dn9) yuzu is an experimental open-source emulator for the Nintendo Switch from the creators of [Citra](https://citra-emu.org/). 
@@ -21,7 +22,7 @@ For development discussion, please join us on [Discord](https://discord.gg/XQV6d Most of the development happens on GitHub. It's also where [our central repository](https://github.com/yuzu-emu/yuzu) is hosted. -If you want to contribute please take a look at the [Contributor's Guide](CONTRIBUTING.md) and [Developer Information](https://github.com/yuzu-emu/yuzu/wiki/Developer-Information). You should as well contact any of the developers on Discord in order to know about the current state of the emulator. +If you want to contribute please take a look at the [Contributor's Guide](https://github.com/yuzu-emu/yuzu/wiki/Contributing) and [Developer Information](https://github.com/yuzu-emu/yuzu/wiki/Developer-Information). You should also contact any of the developers on Discord in order to know about the current state of the emulator. ### Building diff --git a/externals/microprofile/microprofile.h b/externals/microprofile/microprofile.h index cdb312b87..9d830f7bf 100644 --- a/externals/microprofile/microprofile.h +++ b/externals/microprofile/microprofile.h @@ -243,6 +243,7 @@ typedef uint32_t ThreadIdType; #define MICROPROFILE_DEFINE_GPU(var, name, color) MicroProfileToken g_mp_##var = MicroProfileGetToken("GPU", name, color, MicroProfileTokenTypeGpu) #define MICROPROFILE_TOKEN_PASTE0(a, b) a ## b #define MICROPROFILE_TOKEN_PASTE(a, b) MICROPROFILE_TOKEN_PASTE0(a,b) +#define MICROPROFILE_TOKEN(var) g_mp_##var #define MICROPROFILE_SCOPE(var) MicroProfileScopeHandler MICROPROFILE_TOKEN_PASTE(foo, __LINE__)(g_mp_##var) #define MICROPROFILE_SCOPE_TOKEN(token) MicroProfileScopeHandler MICROPROFILE_TOKEN_PASTE(foo, __LINE__)(token) #define MICROPROFILE_SCOPEI(group, name, color) static MicroProfileToken MICROPROFILE_TOKEN_PASTE(g_mp,__LINE__) = MicroProfileGetToken(group, name, color, MicroProfileTokenTypeCpu); MicroProfileScopeHandler MICROPROFILE_TOKEN_PASTE(foo,__LINE__)( MICROPROFILE_TOKEN_PASTE(g_mp,__LINE__)) @@ -827,7 +828,7 @@ inline 
MicroProfileLogEntry MicroProfileMakeLogIndex(uint64_t nBegin, MicroProfi MicroProfileLogEntry Entry = (nBegin<<62) | ((0x3fff&nToken)<<48) | (MP_LOG_TICK_MASK&nTick); int t = MicroProfileLogType(Entry); uint64_t nTimerIndex = MicroProfileLogTimerIndex(Entry); - MP_ASSERT(t == nBegin); + MP_ASSERT((uint64_t)t == nBegin); MP_ASSERT(nTimerIndex == (nToken&0x3fff)); return Entry; @@ -1555,10 +1556,10 @@ void MicroProfileFlip() pFramePut->nFrameStartCpu = MP_TICK(); pFramePut->nFrameStartGpu = (uint32_t)MicroProfileGpuInsertTimeStamp(); - if(pFrameNext->nFrameStartGpu != (uint64_t)-1) + if(pFrameNext->nFrameStartGpu != -1) pFrameNext->nFrameStartGpu = MicroProfileGpuGetTimeStamp((uint32_t)pFrameNext->nFrameStartGpu); - if(pFrameCurrent->nFrameStartGpu == (uint64_t)-1) + if(pFrameCurrent->nFrameStartGpu == -1) pFrameCurrent->nFrameStartGpu = pFrameNext->nFrameStartGpu + 1; uint64_t nFrameStartCpu = pFrameCurrent->nFrameStartCpu; diff --git a/src/audio_core/algorithm/interpolate.cpp b/src/audio_core/algorithm/interpolate.cpp index a58f24169..49ab9d3e1 100644 --- a/src/audio_core/algorithm/interpolate.cpp +++ b/src/audio_core/algorithm/interpolate.cpp @@ -8,13 +8,14 @@ #include <climits> #include <cmath> #include <vector> + #include "audio_core/algorithm/interpolate.h" #include "common/common_types.h" #include "common/logging/log.h" namespace AudioCore { -constexpr std::array<s16, 512> curve_lut0 = { +constexpr std::array<s16, 512> curve_lut0{ 6600, 19426, 6722, 3, 6479, 19424, 6845, 9, 6359, 19419, 6968, 15, 6239, 19412, 7093, 22, 6121, 19403, 7219, 28, 6004, 19391, 7345, 34, 5888, 19377, 7472, 41, 5773, 19361, 7600, 48, 5659, 19342, 7728, 55, 5546, 19321, 7857, @@ -56,7 +57,7 @@ constexpr std::array<s16, 512> curve_lut0 = { 19403, 6121, 22, 7093, 19412, 6239, 15, 6968, 19419, 6359, 9, 6845, 19424, 6479, 3, 6722, 19426, 6600}; -constexpr std::array<s16, 512> curve_lut1 = { +constexpr std::array<s16, 512> curve_lut1{ -68, 32639, 69, -5, -200, 32630, 212, -15, -328, 32613, 
359, -26, -450, 32586, 512, -36, -568, 32551, 669, -47, -680, 32507, 832, -58, -788, 32454, 1000, -69, -891, 32393, 1174, -80, -990, 32323, 1352, -92, -1084, 32244, 1536, @@ -98,7 +99,7 @@ constexpr std::array<s16, 512> curve_lut1 = { 32551, -568, -36, 512, 32586, -450, -26, 359, 32613, -328, -15, 212, 32630, -200, -5, 69, 32639, -68}; -constexpr std::array<s16, 512> curve_lut2 = { +constexpr std::array<s16, 512> curve_lut2{ 3195, 26287, 3329, -32, 3064, 26281, 3467, -34, 2936, 26270, 3608, -38, 2811, 26253, 3751, -42, 2688, 26230, 3897, -46, 2568, 26202, 4046, -50, 2451, 26169, 4199, -54, 2338, 26130, 4354, -58, 2227, 26085, 4512, -63, 2120, 26035, 4673, @@ -146,10 +147,10 @@ std::vector<s16> Interpolate(InterpolationState& state, std::vector<s16> input, if (ratio <= 0) { LOG_CRITICAL(Audio, "Nonsensical interpolation ratio {}", ratio); - ratio = 1.0; + return input; } - const int step = static_cast<int>(ratio * 0x8000); + const s32 step{static_cast<s32>(ratio * 0x8000)}; const std::array<s16, 512>& lut = [step] { if (step > 0xaaaa) { return curve_lut0; @@ -160,28 +161,37 @@ std::vector<s16> Interpolate(InterpolationState& state, std::vector<s16> input, return curve_lut2; }(); - std::vector<s16> output(static_cast<std::size_t>(input.size() / ratio)); - int in_offset = 0; - for (std::size_t out_offset = 0; out_offset < output.size(); out_offset += 2) { - const int lut_index = (state.fraction >> 8) * 4; + const std::size_t num_frames{input.size() / 2}; + + std::vector<s16> output; + output.reserve(static_cast<std::size_t>(input.size() / ratio + InterpolationState::taps)); - const int l = input[(in_offset + 0) * 2 + 0] * lut[lut_index + 0] + - input[(in_offset + 1) * 2 + 0] * lut[lut_index + 1] + - input[(in_offset + 2) * 2 + 0] * lut[lut_index + 2] + - input[(in_offset + 3) * 2 + 0] * lut[lut_index + 3]; + for (std::size_t frame{}; frame < num_frames; ++frame) { + const std::size_t lut_index{(state.fraction >> 8) * InterpolationState::taps}; - const int r = 
input[(in_offset + 0) * 2 + 1] * lut[lut_index + 0] + - input[(in_offset + 1) * 2 + 1] * lut[lut_index + 1] + - input[(in_offset + 2) * 2 + 1] * lut[lut_index + 2] + - input[(in_offset + 3) * 2 + 1] * lut[lut_index + 3]; + std::rotate(state.history.begin(), state.history.end() - 1, state.history.end()); + state.history[0][0] = input[frame * 2 + 0]; + state.history[0][1] = input[frame * 2 + 1]; - const int new_offset = state.fraction + step; + while (state.position <= 1.0) { + const s32 left{state.history[0][0] * lut[lut_index + 0] + + state.history[1][0] * lut[lut_index + 1] + + state.history[2][0] * lut[lut_index + 2] + + state.history[3][0] * lut[lut_index + 3]}; + const s32 right{state.history[0][1] * lut[lut_index + 0] + + state.history[1][1] * lut[lut_index + 1] + + state.history[2][1] * lut[lut_index + 2] + + state.history[3][1] * lut[lut_index + 3]}; + const s32 new_offset{state.fraction + step}; - in_offset += new_offset >> 15; - state.fraction = new_offset & 0x7fff; + state.fraction = new_offset & 0x7fff; - output[out_offset + 0] = static_cast<s16>(std::clamp(l >> 15, SHRT_MIN, SHRT_MAX)); - output[out_offset + 1] = static_cast<s16>(std::clamp(r >> 15, SHRT_MIN, SHRT_MAX)); + output.emplace_back(static_cast<s16>(std::clamp(left >> 15, SHRT_MIN, SHRT_MAX))); + output.emplace_back(static_cast<s16>(std::clamp(right >> 15, SHRT_MIN, SHRT_MAX))); + + state.position += ratio; + } + state.position -= 1.0; } return output; diff --git a/src/audio_core/algorithm/interpolate.h b/src/audio_core/algorithm/interpolate.h index 1b9831a75..ab1a31754 100644 --- a/src/audio_core/algorithm/interpolate.h +++ b/src/audio_core/algorithm/interpolate.h @@ -6,12 +6,17 @@ #include <array> #include <vector> + #include "common/common_types.h" namespace AudioCore { struct InterpolationState { - int fraction = 0; + static constexpr std::size_t taps{4}; + static constexpr std::size_t history_size{taps * 2 - 1}; + std::array<std::array<s16, 2>, history_size> history{}; + double 
position{}; + s32 fraction{}; }; /// Interpolates input signal to produce output signal. diff --git a/src/audio_core/cubeb_sink.cpp b/src/audio_core/cubeb_sink.cpp index 7047ed9cf..c4e0e30fe 100644 --- a/src/audio_core/cubeb_sink.cpp +++ b/src/audio_core/cubeb_sink.cpp @@ -8,6 +8,7 @@ #include "audio_core/cubeb_sink.h" #include "audio_core/stream.h" #include "audio_core/time_stretch.h" +#include "common/assert.h" #include "common/logging/log.h" #include "common/ring_buffer.h" #include "core/settings.h" @@ -65,12 +66,25 @@ public: void EnqueueSamples(u32 source_num_channels, const std::vector<s16>& samples) override { if (source_num_channels > num_channels) { // Downsample 6 channels to 2 + ASSERT_MSG(source_num_channels == 6, "Channel count must be 6"); + std::vector<s16> buf; buf.reserve(samples.size() * num_channels / source_num_channels); for (std::size_t i = 0; i < samples.size(); i += source_num_channels) { - for (std::size_t ch = 0; ch < num_channels; ch++) { - buf.push_back(samples[i + ch]); - } + // Downmixing implementation taken from the ATSC standard + const s16 left{samples[i + 0]}; + const s16 right{samples[i + 1]}; + const s16 center{samples[i + 2]}; + const s16 surround_left{samples[i + 4]}; + const s16 surround_right{samples[i + 5]}; + // Not used in the ATSC reference implementation + [[maybe_unused]] const s16 low_frequency_effects { samples[i + 3] }; + + constexpr s32 clev{707}; // center mixing level coefficient + constexpr s32 slev{707}; // surround mixing level coefficient + + buf.push_back(left + (clev * center / 1000) + (slev * surround_left / 1000)); + buf.push_back(right + (clev * center / 1000) + (slev * surround_right / 1000)); } queue.Push(buf); return; diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index 9afc6105d..fbebed715 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -38,8 +38,6 @@ add_custom_command(OUTPUT scm_rev.cpp "${VIDEO_CORE}/renderer_opengl/gl_shader_decompiler.h" 
"${VIDEO_CORE}/renderer_opengl/gl_shader_disk_cache.cpp" "${VIDEO_CORE}/renderer_opengl/gl_shader_disk_cache.h" - "${VIDEO_CORE}/renderer_opengl/gl_shader_gen.cpp" - "${VIDEO_CORE}/renderer_opengl/gl_shader_gen.h" "${VIDEO_CORE}/shader/decode/arithmetic.cpp" "${VIDEO_CORE}/shader/decode/arithmetic_half.cpp" "${VIDEO_CORE}/shader/decode/arithmetic_half_immediate.cpp" @@ -72,8 +70,6 @@ add_custom_command(OUTPUT scm_rev.cpp "${VIDEO_CORE}/shader/ast.h" "${VIDEO_CORE}/shader/compiler_settings.cpp" "${VIDEO_CORE}/shader/compiler_settings.h" - "${VIDEO_CORE}/shader/const_buffer_locker.cpp" - "${VIDEO_CORE}/shader/const_buffer_locker.h" "${VIDEO_CORE}/shader/control_flow.cpp" "${VIDEO_CORE}/shader/control_flow.h" "${VIDEO_CORE}/shader/decode.cpp" @@ -82,9 +78,13 @@ add_custom_command(OUTPUT scm_rev.cpp "${VIDEO_CORE}/shader/node.h" "${VIDEO_CORE}/shader/node_helper.cpp" "${VIDEO_CORE}/shader/node_helper.h" + "${VIDEO_CORE}/shader/registry.cpp" + "${VIDEO_CORE}/shader/registry.h" "${VIDEO_CORE}/shader/shader_ir.cpp" "${VIDEO_CORE}/shader/shader_ir.h" "${VIDEO_CORE}/shader/track.cpp" + "${VIDEO_CORE}/shader/transform_feedback.cpp" + "${VIDEO_CORE}/shader/transform_feedback.h" # and also check that the scm_rev files haven't changed "${CMAKE_CURRENT_SOURCE_DIR}/scm_rev.cpp.in" "${CMAKE_CURRENT_SOURCE_DIR}/scm_rev.h" diff --git a/src/common/math_util.h b/src/common/math_util.h index d6c35ee89..83ef0201f 100644 --- a/src/common/math_util.h +++ b/src/common/math_util.h @@ -24,17 +24,29 @@ struct Rectangle { : left(left), top(top), right(right), bottom(bottom) {} T GetWidth() const { - return std::abs(static_cast<std::make_signed_t<T>>(right - left)); + if constexpr (std::is_floating_point_v<T>) { + return std::abs(right - left); + } else { + return std::abs(static_cast<std::make_signed_t<T>>(right - left)); + } } + T GetHeight() const { - return std::abs(static_cast<std::make_signed_t<T>>(bottom - top)); + if constexpr (std::is_floating_point_v<T>) { + return std::abs(bottom - 
top); + } else { + return std::abs(static_cast<std::make_signed_t<T>>(bottom - top)); + } } + Rectangle<T> TranslateX(const T x) const { return Rectangle{left + x, top, right + x, bottom}; } + Rectangle<T> TranslateY(const T y) const { return Rectangle{left, top + y, right, bottom + y}; } + Rectangle<T> Scale(const float s) const { return Rectangle{left, top, static_cast<T>(left + GetWidth() * s), static_cast<T>(top + GetHeight() * s)}; diff --git a/src/common/page_table.cpp b/src/common/page_table.cpp index 69b7abc54..566b57b62 100644 --- a/src/common/page_table.cpp +++ b/src/common/page_table.cpp @@ -16,7 +16,6 @@ void PageTable::Resize(std::size_t address_space_width_in_bits) { pointers.resize(num_page_table_entries); attributes.resize(num_page_table_entries); - backing_addr.resize(num_page_table_entries); // The default is a 39-bit address space, which causes an initial 1GB allocation size. If the // vector size is subsequently decreased (via resize), the vector might not automatically @@ -25,6 +24,17 @@ void PageTable::Resize(std::size_t address_space_width_in_bits) { pointers.shrink_to_fit(); attributes.shrink_to_fit(); +} + +BackingPageTable::BackingPageTable(std::size_t page_size_in_bits) : PageTable{page_size_in_bits} {} + +BackingPageTable::~BackingPageTable() = default; + +void BackingPageTable::Resize(std::size_t address_space_width_in_bits) { + PageTable::Resize(address_space_width_in_bits); + const std::size_t num_page_table_entries = 1ULL + << (address_space_width_in_bits - page_size_in_bits); + backing_addr.resize(num_page_table_entries); backing_addr.shrink_to_fit(); } diff --git a/src/common/page_table.h b/src/common/page_table.h index 8b8ff0bb8..dbc272ab7 100644 --- a/src/common/page_table.h +++ b/src/common/page_table.h @@ -76,9 +76,20 @@ struct PageTable { */ std::vector<PageType> attributes; - std::vector<u64> backing_addr; - const std::size_t page_size_in_bits{}; }; +/** + * A more advanced Page Table with the ability to save a backing 
address when using it + * depends on another MMU. + */ +struct BackingPageTable : PageTable { + explicit BackingPageTable(std::size_t page_size_in_bits); + ~BackingPageTable(); + + void Resize(std::size_t address_space_width_in_bits); + + std::vector<u64> backing_addr; +}; + } // namespace Common diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 88c06b2ce..b31a0328c 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -131,8 +131,8 @@ add_library(core STATIC frontend/framebuffer_layout.cpp frontend/framebuffer_layout.h frontend/input.h - frontend/scope_acquire_window_context.cpp - frontend/scope_acquire_window_context.h + frontend/scope_acquire_context.cpp + frontend/scope_acquire_context.h gdbstub/gdbstub.cpp gdbstub/gdbstub.h hardware_interrupt_manager.cpp @@ -595,8 +595,12 @@ endif() if (ARCHITECTURE_x86_64) target_sources(core PRIVATE - arm/dynarmic/arm_dynarmic.cpp - arm/dynarmic/arm_dynarmic.h + arm/dynarmic/arm_dynarmic_32.cpp + arm/dynarmic/arm_dynarmic_32.h + arm/dynarmic/arm_dynarmic_64.cpp + arm/dynarmic/arm_dynarmic_64.h + arm/dynarmic/arm_dynarmic_cp15.cpp + arm/dynarmic/arm_dynarmic_cp15.h ) target_link_libraries(core PRIVATE dynarmic) endif() diff --git a/src/core/arm/arm_interface.h b/src/core/arm/arm_interface.h index 47b964eb7..57eae839e 100644 --- a/src/core/arm/arm_interface.h +++ b/src/core/arm/arm_interface.h @@ -25,7 +25,20 @@ public: explicit ARM_Interface(System& system_) : system{system_} {} virtual ~ARM_Interface() = default; - struct ThreadContext { + struct ThreadContext32 { + std::array<u32, 16> cpu_registers; + u32 cpsr; + std::array<u8, 4> padding; + std::array<u64, 32> fprs; + u32 fpscr; + u32 fpexc; + u32 tpidr; + }; + // Internally within the kernel, it expects the AArch32 version of the + // thread context to be 344 bytes in size. 
+ static_assert(sizeof(ThreadContext32) == 0x158); + + struct ThreadContext64 { std::array<u64, 31> cpu_registers; u64 sp; u64 pc; @@ -38,7 +51,7 @@ public: }; // Internally within the kernel, it expects the AArch64 version of the // thread context to be 800 bytes in size. - static_assert(sizeof(ThreadContext) == 0x320); + static_assert(sizeof(ThreadContext64) == 0x320); /// Runs the CPU until an event happens virtual void Run() = 0; @@ -130,17 +143,10 @@ public: */ virtual void SetTPIDR_EL0(u64 value) = 0; - /** - * Saves the current CPU context - * @param ctx Thread context to save - */ - virtual void SaveContext(ThreadContext& ctx) = 0; - - /** - * Loads a CPU context - * @param ctx Thread context to load - */ - virtual void LoadContext(const ThreadContext& ctx) = 0; + virtual void SaveContext(ThreadContext32& ctx) = 0; + virtual void SaveContext(ThreadContext64& ctx) = 0; + virtual void LoadContext(const ThreadContext32& ctx) = 0; + virtual void LoadContext(const ThreadContext64& ctx) = 0; /// Clears the exclusive monitor's state. virtual void ClearExclusiveState() = 0; diff --git a/src/core/arm/dynarmic/arm_dynarmic_32.cpp b/src/core/arm/dynarmic/arm_dynarmic_32.cpp new file mode 100644 index 000000000..187a972ac --- /dev/null +++ b/src/core/arm/dynarmic/arm_dynarmic_32.cpp @@ -0,0 +1,208 @@ +// Copyright 2020 yuzu emulator team +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include <cinttypes> +#include <memory> +#include <dynarmic/A32/a32.h> +#include <dynarmic/A32/config.h> +#include <dynarmic/A32/context.h> +#include "common/microprofile.h" +#include "core/arm/dynarmic/arm_dynarmic_32.h" +#include "core/arm/dynarmic/arm_dynarmic_64.h" +#include "core/arm/dynarmic/arm_dynarmic_cp15.h" +#include "core/core.h" +#include "core/core_manager.h" +#include "core/core_timing.h" +#include "core/hle/kernel/svc.h" +#include "core/memory.h" + +namespace Core { + +class DynarmicCallbacks32 : public Dynarmic::A32::UserCallbacks { +public: + explicit DynarmicCallbacks32(ARM_Dynarmic_32& parent) : parent(parent) {} + + u8 MemoryRead8(u32 vaddr) override { + return parent.system.Memory().Read8(vaddr); + } + u16 MemoryRead16(u32 vaddr) override { + return parent.system.Memory().Read16(vaddr); + } + u32 MemoryRead32(u32 vaddr) override { + return parent.system.Memory().Read32(vaddr); + } + u64 MemoryRead64(u32 vaddr) override { + return parent.system.Memory().Read64(vaddr); + } + + void MemoryWrite8(u32 vaddr, u8 value) override { + parent.system.Memory().Write8(vaddr, value); + } + void MemoryWrite16(u32 vaddr, u16 value) override { + parent.system.Memory().Write16(vaddr, value); + } + void MemoryWrite32(u32 vaddr, u32 value) override { + parent.system.Memory().Write32(vaddr, value); + } + void MemoryWrite64(u32 vaddr, u64 value) override { + parent.system.Memory().Write64(vaddr, value); + } + + void InterpreterFallback(u32 pc, std::size_t num_instructions) override { + UNIMPLEMENTED(); + } + + void ExceptionRaised(u32 pc, Dynarmic::A32::Exception exception) override { + switch (exception) { + case Dynarmic::A32::Exception::UndefinedInstruction: + case Dynarmic::A32::Exception::UnpredictableInstruction: + break; + case Dynarmic::A32::Exception::Breakpoint: + break; + } + LOG_CRITICAL(HW_GPU, "ExceptionRaised(exception = {}, pc = {:08X}, code = {:08X})", + static_cast<std::size_t>(exception), pc, MemoryReadCode(pc)); + UNIMPLEMENTED(); + } + + 
void CallSVC(u32 swi) override { + Kernel::CallSVC(parent.system, swi); + } + + void AddTicks(u64 ticks) override { + // Divide the number of ticks by the amount of CPU cores. TODO(Subv): This yields only a + // rough approximation of the amount of executed ticks in the system, it may be thrown off + // if not all cores are doing a similar amount of work. Instead of doing this, we should + // device a way so that timing is consistent across all cores without increasing the ticks 4 + // times. + u64 amortized_ticks = (ticks - num_interpreted_instructions) / Core::NUM_CPU_CORES; + // Always execute at least one tick. + amortized_ticks = std::max<u64>(amortized_ticks, 1); + + parent.system.CoreTiming().AddTicks(amortized_ticks); + num_interpreted_instructions = 0; + } + u64 GetTicksRemaining() override { + return std::max(parent.system.CoreTiming().GetDowncount(), {}); + } + + ARM_Dynarmic_32& parent; + std::size_t num_interpreted_instructions{}; + u64 tpidrro_el0{}; + u64 tpidr_el0{}; +}; + +std::shared_ptr<Dynarmic::A32::Jit> ARM_Dynarmic_32::MakeJit(Common::PageTable& page_table, + std::size_t address_space_bits) const { + Dynarmic::A32::UserConfig config; + config.callbacks = cb.get(); + // TODO(bunnei): Implement page table for 32-bit + // config.page_table = &page_table.pointers; + config.coprocessors[15] = std::make_shared<DynarmicCP15>((u32*)&CP15_regs[0]); + config.define_unpredictable_behaviour = true; + return std::make_unique<Dynarmic::A32::Jit>(config); +} + +MICROPROFILE_DEFINE(ARM_Jit_Dynarmic_32, "ARM JIT", "Dynarmic", MP_RGB(255, 64, 64)); + +void ARM_Dynarmic_32::Run() { + MICROPROFILE_SCOPE(ARM_Jit_Dynarmic_32); + jit->Run(); +} + +void ARM_Dynarmic_32::Step() { + cb->InterpreterFallback(jit->Regs()[15], 1); +} + +ARM_Dynarmic_32::ARM_Dynarmic_32(System& system, ExclusiveMonitor& exclusive_monitor, + std::size_t core_index) + : ARM_Interface{system}, + cb(std::make_unique<DynarmicCallbacks32>(*this)), core_index{core_index}, + 
exclusive_monitor{dynamic_cast<DynarmicExclusiveMonitor&>(exclusive_monitor)} {} + +ARM_Dynarmic_32::~ARM_Dynarmic_32() = default; + +void ARM_Dynarmic_32::SetPC(u64 pc) { + jit->Regs()[15] = static_cast<u32>(pc); +} + +u64 ARM_Dynarmic_32::GetPC() const { + return jit->Regs()[15]; +} + +u64 ARM_Dynarmic_32::GetReg(int index) const { + return jit->Regs()[index]; +} + +void ARM_Dynarmic_32::SetReg(int index, u64 value) { + jit->Regs()[index] = static_cast<u32>(value); +} + +u128 ARM_Dynarmic_32::GetVectorReg(int index) const { + return {}; +} + +void ARM_Dynarmic_32::SetVectorReg(int index, u128 value) {} + +u32 ARM_Dynarmic_32::GetPSTATE() const { + return jit->Cpsr(); +} + +void ARM_Dynarmic_32::SetPSTATE(u32 cpsr) { + jit->SetCpsr(cpsr); +} + +u64 ARM_Dynarmic_32::GetTlsAddress() const { + return CP15_regs[static_cast<std::size_t>(CP15Register::CP15_THREAD_URO)]; +} + +void ARM_Dynarmic_32::SetTlsAddress(VAddr address) { + CP15_regs[static_cast<std::size_t>(CP15Register::CP15_THREAD_URO)] = static_cast<u32>(address); +} + +u64 ARM_Dynarmic_32::GetTPIDR_EL0() const { + return cb->tpidr_el0; +} + +void ARM_Dynarmic_32::SetTPIDR_EL0(u64 value) { + cb->tpidr_el0 = value; +} + +void ARM_Dynarmic_32::SaveContext(ThreadContext32& ctx) { + Dynarmic::A32::Context context; + jit->SaveContext(context); + ctx.cpu_registers = context.Regs(); + ctx.cpsr = context.Cpsr(); +} + +void ARM_Dynarmic_32::LoadContext(const ThreadContext32& ctx) { + Dynarmic::A32::Context context; + context.Regs() = ctx.cpu_registers; + context.SetCpsr(ctx.cpsr); + jit->LoadContext(context); +} + +void ARM_Dynarmic_32::PrepareReschedule() { + jit->HaltExecution(); +} + +void ARM_Dynarmic_32::ClearInstructionCache() { + jit->ClearCache(); +} + +void ARM_Dynarmic_32::ClearExclusiveState() {} + +void ARM_Dynarmic_32::PageTableChanged(Common::PageTable& page_table, + std::size_t new_address_space_size_in_bits) { + auto key = std::make_pair(&page_table, new_address_space_size_in_bits); + auto iter = 
jit_cache.find(key); + if (iter != jit_cache.end()) { + jit = iter->second; + return; + } + jit = MakeJit(page_table, new_address_space_size_in_bits); + jit_cache.emplace(key, jit); +} + +} // namespace Core diff --git a/src/core/arm/dynarmic/arm_dynarmic_32.h b/src/core/arm/dynarmic/arm_dynarmic_32.h new file mode 100644 index 000000000..143e46e4d --- /dev/null +++ b/src/core/arm/dynarmic/arm_dynarmic_32.h @@ -0,0 +1,77 @@ +// Copyright 2020 yuzu emulator team +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <memory> +#include <unordered_map> + +#include <dynarmic/A32/a32.h> +#include <dynarmic/A64/a64.h> +#include <dynarmic/A64/exclusive_monitor.h> +#include "common/common_types.h" +#include "common/hash.h" +#include "core/arm/arm_interface.h" +#include "core/arm/exclusive_monitor.h" + +namespace Memory { +class Memory; +} + +namespace Core { + +class DynarmicCallbacks32; +class DynarmicExclusiveMonitor; +class System; + +class ARM_Dynarmic_32 final : public ARM_Interface { +public: + ARM_Dynarmic_32(System& system, ExclusiveMonitor& exclusive_monitor, std::size_t core_index); + ~ARM_Dynarmic_32() override; + + void SetPC(u64 pc) override; + u64 GetPC() const override; + u64 GetReg(int index) const override; + void SetReg(int index, u64 value) override; + u128 GetVectorReg(int index) const override; + void SetVectorReg(int index, u128 value) override; + u32 GetPSTATE() const override; + void SetPSTATE(u32 pstate) override; + void Run() override; + void Step() override; + VAddr GetTlsAddress() const override; + void SetTlsAddress(VAddr address) override; + void SetTPIDR_EL0(u64 value) override; + u64 GetTPIDR_EL0() const override; + + void SaveContext(ThreadContext32& ctx) override; + void SaveContext(ThreadContext64& ctx) override {} + void LoadContext(const ThreadContext32& ctx) override; + void LoadContext(const ThreadContext64& ctx) override {} + + void PrepareReschedule() override; + void 
ClearExclusiveState() override; + + void ClearInstructionCache() override; + void PageTableChanged(Common::PageTable& new_page_table, + std::size_t new_address_space_size_in_bits) override; + +private: + std::shared_ptr<Dynarmic::A32::Jit> MakeJit(Common::PageTable& page_table, + std::size_t address_space_bits) const; + + using JitCacheKey = std::pair<Common::PageTable*, std::size_t>; + using JitCacheType = + std::unordered_map<JitCacheKey, std::shared_ptr<Dynarmic::A32::Jit>, Common::PairHash>; + + friend class DynarmicCallbacks32; + std::unique_ptr<DynarmicCallbacks32> cb; + JitCacheType jit_cache; + std::shared_ptr<Dynarmic::A32::Jit> jit; + std::size_t core_index; + DynarmicExclusiveMonitor& exclusive_monitor; + std::array<u32, 84> CP15_regs{}; +}; + +} // namespace Core diff --git a/src/core/arm/dynarmic/arm_dynarmic.cpp b/src/core/arm/dynarmic/arm_dynarmic_64.cpp index 29eaf74e5..a53a58ba0 100644 --- a/src/core/arm/dynarmic/arm_dynarmic.cpp +++ b/src/core/arm/dynarmic/arm_dynarmic_64.cpp @@ -8,7 +8,7 @@ #include <dynarmic/A64/config.h> #include "common/logging/log.h" #include "common/microprofile.h" -#include "core/arm/dynarmic/arm_dynarmic.h" +#include "core/arm/dynarmic/arm_dynarmic_64.h" #include "core/core.h" #include "core/core_manager.h" #include "core/core_timing.h" @@ -25,9 +25,9 @@ namespace Core { using Vector = Dynarmic::A64::Vector; -class ARM_Dynarmic_Callbacks : public Dynarmic::A64::UserCallbacks { +class DynarmicCallbacks64 : public Dynarmic::A64::UserCallbacks { public: - explicit ARM_Dynarmic_Callbacks(ARM_Dynarmic& parent) : parent(parent) {} + explicit DynarmicCallbacks64(ARM_Dynarmic_64& parent) : parent(parent) {} u8 MemoryRead8(u64 vaddr) override { return parent.system.Memory().Read8(vaddr); @@ -68,7 +68,7 @@ public: LOG_INFO(Core_ARM, "Unicorn fallback @ 0x{:X} for {} instructions (instr = {:08X})", pc, num_instructions, MemoryReadCode(pc)); - ARM_Interface::ThreadContext ctx; + ARM_Interface::ThreadContext64 ctx; 
parent.SaveContext(ctx); parent.inner_unicorn.LoadContext(ctx); parent.inner_unicorn.ExecuteInstructions(num_instructions); @@ -90,7 +90,7 @@ public: parent.jit->HaltExecution(); parent.SetPC(pc); Kernel::Thread* const thread = parent.system.CurrentScheduler().GetCurrentThread(); - parent.SaveContext(thread->GetContext()); + parent.SaveContext(thread->GetContext64()); GDBStub::Break(); GDBStub::SendTrap(thread, 5); return; @@ -126,14 +126,14 @@ public: return Timing::CpuCyclesToClockCycles(parent.system.CoreTiming().GetTicks()); } - ARM_Dynarmic& parent; + ARM_Dynarmic_64& parent; std::size_t num_interpreted_instructions = 0; u64 tpidrro_el0 = 0; u64 tpidr_el0 = 0; }; -std::unique_ptr<Dynarmic::A64::Jit> ARM_Dynarmic::MakeJit(Common::PageTable& page_table, - std::size_t address_space_bits) const { +std::shared_ptr<Dynarmic::A64::Jit> ARM_Dynarmic_64::MakeJit(Common::PageTable& page_table, + std::size_t address_space_bits) const { Dynarmic::A64::UserConfig config; // Callbacks @@ -159,79 +159,79 @@ std::unique_ptr<Dynarmic::A64::Jit> ARM_Dynarmic::MakeJit(Common::PageTable& pag // Unpredictable instructions config.define_unpredictable_behaviour = true; - return std::make_unique<Dynarmic::A64::Jit>(config); + return std::make_shared<Dynarmic::A64::Jit>(config); } -MICROPROFILE_DEFINE(ARM_Jit_Dynarmic, "ARM JIT", "Dynarmic", MP_RGB(255, 64, 64)); +MICROPROFILE_DEFINE(ARM_Jit_Dynarmic_64, "ARM JIT", "Dynarmic", MP_RGB(255, 64, 64)); -void ARM_Dynarmic::Run() { - MICROPROFILE_SCOPE(ARM_Jit_Dynarmic); +void ARM_Dynarmic_64::Run() { + MICROPROFILE_SCOPE(ARM_Jit_Dynarmic_64); jit->Run(); } -void ARM_Dynarmic::Step() { +void ARM_Dynarmic_64::Step() { cb->InterpreterFallback(jit->GetPC(), 1); } -ARM_Dynarmic::ARM_Dynarmic(System& system, ExclusiveMonitor& exclusive_monitor, - std::size_t core_index) +ARM_Dynarmic_64::ARM_Dynarmic_64(System& system, ExclusiveMonitor& exclusive_monitor, + std::size_t core_index) : ARM_Interface{system}, - 
cb(std::make_unique<ARM_Dynarmic_Callbacks>(*this)), inner_unicorn{system}, + cb(std::make_unique<DynarmicCallbacks64>(*this)), inner_unicorn{system}, core_index{core_index}, exclusive_monitor{ dynamic_cast<DynarmicExclusiveMonitor&>(exclusive_monitor)} {} -ARM_Dynarmic::~ARM_Dynarmic() = default; +ARM_Dynarmic_64::~ARM_Dynarmic_64() = default; -void ARM_Dynarmic::SetPC(u64 pc) { +void ARM_Dynarmic_64::SetPC(u64 pc) { jit->SetPC(pc); } -u64 ARM_Dynarmic::GetPC() const { +u64 ARM_Dynarmic_64::GetPC() const { return jit->GetPC(); } -u64 ARM_Dynarmic::GetReg(int index) const { +u64 ARM_Dynarmic_64::GetReg(int index) const { return jit->GetRegister(index); } -void ARM_Dynarmic::SetReg(int index, u64 value) { +void ARM_Dynarmic_64::SetReg(int index, u64 value) { jit->SetRegister(index, value); } -u128 ARM_Dynarmic::GetVectorReg(int index) const { +u128 ARM_Dynarmic_64::GetVectorReg(int index) const { return jit->GetVector(index); } -void ARM_Dynarmic::SetVectorReg(int index, u128 value) { +void ARM_Dynarmic_64::SetVectorReg(int index, u128 value) { jit->SetVector(index, value); } -u32 ARM_Dynarmic::GetPSTATE() const { +u32 ARM_Dynarmic_64::GetPSTATE() const { return jit->GetPstate(); } -void ARM_Dynarmic::SetPSTATE(u32 pstate) { +void ARM_Dynarmic_64::SetPSTATE(u32 pstate) { jit->SetPstate(pstate); } -u64 ARM_Dynarmic::GetTlsAddress() const { +u64 ARM_Dynarmic_64::GetTlsAddress() const { return cb->tpidrro_el0; } -void ARM_Dynarmic::SetTlsAddress(VAddr address) { +void ARM_Dynarmic_64::SetTlsAddress(VAddr address) { cb->tpidrro_el0 = address; } -u64 ARM_Dynarmic::GetTPIDR_EL0() const { +u64 ARM_Dynarmic_64::GetTPIDR_EL0() const { return cb->tpidr_el0; } -void ARM_Dynarmic::SetTPIDR_EL0(u64 value) { +void ARM_Dynarmic_64::SetTPIDR_EL0(u64 value) { cb->tpidr_el0 = value; } -void ARM_Dynarmic::SaveContext(ThreadContext& ctx) { +void ARM_Dynarmic_64::SaveContext(ThreadContext64& ctx) { ctx.cpu_registers = jit->GetRegisters(); ctx.sp = jit->GetSP(); ctx.pc = jit->GetPC(); @@ 
-242,7 +242,7 @@ void ARM_Dynarmic::SaveContext(ThreadContext& ctx) { ctx.tpidr = cb->tpidr_el0; } -void ARM_Dynarmic::LoadContext(const ThreadContext& ctx) { +void ARM_Dynarmic_64::LoadContext(const ThreadContext64& ctx) { jit->SetRegisters(ctx.cpu_registers); jit->SetSP(ctx.sp); jit->SetPC(ctx.pc); @@ -253,25 +253,32 @@ void ARM_Dynarmic::LoadContext(const ThreadContext& ctx) { SetTPIDR_EL0(ctx.tpidr); } -void ARM_Dynarmic::PrepareReschedule() { +void ARM_Dynarmic_64::PrepareReschedule() { jit->HaltExecution(); } -void ARM_Dynarmic::ClearInstructionCache() { +void ARM_Dynarmic_64::ClearInstructionCache() { jit->ClearCache(); } -void ARM_Dynarmic::ClearExclusiveState() { +void ARM_Dynarmic_64::ClearExclusiveState() { jit->ClearExclusiveState(); } -void ARM_Dynarmic::PageTableChanged(Common::PageTable& page_table, - std::size_t new_address_space_size_in_bits) { +void ARM_Dynarmic_64::PageTableChanged(Common::PageTable& page_table, + std::size_t new_address_space_size_in_bits) { + auto key = std::make_pair(&page_table, new_address_space_size_in_bits); + auto iter = jit_cache.find(key); + if (iter != jit_cache.end()) { + jit = iter->second; + return; + } jit = MakeJit(page_table, new_address_space_size_in_bits); + jit_cache.emplace(key, jit); } -DynarmicExclusiveMonitor::DynarmicExclusiveMonitor(Memory::Memory& memory_, std::size_t core_count) - : monitor(core_count), memory{memory_} {} +DynarmicExclusiveMonitor::DynarmicExclusiveMonitor(Memory::Memory& memory, std::size_t core_count) + : monitor(core_count), memory{memory} {} DynarmicExclusiveMonitor::~DynarmicExclusiveMonitor() = default; diff --git a/src/core/arm/dynarmic/arm_dynarmic.h b/src/core/arm/dynarmic/arm_dynarmic_64.h index 9cd475cfb..e71240a96 100644 --- a/src/core/arm/dynarmic/arm_dynarmic.h +++ b/src/core/arm/dynarmic/arm_dynarmic_64.h @@ -5,9 +5,12 @@ #pragma once #include <memory> +#include <unordered_map> + #include <dynarmic/A64/a64.h> #include <dynarmic/A64/exclusive_monitor.h> #include 
"common/common_types.h" +#include "common/hash.h" #include "core/arm/arm_interface.h" #include "core/arm/exclusive_monitor.h" #include "core/arm/unicorn/arm_unicorn.h" @@ -18,14 +21,14 @@ class Memory; namespace Core { -class ARM_Dynarmic_Callbacks; +class DynarmicCallbacks64; class DynarmicExclusiveMonitor; class System; -class ARM_Dynarmic final : public ARM_Interface { +class ARM_Dynarmic_64 final : public ARM_Interface { public: - ARM_Dynarmic(System& system, ExclusiveMonitor& exclusive_monitor, std::size_t core_index); - ~ARM_Dynarmic() override; + ARM_Dynarmic_64(System& system, ExclusiveMonitor& exclusive_monitor, std::size_t core_index); + ~ARM_Dynarmic_64() override; void SetPC(u64 pc) override; u64 GetPC() const override; @@ -42,8 +45,10 @@ public: void SetTPIDR_EL0(u64 value) override; u64 GetTPIDR_EL0() const override; - void SaveContext(ThreadContext& ctx) override; - void LoadContext(const ThreadContext& ctx) override; + void SaveContext(ThreadContext32& ctx) override {} + void SaveContext(ThreadContext64& ctx) override; + void LoadContext(const ThreadContext32& ctx) override {} + void LoadContext(const ThreadContext64& ctx) override; void PrepareReschedule() override; void ClearExclusiveState() override; @@ -53,12 +58,17 @@ public: std::size_t new_address_space_size_in_bits) override; private: - std::unique_ptr<Dynarmic::A64::Jit> MakeJit(Common::PageTable& page_table, + std::shared_ptr<Dynarmic::A64::Jit> MakeJit(Common::PageTable& page_table, std::size_t address_space_bits) const; - friend class ARM_Dynarmic_Callbacks; - std::unique_ptr<ARM_Dynarmic_Callbacks> cb; - std::unique_ptr<Dynarmic::A64::Jit> jit; + using JitCacheKey = std::pair<Common::PageTable*, std::size_t>; + using JitCacheType = + std::unordered_map<JitCacheKey, std::shared_ptr<Dynarmic::A64::Jit>, Common::PairHash>; + + friend class DynarmicCallbacks64; + std::unique_ptr<DynarmicCallbacks64> cb; + JitCacheType jit_cache; + std::shared_ptr<Dynarmic::A64::Jit> jit; ARM_Unicorn 
inner_unicorn; std::size_t core_index; @@ -67,7 +77,7 @@ private: class DynarmicExclusiveMonitor final : public ExclusiveMonitor { public: - explicit DynarmicExclusiveMonitor(Memory::Memory& memory_, std::size_t core_count); + explicit DynarmicExclusiveMonitor(Memory::Memory& memory, std::size_t core_count); ~DynarmicExclusiveMonitor() override; void SetExclusive(std::size_t core_index, VAddr addr) override; @@ -80,7 +90,7 @@ public: bool ExclusiveWrite128(std::size_t core_index, VAddr vaddr, u128 value) override; private: - friend class ARM_Dynarmic; + friend class ARM_Dynarmic_64; Dynarmic::A64::ExclusiveMonitor monitor; Memory::Memory& memory; }; diff --git a/src/core/arm/dynarmic/arm_dynarmic_cp15.cpp b/src/core/arm/dynarmic/arm_dynarmic_cp15.cpp new file mode 100644 index 000000000..3fdcdebde --- /dev/null +++ b/src/core/arm/dynarmic/arm_dynarmic_cp15.cpp @@ -0,0 +1,80 @@ +// Copyright 2017 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "core/arm/dynarmic/arm_dynarmic_cp15.h" + +using Callback = Dynarmic::A32::Coprocessor::Callback; +using CallbackOrAccessOneWord = Dynarmic::A32::Coprocessor::CallbackOrAccessOneWord; +using CallbackOrAccessTwoWords = Dynarmic::A32::Coprocessor::CallbackOrAccessTwoWords; + +std::optional<Callback> DynarmicCP15::CompileInternalOperation(bool two, unsigned opc1, + CoprocReg CRd, CoprocReg CRn, + CoprocReg CRm, unsigned opc2) { + return {}; +} + +CallbackOrAccessOneWord DynarmicCP15::CompileSendOneWord(bool two, unsigned opc1, CoprocReg CRn, + CoprocReg CRm, unsigned opc2) { + // TODO(merry): Privileged CP15 registers + + if (!two && CRn == CoprocReg::C7 && opc1 == 0 && CRm == CoprocReg::C5 && opc2 == 4) { + // This is a dummy write, we ignore the value written here. 
+ return &CP15[static_cast<std::size_t>(CP15Register::CP15_FLUSH_PREFETCH_BUFFER)]; + } + + if (!two && CRn == CoprocReg::C7 && opc1 == 0 && CRm == CoprocReg::C10) { + switch (opc2) { + case 4: + // This is a dummy write, we ignore the value written here. + return &CP15[static_cast<std::size_t>(CP15Register::CP15_DATA_SYNC_BARRIER)]; + case 5: + // This is a dummy write, we ignore the value written here. + return &CP15[static_cast<std::size_t>(CP15Register::CP15_DATA_MEMORY_BARRIER)]; + default: + return {}; + } + } + + if (!two && CRn == CoprocReg::C13 && opc1 == 0 && CRm == CoprocReg::C0 && opc2 == 2) { + return &CP15[static_cast<std::size_t>(CP15Register::CP15_THREAD_UPRW)]; + } + + return {}; +} + +CallbackOrAccessTwoWords DynarmicCP15::CompileSendTwoWords(bool two, unsigned opc, CoprocReg CRm) { + return {}; +} + +CallbackOrAccessOneWord DynarmicCP15::CompileGetOneWord(bool two, unsigned opc1, CoprocReg CRn, + CoprocReg CRm, unsigned opc2) { + // TODO(merry): Privileged CP15 registers + + if (!two && CRn == CoprocReg::C13 && opc1 == 0 && CRm == CoprocReg::C0) { + switch (opc2) { + case 2: + return &CP15[static_cast<std::size_t>(CP15Register::CP15_THREAD_UPRW)]; + case 3: + return &CP15[static_cast<std::size_t>(CP15Register::CP15_THREAD_URO)]; + default: + return {}; + } + } + + return {}; +} + +CallbackOrAccessTwoWords DynarmicCP15::CompileGetTwoWords(bool two, unsigned opc, CoprocReg CRm) { + return {}; +} + +std::optional<Callback> DynarmicCP15::CompileLoadWords(bool two, bool long_transfer, CoprocReg CRd, + std::optional<u8> option) { + return {}; +} + +std::optional<Callback> DynarmicCP15::CompileStoreWords(bool two, bool long_transfer, CoprocReg CRd, + std::optional<u8> option) { + return {}; +} diff --git a/src/core/arm/dynarmic/arm_dynarmic_cp15.h b/src/core/arm/dynarmic/arm_dynarmic_cp15.h new file mode 100644 index 000000000..07bcde5f9 --- /dev/null +++ b/src/core/arm/dynarmic/arm_dynarmic_cp15.h @@ -0,0 +1,152 @@ +// Copyright 2017 Citra Emulator 
Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <memory> +#include <optional> + +#include <dynarmic/A32/coprocessor.h> +#include "common/common_types.h" + +enum class CP15Register { + // c0 - Information registers + CP15_MAIN_ID, + CP15_CACHE_TYPE, + CP15_TCM_STATUS, + CP15_TLB_TYPE, + CP15_CPU_ID, + CP15_PROCESSOR_FEATURE_0, + CP15_PROCESSOR_FEATURE_1, + CP15_DEBUG_FEATURE_0, + CP15_AUXILIARY_FEATURE_0, + CP15_MEMORY_MODEL_FEATURE_0, + CP15_MEMORY_MODEL_FEATURE_1, + CP15_MEMORY_MODEL_FEATURE_2, + CP15_MEMORY_MODEL_FEATURE_3, + CP15_ISA_FEATURE_0, + CP15_ISA_FEATURE_1, + CP15_ISA_FEATURE_2, + CP15_ISA_FEATURE_3, + CP15_ISA_FEATURE_4, + + // c1 - Control registers + CP15_CONTROL, + CP15_AUXILIARY_CONTROL, + CP15_COPROCESSOR_ACCESS_CONTROL, + + // c2 - Translation table registers + CP15_TRANSLATION_BASE_TABLE_0, + CP15_TRANSLATION_BASE_TABLE_1, + CP15_TRANSLATION_BASE_CONTROL, + CP15_DOMAIN_ACCESS_CONTROL, + CP15_RESERVED, + + // c5 - Fault status registers + CP15_FAULT_STATUS, + CP15_INSTR_FAULT_STATUS, + CP15_COMBINED_DATA_FSR = CP15_FAULT_STATUS, + CP15_INST_FSR, + + // c6 - Fault Address registers + CP15_FAULT_ADDRESS, + CP15_COMBINED_DATA_FAR = CP15_FAULT_ADDRESS, + CP15_WFAR, + CP15_IFAR, + + // c7 - Cache operation registers + CP15_WAIT_FOR_INTERRUPT, + CP15_PHYS_ADDRESS, + CP15_INVALIDATE_INSTR_CACHE, + CP15_INVALIDATE_INSTR_CACHE_USING_MVA, + CP15_INVALIDATE_INSTR_CACHE_USING_INDEX, + CP15_FLUSH_PREFETCH_BUFFER, + CP15_FLUSH_BRANCH_TARGET_CACHE, + CP15_FLUSH_BRANCH_TARGET_CACHE_ENTRY, + CP15_INVALIDATE_DATA_CACHE, + CP15_INVALIDATE_DATA_CACHE_LINE_USING_MVA, + CP15_INVALIDATE_DATA_CACHE_LINE_USING_INDEX, + CP15_INVALIDATE_DATA_AND_INSTR_CACHE, + CP15_CLEAN_DATA_CACHE, + CP15_CLEAN_DATA_CACHE_LINE_USING_MVA, + CP15_CLEAN_DATA_CACHE_LINE_USING_INDEX, + CP15_DATA_SYNC_BARRIER, + CP15_DATA_MEMORY_BARRIER, + CP15_CLEAN_AND_INVALIDATE_DATA_CACHE, + 
CP15_CLEAN_AND_INVALIDATE_DATA_CACHE_LINE_USING_MVA, + CP15_CLEAN_AND_INVALIDATE_DATA_CACHE_LINE_USING_INDEX, + + // c8 - TLB operations + CP15_INVALIDATE_ITLB, + CP15_INVALIDATE_ITLB_SINGLE_ENTRY, + CP15_INVALIDATE_ITLB_ENTRY_ON_ASID_MATCH, + CP15_INVALIDATE_ITLB_ENTRY_ON_MVA, + CP15_INVALIDATE_DTLB, + CP15_INVALIDATE_DTLB_SINGLE_ENTRY, + CP15_INVALIDATE_DTLB_ENTRY_ON_ASID_MATCH, + CP15_INVALIDATE_DTLB_ENTRY_ON_MVA, + CP15_INVALIDATE_UTLB, + CP15_INVALIDATE_UTLB_SINGLE_ENTRY, + CP15_INVALIDATE_UTLB_ENTRY_ON_ASID_MATCH, + CP15_INVALIDATE_UTLB_ENTRY_ON_MVA, + + // c9 - Data cache lockdown register + CP15_DATA_CACHE_LOCKDOWN, + + // c10 - TLB/Memory map registers + CP15_TLB_LOCKDOWN, + CP15_PRIMARY_REGION_REMAP, + CP15_NORMAL_REGION_REMAP, + + // c13 - Thread related registers + CP15_PID, + CP15_CONTEXT_ID, + CP15_THREAD_UPRW, // Thread ID register - User/Privileged Read/Write + CP15_THREAD_URO, // Thread ID register - User Read Only (Privileged R/W) + CP15_THREAD_PRW, // Thread ID register - Privileged R/W only. + + // c15 - Performance and TLB lockdown registers + CP15_PERFORMANCE_MONITOR_CONTROL, + CP15_CYCLE_COUNTER, + CP15_COUNT_0, + CP15_COUNT_1, + CP15_READ_MAIN_TLB_LOCKDOWN_ENTRY, + CP15_WRITE_MAIN_TLB_LOCKDOWN_ENTRY, + CP15_MAIN_TLB_LOCKDOWN_VIRT_ADDRESS, + CP15_MAIN_TLB_LOCKDOWN_PHYS_ADDRESS, + CP15_MAIN_TLB_LOCKDOWN_ATTRIBUTE, + CP15_TLB_DEBUG_CONTROL, + + // Skyeye defined + CP15_TLB_FAULT_ADDR, + CP15_TLB_FAULT_STATUS, + + // Not an actual register. + // All registers should be defined above this. 
+ CP15_REGISTER_COUNT, +}; + +class DynarmicCP15 final : public Dynarmic::A32::Coprocessor { +public: + using CoprocReg = Dynarmic::A32::CoprocReg; + + explicit DynarmicCP15(u32* cp15) : CP15(cp15){}; + + std::optional<Callback> CompileInternalOperation(bool two, unsigned opc1, CoprocReg CRd, + CoprocReg CRn, CoprocReg CRm, + unsigned opc2) override; + CallbackOrAccessOneWord CompileSendOneWord(bool two, unsigned opc1, CoprocReg CRn, + CoprocReg CRm, unsigned opc2) override; + CallbackOrAccessTwoWords CompileSendTwoWords(bool two, unsigned opc, CoprocReg CRm) override; + CallbackOrAccessOneWord CompileGetOneWord(bool two, unsigned opc1, CoprocReg CRn, CoprocReg CRm, + unsigned opc2) override; + CallbackOrAccessTwoWords CompileGetTwoWords(bool two, unsigned opc, CoprocReg CRm) override; + std::optional<Callback> CompileLoadWords(bool two, bool long_transfer, CoprocReg CRd, + std::optional<u8> option) override; + std::optional<Callback> CompileStoreWords(bool two, bool long_transfer, CoprocReg CRd, + std::optional<u8> option) override; + +private: + u32* CP15{}; +}; diff --git a/src/core/arm/exclusive_monitor.cpp b/src/core/arm/exclusive_monitor.cpp index 94570e520..b32401e0b 100644 --- a/src/core/arm/exclusive_monitor.cpp +++ b/src/core/arm/exclusive_monitor.cpp @@ -3,7 +3,7 @@ // Refer to the license.txt file included. 
#ifdef ARCHITECTURE_x86_64 -#include "core/arm/dynarmic/arm_dynarmic.h" +#include "core/arm/dynarmic/arm_dynarmic_64.h" #endif #include "core/arm/exclusive_monitor.h" #include "core/memory.h" diff --git a/src/core/arm/unicorn/arm_unicorn.cpp b/src/core/arm/unicorn/arm_unicorn.cpp index f99ad5802..8a9800a96 100644 --- a/src/core/arm/unicorn/arm_unicorn.cpp +++ b/src/core/arm/unicorn/arm_unicorn.cpp @@ -53,7 +53,7 @@ static bool UnmappedMemoryHook(uc_engine* uc, uc_mem_type type, u64 addr, int si void* user_data) { auto* const system = static_cast<System*>(user_data); - ARM_Interface::ThreadContext ctx{}; + ARM_Interface::ThreadContext64 ctx{}; system->CurrentArmInterface().SaveContext(ctx); ASSERT_MSG(false, "Attempted to read from unmapped memory: 0x{:X}, pc=0x{:X}, lr=0x{:X}", addr, ctx.pc, ctx.cpu_registers[30]); @@ -179,7 +179,7 @@ void ARM_Unicorn::ExecuteInstructions(std::size_t num_instructions) { } Kernel::Thread* const thread = system.CurrentScheduler().GetCurrentThread(); - SaveContext(thread->GetContext()); + SaveContext(thread->GetContext64()); if (last_bkpt_hit || GDBStub::IsMemoryBreak() || GDBStub::GetCpuStepFlag()) { last_bkpt_hit = false; GDBStub::Break(); @@ -188,7 +188,7 @@ void ARM_Unicorn::ExecuteInstructions(std::size_t num_instructions) { } } -void ARM_Unicorn::SaveContext(ThreadContext& ctx) { +void ARM_Unicorn::SaveContext(ThreadContext64& ctx) { int uregs[32]; void* tregs[32]; @@ -215,7 +215,7 @@ void ARM_Unicorn::SaveContext(ThreadContext& ctx) { CHECKED(uc_reg_read_batch(uc, uregs, tregs, 32)); } -void ARM_Unicorn::LoadContext(const ThreadContext& ctx) { +void ARM_Unicorn::LoadContext(const ThreadContext64& ctx) { int uregs[32]; void* tregs[32]; diff --git a/src/core/arm/unicorn/arm_unicorn.h b/src/core/arm/unicorn/arm_unicorn.h index 3c5b155f9..f30d13cb6 100644 --- a/src/core/arm/unicorn/arm_unicorn.h +++ b/src/core/arm/unicorn/arm_unicorn.h @@ -30,8 +30,6 @@ public: void SetTlsAddress(VAddr address) override; void SetTPIDR_EL0(u64 
value) override; u64 GetTPIDR_EL0() const override; - void SaveContext(ThreadContext& ctx) override; - void LoadContext(const ThreadContext& ctx) override; void PrepareReschedule() override; void ClearExclusiveState() override; void ExecuteInstructions(std::size_t num_instructions); @@ -41,6 +39,11 @@ public: void PageTableChanged(Common::PageTable&, std::size_t) override {} void RecordBreak(GDBStub::BreakpointAddress bkpt); + void SaveContext(ThreadContext32& ctx) override {} + void SaveContext(ThreadContext64& ctx) override; + void LoadContext(const ThreadContext32& ctx) override {} + void LoadContext(const ThreadContext64& ctx) override; + private: static void InterruptHook(uc_engine* uc, u32 int_no, void* user_data); diff --git a/src/core/core.cpp b/src/core/core.cpp index 86e314c94..d1bc9340d 100644 --- a/src/core/core.cpp +++ b/src/core/core.cpp @@ -24,6 +24,7 @@ #include "core/file_sys/sdmc_factory.h" #include "core/file_sys/vfs_concat.h" #include "core/file_sys/vfs_real.h" +#include "core/frontend/scope_acquire_context.h" #include "core/gdbstub/gdbstub.h" #include "core/hardware_interrupt_manager.h" #include "core/hle/kernel/client_port.h" @@ -165,7 +166,7 @@ struct System::Impl { service_manager = std::make_shared<Service::SM::ServiceManager>(); Service::Init(service_manager, system); - GDBStub::Init(); + GDBStub::DeferStart(); renderer = VideoCore::CreateRenderer(emu_window, system); if (!renderer->Init()) { @@ -173,6 +174,7 @@ struct System::Impl { } interrupt_manager = std::make_unique<Core::Hardware::InterruptManager>(system); gpu_core = VideoCore::CreateGPU(system); + renderer->Rasterizer().SetupDirtyFlags(); is_powered_on = true; exit_lock = false; @@ -184,6 +186,8 @@ struct System::Impl { ResultStatus Load(System& system, Frontend::EmuWindow& emu_window, const std::string& filepath) { + Core::Frontend::ScopeAcquireContext acquire_context{emu_window}; + app_loader = Loader::GetLoader(GetGameFileFromPath(virtual_filesystem, filepath)); if 
(!app_loader) { LOG_CRITICAL(Core, "Failed to obtain loader for {}!", filepath); diff --git a/src/core/core_manager.cpp b/src/core/core_manager.cpp index 8eacf92dd..b6b797c80 100644 --- a/src/core/core_manager.cpp +++ b/src/core/core_manager.cpp @@ -6,9 +6,6 @@ #include <mutex> #include "common/logging/log.h" -#ifdef ARCHITECTURE_x86_64 -#include "core/arm/dynarmic/arm_dynarmic.h" -#endif #include "core/arm/exclusive_monitor.h" #include "core/arm/unicorn/arm_unicorn.h" #include "core/core.h" diff --git a/src/core/frontend/emu_window.h b/src/core/frontend/emu_window.h index 3376eedc5..5eb87fb63 100644 --- a/src/core/frontend/emu_window.h +++ b/src/core/frontend/emu_window.h @@ -26,9 +26,6 @@ public: /// Releases (dunno if this is the "right" word) the context from the caller thread virtual void DoneCurrent() = 0; - - /// Swap buffers to display the next frame - virtual void SwapBuffers() = 0; }; /** diff --git a/src/core/frontend/framebuffer_layout.cpp b/src/core/frontend/framebuffer_layout.cpp index 2dc795d56..68a0e0906 100644 --- a/src/core/frontend/framebuffer_layout.cpp +++ b/src/core/frontend/framebuffer_layout.cpp @@ -48,8 +48,8 @@ FramebufferLayout FrameLayoutFromResolutionScale(u32 res_scale) { u32 width, height; if (Settings::values.use_docked_mode) { - width = ScreenDocked::WidthDocked * res_scale; - height = ScreenDocked::HeightDocked * res_scale; + width = ScreenDocked::Width * res_scale; + height = ScreenDocked::Height * res_scale; } else { width = ScreenUndocked::Width * res_scale; height = ScreenUndocked::Height * res_scale; diff --git a/src/core/frontend/framebuffer_layout.h b/src/core/frontend/framebuffer_layout.h index 1d39c1faf..15ecfb13d 100644 --- a/src/core/frontend/framebuffer_layout.h +++ b/src/core/frontend/framebuffer_layout.h @@ -8,15 +8,15 @@ namespace Layout { -enum ScreenUndocked : u32 { - Width = 1280, - Height = 720, -}; +namespace ScreenUndocked { +constexpr u32 Width = 1280; +constexpr u32 Height = 720; +} // namespace 
ScreenUndocked -enum ScreenDocked : u32 { - WidthDocked = 1920, - HeightDocked = 1080, -}; +namespace ScreenDocked { +constexpr u32 Width = 1920; +constexpr u32 Height = 1080; +} // namespace ScreenDocked enum class AspectRatio { Default, @@ -29,6 +29,7 @@ enum class AspectRatio { struct FramebufferLayout { u32 width{ScreenUndocked::Width}; u32 height{ScreenUndocked::Height}; + bool is_srgb{}; Common::Rectangle<u32> screen; diff --git a/src/core/frontend/scope_acquire_context.cpp b/src/core/frontend/scope_acquire_context.cpp new file mode 100644 index 000000000..878c3157c --- /dev/null +++ b/src/core/frontend/scope_acquire_context.cpp @@ -0,0 +1,18 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "core/frontend/emu_window.h" +#include "core/frontend/scope_acquire_context.h" + +namespace Core::Frontend { + +ScopeAcquireContext::ScopeAcquireContext(Core::Frontend::GraphicsContext& context) + : context{context} { + context.MakeCurrent(); +} +ScopeAcquireContext::~ScopeAcquireContext() { + context.DoneCurrent(); +} + +} // namespace Core::Frontend diff --git a/src/core/frontend/scope_acquire_window_context.h b/src/core/frontend/scope_acquire_context.h index 2d9f6e825..7a65c0623 100644 --- a/src/core/frontend/scope_acquire_window_context.h +++ b/src/core/frontend/scope_acquire_context.h @@ -8,16 +8,16 @@ namespace Core::Frontend { -class EmuWindow; +class GraphicsContext; /// Helper class to acquire/release window context within a given scope -class ScopeAcquireWindowContext : NonCopyable { +class ScopeAcquireContext : NonCopyable { public: - explicit ScopeAcquireWindowContext(Core::Frontend::EmuWindow& window); - ~ScopeAcquireWindowContext(); + explicit ScopeAcquireContext(Core::Frontend::GraphicsContext& context); + ~ScopeAcquireContext(); private: - Core::Frontend::EmuWindow& emu_window; + Core::Frontend::GraphicsContext& context; }; } // namespace Core::Frontend diff 
--git a/src/core/frontend/scope_acquire_window_context.cpp b/src/core/frontend/scope_acquire_window_context.cpp deleted file mode 100644 index 3663dad17..000000000 --- a/src/core/frontend/scope_acquire_window_context.cpp +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include "core/frontend/emu_window.h" -#include "core/frontend/scope_acquire_window_context.h" - -namespace Core::Frontend { - -ScopeAcquireWindowContext::ScopeAcquireWindowContext(Core::Frontend::EmuWindow& emu_window_) - : emu_window{emu_window_} { - emu_window.MakeCurrent(); -} -ScopeAcquireWindowContext::~ScopeAcquireWindowContext() { - emu_window.DoneCurrent(); -} - -} // namespace Core::Frontend diff --git a/src/core/gdbstub/gdbstub.cpp b/src/core/gdbstub/gdbstub.cpp index 67e95999d..6d15aeed9 100644 --- a/src/core/gdbstub/gdbstub.cpp +++ b/src/core/gdbstub/gdbstub.cpp @@ -141,6 +141,7 @@ constexpr char target_xml[] = )"; int gdbserver_socket = -1; +bool defer_start = false; u8 command_buffer[GDB_BUFFER_SIZE]; u32 command_length; @@ -217,7 +218,7 @@ static u64 RegRead(std::size_t id, Kernel::Thread* thread = nullptr) { return 0; } - const auto& thread_context = thread->GetContext(); + const auto& thread_context = thread->GetContext64(); if (id < SP_REGISTER) { return thread_context.cpu_registers[id]; @@ -239,7 +240,7 @@ static void RegWrite(std::size_t id, u64 val, Kernel::Thread* thread = nullptr) return; } - auto& thread_context = thread->GetContext(); + auto& thread_context = thread->GetContext64(); if (id < SP_REGISTER) { thread_context.cpu_registers[id] = val; @@ -259,7 +260,7 @@ static u128 FpuRead(std::size_t id, Kernel::Thread* thread = nullptr) { return u128{0}; } - auto& thread_context = thread->GetContext(); + auto& thread_context = thread->GetContext64(); if (id >= UC_ARM64_REG_Q0 && id < FPCR_REGISTER) { return thread_context.vector_registers[id - UC_ARM64_REG_Q0]; @@ 
-275,7 +276,7 @@ static void FpuWrite(std::size_t id, u128 val, Kernel::Thread* thread = nullptr) return; } - auto& thread_context = thread->GetContext(); + auto& thread_context = thread->GetContext64(); if (id >= UC_ARM64_REG_Q0 && id < FPCR_REGISTER) { thread_context.vector_registers[id - UC_ARM64_REG_Q0] = val; @@ -916,7 +917,7 @@ static void WriteRegister() { // Update ARM context, skipping scheduler - no running threads at this point Core::System::GetInstance() .ArmInterface(current_core) - .LoadContext(current_thread->GetContext()); + .LoadContext(current_thread->GetContext64()); SendReply("OK"); } @@ -947,7 +948,7 @@ static void WriteRegisters() { // Update ARM context, skipping scheduler - no running threads at this point Core::System::GetInstance() .ArmInterface(current_core) - .LoadContext(current_thread->GetContext()); + .LoadContext(current_thread->GetContext64()); SendReply("OK"); } @@ -1019,7 +1020,7 @@ static void Step() { // Update ARM context, skipping scheduler - no running threads at this point Core::System::GetInstance() .ArmInterface(current_core) - .LoadContext(current_thread->GetContext()); + .LoadContext(current_thread->GetContext64()); } step_loop = true; halt_loop = true; @@ -1166,6 +1167,9 @@ static void RemoveBreakpoint() { void HandlePacket() { if (!IsConnected()) { + if (defer_start) { + ToggleServer(true); + } return; } @@ -1256,6 +1260,10 @@ void ToggleServer(bool status) { } } +void DeferStart() { + defer_start = true; +} + static void Init(u16 port) { if (!server_enabled) { // Set the halt loop to false in case the user enabled the gdbstub mid-execution. 
@@ -1341,6 +1349,7 @@ void Shutdown() { if (!server_enabled) { return; } + defer_start = false; LOG_INFO(Debug_GDBStub, "Stopping GDB ..."); if (gdbserver_socket != -1) { diff --git a/src/core/gdbstub/gdbstub.h b/src/core/gdbstub/gdbstub.h index 5a36524b2..8fe3c320b 100644 --- a/src/core/gdbstub/gdbstub.h +++ b/src/core/gdbstub/gdbstub.h @@ -43,6 +43,13 @@ void ToggleServer(bool status); /// Start the gdbstub server. void Init(); +/** + * Defer initialization of the gdbstub to the first packet processing functions. + * This avoids a case where the gdbstub thread is frozen after initialization + * and fails to respond in time to packets. + */ +void DeferStart(); + /// Stop gdbstub server. void Shutdown(); diff --git a/src/core/hle/kernel/kernel.cpp b/src/core/hle/kernel/kernel.cpp index 9232f4d7e..e47f1deed 100644 --- a/src/core/hle/kernel/kernel.cpp +++ b/src/core/hle/kernel/kernel.cpp @@ -186,6 +186,10 @@ struct KernelCore::Impl { return; } + for (auto& core : cores) { + core.SetIs64Bit(process->Is64BitProcess()); + } + system.Memory().SetCurrentPageTable(*process); } diff --git a/src/core/hle/kernel/physical_core.cpp b/src/core/hle/kernel/physical_core.cpp index 9303dd273..aa2787467 100644 --- a/src/core/hle/kernel/physical_core.cpp +++ b/src/core/hle/kernel/physical_core.cpp @@ -5,7 +5,8 @@ #include "common/logging/log.h" #include "core/arm/arm_interface.h" #ifdef ARCHITECTURE_x86_64 -#include "core/arm/dynarmic/arm_dynarmic.h" +#include "core/arm/dynarmic/arm_dynarmic_32.h" +#include "core/arm/dynarmic/arm_dynarmic_64.h" #endif #include "core/arm/exclusive_monitor.h" #include "core/arm/unicorn/arm_unicorn.h" @@ -20,13 +21,17 @@ PhysicalCore::PhysicalCore(Core::System& system, std::size_t id, Core::ExclusiveMonitor& exclusive_monitor) : core_index{id} { #ifdef ARCHITECTURE_x86_64 - arm_interface = std::make_unique<Core::ARM_Dynarmic>(system, exclusive_monitor, core_index); + arm_interface_32 = + std::make_unique<Core::ARM_Dynarmic_32>(system, exclusive_monitor, 
core_index); + arm_interface_64 = + std::make_unique<Core::ARM_Dynarmic_64>(system, exclusive_monitor, core_index); + #else arm_interface = std::make_shared<Core::ARM_Unicorn>(system); LOG_WARNING(Core, "CPU JIT requested, but Dynarmic not available"); #endif - scheduler = std::make_unique<Kernel::Scheduler>(system, *arm_interface, core_index); + scheduler = std::make_unique<Kernel::Scheduler>(system, core_index); } PhysicalCore::~PhysicalCore() = default; @@ -48,4 +53,12 @@ void PhysicalCore::Shutdown() { scheduler->Shutdown(); } +void PhysicalCore::SetIs64Bit(bool is_64_bit) { + if (is_64_bit) { + arm_interface = arm_interface_64.get(); + } else { + arm_interface = arm_interface_32.get(); + } +} + } // namespace Kernel diff --git a/src/core/hle/kernel/physical_core.h b/src/core/hle/kernel/physical_core.h index 4c32c0f1b..3269166be 100644 --- a/src/core/hle/kernel/physical_core.h +++ b/src/core/hle/kernel/physical_core.h @@ -68,10 +68,14 @@ public: return *scheduler; } + void SetIs64Bit(bool is_64_bit); + private: std::size_t core_index; - std::unique_ptr<Core::ARM_Interface> arm_interface; + std::unique_ptr<Core::ARM_Interface> arm_interface_32; + std::unique_ptr<Core::ARM_Interface> arm_interface_64; std::unique_ptr<Kernel::Scheduler> scheduler; + Core::ARM_Interface* arm_interface{}; }; } // namespace Kernel diff --git a/src/core/hle/kernel/process.cpp b/src/core/hle/kernel/process.cpp index 2fcb7326c..edc414d69 100644 --- a/src/core/hle/kernel/process.cpp +++ b/src/core/hle/kernel/process.cpp @@ -42,7 +42,8 @@ void SetupMainThread(Process& owner_process, KernelCore& kernel, u32 priority) { // Register 1 must be a handle to the main thread const Handle thread_handle = owner_process.GetHandleTable().Create(thread).Unwrap(); - thread->GetContext().cpu_registers[1] = thread_handle; + thread->GetContext32().cpu_registers[1] = thread_handle; + thread->GetContext64().cpu_registers[1] = thread_handle; // Threads by default are dormant, wake up the main thread so it 
runs when the scheduler fires thread->ResumeFromWait(); diff --git a/src/core/hle/kernel/scheduler.cpp b/src/core/hle/kernel/scheduler.cpp index c65f82fb7..1140c72a3 100644 --- a/src/core/hle/kernel/scheduler.cpp +++ b/src/core/hle/kernel/scheduler.cpp @@ -383,8 +383,8 @@ void GlobalScheduler::Unlock() { // TODO(Blinkhawk): Setup the interrupts and change context on current core. } -Scheduler::Scheduler(Core::System& system, Core::ARM_Interface& cpu_core, std::size_t core_id) - : system(system), cpu_core(cpu_core), core_id(core_id) {} +Scheduler::Scheduler(Core::System& system, std::size_t core_id) + : system{system}, core_id{core_id} {} Scheduler::~Scheduler() = default; @@ -422,9 +422,10 @@ void Scheduler::UnloadThread() { // Save context for previous thread if (previous_thread) { - cpu_core.SaveContext(previous_thread->GetContext()); + system.ArmInterface(core_id).SaveContext(previous_thread->GetContext32()); + system.ArmInterface(core_id).SaveContext(previous_thread->GetContext64()); // Save the TPIDR_EL0 system register in case it was modified. - previous_thread->SetTPIDR_EL0(cpu_core.GetTPIDR_EL0()); + previous_thread->SetTPIDR_EL0(system.ArmInterface(core_id).GetTPIDR_EL0()); if (previous_thread->GetStatus() == ThreadStatus::Running) { // This is only the case when a reschedule is triggered without the current thread @@ -451,9 +452,10 @@ void Scheduler::SwitchContext() { // Save context for previous thread if (previous_thread) { - cpu_core.SaveContext(previous_thread->GetContext()); + system.ArmInterface(core_id).SaveContext(previous_thread->GetContext32()); + system.ArmInterface(core_id).SaveContext(previous_thread->GetContext64()); // Save the TPIDR_EL0 system register in case it was modified. 
- previous_thread->SetTPIDR_EL0(cpu_core.GetTPIDR_EL0()); + previous_thread->SetTPIDR_EL0(system.ArmInterface(core_id).GetTPIDR_EL0()); if (previous_thread->GetStatus() == ThreadStatus::Running) { // This is only the case when a reschedule is triggered without the current thread @@ -481,9 +483,10 @@ void Scheduler::SwitchContext() { system.Kernel().MakeCurrentProcess(thread_owner_process); } - cpu_core.LoadContext(new_thread->GetContext()); - cpu_core.SetTlsAddress(new_thread->GetTLSAddress()); - cpu_core.SetTPIDR_EL0(new_thread->GetTPIDR_EL0()); + system.ArmInterface(core_id).LoadContext(new_thread->GetContext32()); + system.ArmInterface(core_id).LoadContext(new_thread->GetContext64()); + system.ArmInterface(core_id).SetTlsAddress(new_thread->GetTLSAddress()); + system.ArmInterface(core_id).SetTPIDR_EL0(new_thread->GetTPIDR_EL0()); } else { current_thread = nullptr; // Note: We do not reset the current process and current page table when idling because diff --git a/src/core/hle/kernel/scheduler.h b/src/core/hle/kernel/scheduler.h index 1c93a838c..07df33f9c 100644 --- a/src/core/hle/kernel/scheduler.h +++ b/src/core/hle/kernel/scheduler.h @@ -181,7 +181,7 @@ private: class Scheduler final { public: - explicit Scheduler(Core::System& system, Core::ARM_Interface& cpu_core, std::size_t core_id); + explicit Scheduler(Core::System& system, std::size_t core_id); ~Scheduler(); /// Returns whether there are any threads that are ready to run. 
@@ -235,7 +235,6 @@ private: std::shared_ptr<Thread> selected_thread = nullptr; Core::System& system; - Core::ARM_Interface& cpu_core; u64 last_context_switch_time = 0; u64 idle_selection_count = 0; const std::size_t core_id; diff --git a/src/core/hle/kernel/svc.cpp b/src/core/hle/kernel/svc.cpp index fd91779a3..4ffc113c2 100644 --- a/src/core/hle/kernel/svc.cpp +++ b/src/core/hle/kernel/svc.cpp @@ -187,6 +187,13 @@ static ResultCode SetHeapSize(Core::System& system, VAddr* heap_addr, u64 heap_s return RESULT_SUCCESS; } +static ResultCode SetHeapSize32(Core::System& system, u32* heap_addr, u32 heap_size) { + VAddr temp_heap_addr{}; + const ResultCode result{SetHeapSize(system, &temp_heap_addr, heap_size)}; + *heap_addr = static_cast<u32>(temp_heap_addr); + return result; +} + static ResultCode SetMemoryPermission(Core::System& system, VAddr addr, u64 size, u32 prot) { LOG_TRACE(Kernel_SVC, "called, addr=0x{:X}, size=0x{:X}, prot=0x{:X}", addr, size, prot); @@ -371,6 +378,12 @@ static ResultCode ConnectToNamedPort(Core::System& system, Handle* out_handle, return RESULT_SUCCESS; } +static ResultCode ConnectToNamedPort32(Core::System& system, Handle* out_handle, + u32 port_name_address) { + + return ConnectToNamedPort(system, out_handle, port_name_address); +} + /// Makes a blocking IPC call to an OS service. static ResultCode SendSyncRequest(Core::System& system, Handle handle) { const auto& handle_table = system.Kernel().CurrentProcess()->GetHandleTable(); @@ -390,6 +403,10 @@ static ResultCode SendSyncRequest(Core::System& system, Handle handle) { return session->SendSyncRequest(SharedFrom(thread), system.Memory()); } +static ResultCode SendSyncRequest32(Core::System& system, Handle handle) { + return SendSyncRequest(system, handle); +} + /// Get the ID for the specified thread. 
static ResultCode GetThreadId(Core::System& system, u64* thread_id, Handle thread_handle) { LOG_TRACE(Kernel_SVC, "called thread=0x{:08X}", thread_handle); @@ -405,6 +422,17 @@ static ResultCode GetThreadId(Core::System& system, u64* thread_id, Handle threa return RESULT_SUCCESS; } +static ResultCode GetThreadId32(Core::System& system, u32* thread_id_low, u32* thread_id_high, + Handle thread_handle) { + u64 thread_id{}; + const ResultCode result{GetThreadId(system, &thread_id, thread_handle)}; + + *thread_id_low = static_cast<u32>(thread_id >> 32); + *thread_id_high = static_cast<u32>(thread_id & std::numeric_limits<u32>::max()); + + return result; +} + /// Gets the ID of the specified process or a specified thread's owning process. static ResultCode GetProcessId(Core::System& system, u64* process_id, Handle handle) { LOG_DEBUG(Kernel_SVC, "called handle=0x{:08X}", handle); @@ -479,6 +507,12 @@ static ResultCode WaitSynchronization(Core::System& system, Handle* index, VAddr return result; } +static ResultCode WaitSynchronization32(Core::System& system, u32 timeout_low, u32 handles_address, + s32 handle_count, u32 timeout_high, Handle* index) { + const s64 nano_seconds{(static_cast<s64>(timeout_high) << 32) | static_cast<s64>(timeout_low)}; + return WaitSynchronization(system, index, handles_address, handle_count, nano_seconds); +} + /// Resumes a thread waiting on WaitSynchronization static ResultCode CancelSynchronization(Core::System& system, Handle thread_handle) { LOG_TRACE(Kernel_SVC, "called thread=0x{:X}", thread_handle); @@ -917,6 +951,18 @@ static ResultCode GetInfo(Core::System& system, u64* result, u64 info_id, u64 ha } } +static ResultCode GetInfo32(Core::System& system, u32* result_low, u32* result_high, u32 sub_id_low, + u32 info_id, u32 handle, u32 sub_id_high) { + const u64 sub_id{static_cast<u64>(sub_id_low | (static_cast<u64>(sub_id_high) << 32))}; + u64 res_value{}; + + const ResultCode result{GetInfo(system, &res_value, info_id, handle, 
sub_id)}; + *result_high = static_cast<u32>(res_value >> 32); + *result_low = static_cast<u32>(res_value & std::numeric_limits<u32>::max()); + + return result; +} + /// Maps memory at a desired address static ResultCode MapPhysicalMemory(Core::System& system, VAddr addr, u64 size) { LOG_DEBUG(Kernel_SVC, "called, addr=0x{:016X}, size=0x{:X}", addr, size); @@ -1058,7 +1104,7 @@ static ResultCode GetThreadContext(Core::System& system, VAddr thread_context, H return ERR_BUSY; } - Core::ARM_Interface::ThreadContext ctx = thread->GetContext(); + Core::ARM_Interface::ThreadContext64 ctx = thread->GetContext64(); // Mask away mode bits, interrupt bits, IL bit, and other reserved bits. ctx.pstate &= 0xFF0FFE20; @@ -1088,6 +1134,10 @@ static ResultCode GetThreadPriority(Core::System& system, u32* priority, Handle return RESULT_SUCCESS; } +static ResultCode GetThreadPriority32(Core::System& system, u32* priority, Handle handle) { + return GetThreadPriority(system, priority, handle); +} + /// Sets the priority for the specified thread static ResultCode SetThreadPriority(Core::System& system, Handle handle, u32 priority) { LOG_TRACE(Kernel_SVC, "called"); @@ -1259,6 +1309,11 @@ static ResultCode QueryMemory(Core::System& system, VAddr memory_info_address, query_address); } +static ResultCode QueryMemory32(Core::System& system, u32 memory_info_address, + u32 page_info_address, u32 query_address) { + return QueryMemory(system, memory_info_address, page_info_address, query_address); +} + static ResultCode MapProcessCodeMemory(Core::System& system, Handle process_handle, u64 dst_address, u64 src_address, u64 size) { LOG_DEBUG(Kernel_SVC, @@ -1675,6 +1730,10 @@ static void SignalProcessWideKey(Core::System& system, VAddr condition_variable_ } } +static void SignalProcessWideKey32(Core::System& system, u32 condition_variable_addr, s32 target) { + SignalProcessWideKey(system, condition_variable_addr, target); +} + // Wait for an address (via Address Arbiter) static ResultCode 
WaitForAddress(Core::System& system, VAddr address, u32 type, s32 value, s64 timeout) { @@ -1760,6 +1819,10 @@ static ResultCode CloseHandle(Core::System& system, Handle handle) { return handle_table.Close(handle); } +static ResultCode CloseHandle32(Core::System& system, Handle handle) { + return CloseHandle(system, handle); +} + /// Clears the signaled state of an event or process. static ResultCode ResetSignal(Core::System& system, Handle handle) { LOG_DEBUG(Kernel_SVC, "called handle 0x{:08X}", handle); @@ -2317,69 +2380,196 @@ struct FunctionDef { }; } // namespace -static const FunctionDef SVC_Table[] = { +static const FunctionDef SVC_Table_32[] = { {0x00, nullptr, "Unknown"}, - {0x01, SvcWrap<SetHeapSize>, "SetHeapSize"}, - {0x02, SvcWrap<SetMemoryPermission>, "SetMemoryPermission"}, - {0x03, SvcWrap<SetMemoryAttribute>, "SetMemoryAttribute"}, - {0x04, SvcWrap<MapMemory>, "MapMemory"}, - {0x05, SvcWrap<UnmapMemory>, "UnmapMemory"}, - {0x06, SvcWrap<QueryMemory>, "QueryMemory"}, - {0x07, SvcWrap<ExitProcess>, "ExitProcess"}, - {0x08, SvcWrap<CreateThread>, "CreateThread"}, - {0x09, SvcWrap<StartThread>, "StartThread"}, - {0x0A, SvcWrap<ExitThread>, "ExitThread"}, - {0x0B, SvcWrap<SleepThread>, "SleepThread"}, - {0x0C, SvcWrap<GetThreadPriority>, "GetThreadPriority"}, - {0x0D, SvcWrap<SetThreadPriority>, "SetThreadPriority"}, - {0x0E, SvcWrap<GetThreadCoreMask>, "GetThreadCoreMask"}, - {0x0F, SvcWrap<SetThreadCoreMask>, "SetThreadCoreMask"}, - {0x10, SvcWrap<GetCurrentProcessorNumber>, "GetCurrentProcessorNumber"}, - {0x11, SvcWrap<SignalEvent>, "SignalEvent"}, - {0x12, SvcWrap<ClearEvent>, "ClearEvent"}, - {0x13, SvcWrap<MapSharedMemory>, "MapSharedMemory"}, - {0x14, SvcWrap<UnmapSharedMemory>, "UnmapSharedMemory"}, - {0x15, SvcWrap<CreateTransferMemory>, "CreateTransferMemory"}, - {0x16, SvcWrap<CloseHandle>, "CloseHandle"}, - {0x17, SvcWrap<ResetSignal>, "ResetSignal"}, - {0x18, SvcWrap<WaitSynchronization>, "WaitSynchronization"}, - {0x19, 
SvcWrap<CancelSynchronization>, "CancelSynchronization"}, - {0x1A, SvcWrap<ArbitrateLock>, "ArbitrateLock"}, - {0x1B, SvcWrap<ArbitrateUnlock>, "ArbitrateUnlock"}, - {0x1C, SvcWrap<WaitProcessWideKeyAtomic>, "WaitProcessWideKeyAtomic"}, - {0x1D, SvcWrap<SignalProcessWideKey>, "SignalProcessWideKey"}, - {0x1E, SvcWrap<GetSystemTick>, "GetSystemTick"}, - {0x1F, SvcWrap<ConnectToNamedPort>, "ConnectToNamedPort"}, + {0x01, SvcWrap32<SetHeapSize32>, "SetHeapSize32"}, + {0x02, nullptr, "Unknown"}, + {0x03, nullptr, "SetMemoryAttribute32"}, + {0x04, nullptr, "MapMemory32"}, + {0x05, nullptr, "UnmapMemory32"}, + {0x06, SvcWrap32<QueryMemory32>, "QueryMemory32"}, + {0x07, nullptr, "ExitProcess32"}, + {0x08, nullptr, "CreateThread32"}, + {0x09, nullptr, "StartThread32"}, + {0x0a, nullptr, "ExitThread32"}, + {0x0b, nullptr, "SleepThread32"}, + {0x0c, SvcWrap32<GetThreadPriority32>, "GetThreadPriority32"}, + {0x0d, nullptr, "SetThreadPriority32"}, + {0x0e, nullptr, "GetThreadCoreMask32"}, + {0x0f, nullptr, "SetThreadCoreMask32"}, + {0x10, nullptr, "GetCurrentProcessorNumber32"}, + {0x11, nullptr, "SignalEvent32"}, + {0x12, nullptr, "ClearEvent32"}, + {0x13, nullptr, "MapSharedMemory32"}, + {0x14, nullptr, "UnmapSharedMemory32"}, + {0x15, nullptr, "CreateTransferMemory32"}, + {0x16, SvcWrap32<CloseHandle32>, "CloseHandle32"}, + {0x17, nullptr, "ResetSignal32"}, + {0x18, SvcWrap32<WaitSynchronization32>, "WaitSynchronization32"}, + {0x19, nullptr, "CancelSynchronization32"}, + {0x1a, nullptr, "ArbitrateLock32"}, + {0x1b, nullptr, "ArbitrateUnlock32"}, + {0x1c, nullptr, "WaitProcessWideKeyAtomic32"}, + {0x1d, SvcWrap32<SignalProcessWideKey32>, "SignalProcessWideKey32"}, + {0x1e, nullptr, "GetSystemTick32"}, + {0x1f, SvcWrap32<ConnectToNamedPort32>, "ConnectToNamedPort32"}, + {0x20, nullptr, "Unknown"}, + {0x21, SvcWrap32<SendSyncRequest32>, "SendSyncRequest32"}, + {0x22, nullptr, "SendSyncRequestWithUserBuffer32"}, + {0x23, nullptr, "Unknown"}, + {0x24, nullptr, 
"GetProcessId32"}, + {0x25, SvcWrap32<GetThreadId32>, "GetThreadId32"}, + {0x26, nullptr, "Break32"}, + {0x27, nullptr, "OutputDebugString32"}, + {0x28, nullptr, "Unknown"}, + {0x29, SvcWrap32<GetInfo32>, "GetInfo32"}, + {0x2a, nullptr, "Unknown"}, + {0x2b, nullptr, "Unknown"}, + {0x2c, nullptr, "MapPhysicalMemory32"}, + {0x2d, nullptr, "UnmapPhysicalMemory32"}, + {0x2e, nullptr, "Unknown"}, + {0x2f, nullptr, "Unknown"}, + {0x30, nullptr, "Unknown"}, + {0x31, nullptr, "Unknown"}, + {0x32, nullptr, "SetThreadActivity32"}, + {0x33, nullptr, "GetThreadContext32"}, + {0x34, nullptr, "WaitForAddress32"}, + {0x35, nullptr, "SignalToAddress32"}, + {0x36, nullptr, "Unknown"}, + {0x37, nullptr, "Unknown"}, + {0x38, nullptr, "Unknown"}, + {0x39, nullptr, "Unknown"}, + {0x3a, nullptr, "Unknown"}, + {0x3b, nullptr, "Unknown"}, + {0x3c, nullptr, "Unknown"}, + {0x3d, nullptr, "Unknown"}, + {0x3e, nullptr, "Unknown"}, + {0x3f, nullptr, "Unknown"}, + {0x40, nullptr, "CreateSession32"}, + {0x41, nullptr, "AcceptSession32"}, + {0x42, nullptr, "Unknown"}, + {0x43, nullptr, "ReplyAndReceive32"}, + {0x44, nullptr, "Unknown"}, + {0x45, nullptr, "CreateEvent32"}, + {0x46, nullptr, "Unknown"}, + {0x47, nullptr, "Unknown"}, + {0x48, nullptr, "Unknown"}, + {0x49, nullptr, "Unknown"}, + {0x4a, nullptr, "Unknown"}, + {0x4b, nullptr, "Unknown"}, + {0x4c, nullptr, "Unknown"}, + {0x4d, nullptr, "Unknown"}, + {0x4e, nullptr, "Unknown"}, + {0x4f, nullptr, "Unknown"}, + {0x50, nullptr, "Unknown"}, + {0x51, nullptr, "Unknown"}, + {0x52, nullptr, "Unknown"}, + {0x53, nullptr, "Unknown"}, + {0x54, nullptr, "Unknown"}, + {0x55, nullptr, "Unknown"}, + {0x56, nullptr, "Unknown"}, + {0x57, nullptr, "Unknown"}, + {0x58, nullptr, "Unknown"}, + {0x59, nullptr, "Unknown"}, + {0x5a, nullptr, "Unknown"}, + {0x5b, nullptr, "Unknown"}, + {0x5c, nullptr, "Unknown"}, + {0x5d, nullptr, "Unknown"}, + {0x5e, nullptr, "Unknown"}, + {0x5F, nullptr, "FlushProcessDataCache32"}, + {0x60, nullptr, "Unknown"}, + {0x61, 
nullptr, "Unknown"}, + {0x62, nullptr, "Unknown"}, + {0x63, nullptr, "Unknown"}, + {0x64, nullptr, "Unknown"}, + {0x65, nullptr, "GetProcessList32"}, + {0x66, nullptr, "Unknown"}, + {0x67, nullptr, "Unknown"}, + {0x68, nullptr, "Unknown"}, + {0x69, nullptr, "Unknown"}, + {0x6A, nullptr, "Unknown"}, + {0x6B, nullptr, "Unknown"}, + {0x6C, nullptr, "Unknown"}, + {0x6D, nullptr, "Unknown"}, + {0x6E, nullptr, "Unknown"}, + {0x6f, nullptr, "GetSystemInfo32"}, + {0x70, nullptr, "CreatePort32"}, + {0x71, nullptr, "ManageNamedPort32"}, + {0x72, nullptr, "ConnectToPort32"}, + {0x73, nullptr, "SetProcessMemoryPermission32"}, + {0x74, nullptr, "Unknown"}, + {0x75, nullptr, "Unknown"}, + {0x76, nullptr, "Unknown"}, + {0x77, nullptr, "MapProcessCodeMemory32"}, + {0x78, nullptr, "UnmapProcessCodeMemory32"}, + {0x79, nullptr, "Unknown"}, + {0x7A, nullptr, "Unknown"}, + {0x7B, nullptr, "TerminateProcess32"}, +}; + +static const FunctionDef SVC_Table_64[] = { + {0x00, nullptr, "Unknown"}, + {0x01, SvcWrap64<SetHeapSize>, "SetHeapSize"}, + {0x02, SvcWrap64<SetMemoryPermission>, "SetMemoryPermission"}, + {0x03, SvcWrap64<SetMemoryAttribute>, "SetMemoryAttribute"}, + {0x04, SvcWrap64<MapMemory>, "MapMemory"}, + {0x05, SvcWrap64<UnmapMemory>, "UnmapMemory"}, + {0x06, SvcWrap64<QueryMemory>, "QueryMemory"}, + {0x07, SvcWrap64<ExitProcess>, "ExitProcess"}, + {0x08, SvcWrap64<CreateThread>, "CreateThread"}, + {0x09, SvcWrap64<StartThread>, "StartThread"}, + {0x0A, SvcWrap64<ExitThread>, "ExitThread"}, + {0x0B, SvcWrap64<SleepThread>, "SleepThread"}, + {0x0C, SvcWrap64<GetThreadPriority>, "GetThreadPriority"}, + {0x0D, SvcWrap64<SetThreadPriority>, "SetThreadPriority"}, + {0x0E, SvcWrap64<GetThreadCoreMask>, "GetThreadCoreMask"}, + {0x0F, SvcWrap64<SetThreadCoreMask>, "SetThreadCoreMask"}, + {0x10, SvcWrap64<GetCurrentProcessorNumber>, "GetCurrentProcessorNumber"}, + {0x11, SvcWrap64<SignalEvent>, "SignalEvent"}, + {0x12, SvcWrap64<ClearEvent>, "ClearEvent"}, + {0x13, 
SvcWrap64<MapSharedMemory>, "MapSharedMemory"}, + {0x14, SvcWrap64<UnmapSharedMemory>, "UnmapSharedMemory"}, + {0x15, SvcWrap64<CreateTransferMemory>, "CreateTransferMemory"}, + {0x16, SvcWrap64<CloseHandle>, "CloseHandle"}, + {0x17, SvcWrap64<ResetSignal>, "ResetSignal"}, + {0x18, SvcWrap64<WaitSynchronization>, "WaitSynchronization"}, + {0x19, SvcWrap64<CancelSynchronization>, "CancelSynchronization"}, + {0x1A, SvcWrap64<ArbitrateLock>, "ArbitrateLock"}, + {0x1B, SvcWrap64<ArbitrateUnlock>, "ArbitrateUnlock"}, + {0x1C, SvcWrap64<WaitProcessWideKeyAtomic>, "WaitProcessWideKeyAtomic"}, + {0x1D, SvcWrap64<SignalProcessWideKey>, "SignalProcessWideKey"}, + {0x1E, SvcWrap64<GetSystemTick>, "GetSystemTick"}, + {0x1F, SvcWrap64<ConnectToNamedPort>, "ConnectToNamedPort"}, {0x20, nullptr, "SendSyncRequestLight"}, - {0x21, SvcWrap<SendSyncRequest>, "SendSyncRequest"}, + {0x21, SvcWrap64<SendSyncRequest>, "SendSyncRequest"}, {0x22, nullptr, "SendSyncRequestWithUserBuffer"}, {0x23, nullptr, "SendAsyncRequestWithUserBuffer"}, - {0x24, SvcWrap<GetProcessId>, "GetProcessId"}, - {0x25, SvcWrap<GetThreadId>, "GetThreadId"}, - {0x26, SvcWrap<Break>, "Break"}, - {0x27, SvcWrap<OutputDebugString>, "OutputDebugString"}, + {0x24, SvcWrap64<GetProcessId>, "GetProcessId"}, + {0x25, SvcWrap64<GetThreadId>, "GetThreadId"}, + {0x26, SvcWrap64<Break>, "Break"}, + {0x27, SvcWrap64<OutputDebugString>, "OutputDebugString"}, {0x28, nullptr, "ReturnFromException"}, - {0x29, SvcWrap<GetInfo>, "GetInfo"}, + {0x29, SvcWrap64<GetInfo>, "GetInfo"}, {0x2A, nullptr, "FlushEntireDataCache"}, {0x2B, nullptr, "FlushDataCache"}, - {0x2C, SvcWrap<MapPhysicalMemory>, "MapPhysicalMemory"}, - {0x2D, SvcWrap<UnmapPhysicalMemory>, "UnmapPhysicalMemory"}, + {0x2C, SvcWrap64<MapPhysicalMemory>, "MapPhysicalMemory"}, + {0x2D, SvcWrap64<UnmapPhysicalMemory>, "UnmapPhysicalMemory"}, {0x2E, nullptr, "GetFutureThreadInfo"}, {0x2F, nullptr, "GetLastThreadInfo"}, - {0x30, SvcWrap<GetResourceLimitLimitValue>, 
"GetResourceLimitLimitValue"}, - {0x31, SvcWrap<GetResourceLimitCurrentValue>, "GetResourceLimitCurrentValue"}, - {0x32, SvcWrap<SetThreadActivity>, "SetThreadActivity"}, - {0x33, SvcWrap<GetThreadContext>, "GetThreadContext"}, - {0x34, SvcWrap<WaitForAddress>, "WaitForAddress"}, - {0x35, SvcWrap<SignalToAddress>, "SignalToAddress"}, + {0x30, SvcWrap64<GetResourceLimitLimitValue>, "GetResourceLimitLimitValue"}, + {0x31, SvcWrap64<GetResourceLimitCurrentValue>, "GetResourceLimitCurrentValue"}, + {0x32, SvcWrap64<SetThreadActivity>, "SetThreadActivity"}, + {0x33, SvcWrap64<GetThreadContext>, "GetThreadContext"}, + {0x34, SvcWrap64<WaitForAddress>, "WaitForAddress"}, + {0x35, SvcWrap64<SignalToAddress>, "SignalToAddress"}, {0x36, nullptr, "SynchronizePreemptionState"}, {0x37, nullptr, "Unknown"}, {0x38, nullptr, "Unknown"}, {0x39, nullptr, "Unknown"}, {0x3A, nullptr, "Unknown"}, {0x3B, nullptr, "Unknown"}, - {0x3C, SvcWrap<KernelDebug>, "KernelDebug"}, - {0x3D, SvcWrap<ChangeKernelTraceState>, "ChangeKernelTraceState"}, + {0x3C, SvcWrap64<KernelDebug>, "KernelDebug"}, + {0x3D, SvcWrap64<ChangeKernelTraceState>, "ChangeKernelTraceState"}, {0x3E, nullptr, "Unknown"}, {0x3F, nullptr, "Unknown"}, {0x40, nullptr, "CreateSession"}, @@ -2387,7 +2577,7 @@ static const FunctionDef SVC_Table[] = { {0x42, nullptr, "ReplyAndReceiveLight"}, {0x43, nullptr, "ReplyAndReceive"}, {0x44, nullptr, "ReplyAndReceiveWithUserBuffer"}, - {0x45, SvcWrap<CreateEvent>, "CreateEvent"}, + {0x45, SvcWrap64<CreateEvent>, "CreateEvent"}, {0x46, nullptr, "Unknown"}, {0x47, nullptr, "Unknown"}, {0x48, nullptr, "MapPhysicalMemoryUnsafe"}, @@ -2398,9 +2588,9 @@ static const FunctionDef SVC_Table[] = { {0x4D, nullptr, "SleepSystem"}, {0x4E, nullptr, "ReadWriteRegister"}, {0x4F, nullptr, "SetProcessActivity"}, - {0x50, SvcWrap<CreateSharedMemory>, "CreateSharedMemory"}, - {0x51, SvcWrap<MapTransferMemory>, "MapTransferMemory"}, - {0x52, SvcWrap<UnmapTransferMemory>, "UnmapTransferMemory"}, + {0x50, 
SvcWrap64<CreateSharedMemory>, "CreateSharedMemory"}, + {0x51, SvcWrap64<MapTransferMemory>, "MapTransferMemory"}, + {0x52, SvcWrap64<UnmapTransferMemory>, "UnmapTransferMemory"}, {0x53, nullptr, "CreateInterruptEvent"}, {0x54, nullptr, "QueryPhysicalAddress"}, {0x55, nullptr, "QueryIoMapping"}, @@ -2419,8 +2609,8 @@ static const FunctionDef SVC_Table[] = { {0x62, nullptr, "TerminateDebugProcess"}, {0x63, nullptr, "GetDebugEvent"}, {0x64, nullptr, "ContinueDebugEvent"}, - {0x65, SvcWrap<GetProcessList>, "GetProcessList"}, - {0x66, SvcWrap<GetThreadList>, "GetThreadList"}, + {0x65, SvcWrap64<GetProcessList>, "GetProcessList"}, + {0x66, SvcWrap64<GetThreadList>, "GetThreadList"}, {0x67, nullptr, "GetDebugThreadContext"}, {0x68, nullptr, "SetDebugThreadContext"}, {0x69, nullptr, "QueryDebugProcessMemory"}, @@ -2436,24 +2626,32 @@ static const FunctionDef SVC_Table[] = { {0x73, nullptr, "SetProcessMemoryPermission"}, {0x74, nullptr, "MapProcessMemory"}, {0x75, nullptr, "UnmapProcessMemory"}, - {0x76, SvcWrap<QueryProcessMemory>, "QueryProcessMemory"}, - {0x77, SvcWrap<MapProcessCodeMemory>, "MapProcessCodeMemory"}, - {0x78, SvcWrap<UnmapProcessCodeMemory>, "UnmapProcessCodeMemory"}, + {0x76, SvcWrap64<QueryProcessMemory>, "QueryProcessMemory"}, + {0x77, SvcWrap64<MapProcessCodeMemory>, "MapProcessCodeMemory"}, + {0x78, SvcWrap64<UnmapProcessCodeMemory>, "UnmapProcessCodeMemory"}, {0x79, nullptr, "CreateProcess"}, {0x7A, nullptr, "StartProcess"}, {0x7B, nullptr, "TerminateProcess"}, - {0x7C, SvcWrap<GetProcessInfo>, "GetProcessInfo"}, - {0x7D, SvcWrap<CreateResourceLimit>, "CreateResourceLimit"}, - {0x7E, SvcWrap<SetResourceLimitLimitValue>, "SetResourceLimitLimitValue"}, + {0x7C, SvcWrap64<GetProcessInfo>, "GetProcessInfo"}, + {0x7D, SvcWrap64<CreateResourceLimit>, "CreateResourceLimit"}, + {0x7E, SvcWrap64<SetResourceLimitLimitValue>, "SetResourceLimitLimitValue"}, {0x7F, nullptr, "CallSecureMonitor"}, }; -static const FunctionDef* GetSVCInfo(u32 func_num) { - if 
(func_num >= std::size(SVC_Table)) { +static const FunctionDef* GetSVCInfo32(u32 func_num) { + if (func_num >= std::size(SVC_Table_32)) { + LOG_ERROR(Kernel_SVC, "Unknown svc=0x{:02X}", func_num); + return nullptr; + } + return &SVC_Table_32[func_num]; +} + +static const FunctionDef* GetSVCInfo64(u32 func_num) { + if (func_num >= std::size(SVC_Table_64)) { LOG_ERROR(Kernel_SVC, "Unknown svc=0x{:02X}", func_num); return nullptr; } - return &SVC_Table[func_num]; + return &SVC_Table_64[func_num]; } MICROPROFILE_DEFINE(Kernel_SVC, "Kernel", "SVC", MP_RGB(70, 200, 70)); @@ -2464,7 +2662,8 @@ void CallSVC(Core::System& system, u32 immediate) { // Lock the global kernel mutex when we enter the kernel HLE. std::lock_guard lock{HLE::g_hle_lock}; - const FunctionDef* info = GetSVCInfo(immediate); + const FunctionDef* info = system.CurrentProcess()->Is64BitProcess() ? GetSVCInfo64(immediate) + : GetSVCInfo32(immediate); if (info) { if (info->func) { info->func(system); diff --git a/src/core/hle/kernel/svc_wrap.h b/src/core/hle/kernel/svc_wrap.h index 29a2cfa9d..7d735e3fa 100644 --- a/src/core/hle/kernel/svc_wrap.h +++ b/src/core/hle/kernel/svc_wrap.h @@ -15,6 +15,10 @@ static inline u64 Param(const Core::System& system, int n) { return system.CurrentArmInterface().GetReg(n); } +static inline u32 Param32(const Core::System& system, int n) { + return static_cast<u32>(system.CurrentArmInterface().GetReg(n)); +} + /** * HLE a function return from the current ARM userland process * @param system System context @@ -24,40 +28,44 @@ static inline void FuncReturn(Core::System& system, u64 result) { system.CurrentArmInterface().SetReg(0, result); } +static inline void FuncReturn32(Core::System& system, u32 result) { + system.CurrentArmInterface().SetReg(0, (u64)result); +} + //////////////////////////////////////////////////////////////////////////////////////////////////// // Function wrappers that return type ResultCode template <ResultCode func(Core::System&, u64)> -void 
SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { FuncReturn(system, func(system, Param(system, 0)).raw); } template <ResultCode func(Core::System&, u64, u64)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { FuncReturn(system, func(system, Param(system, 0), Param(system, 1)).raw); } template <ResultCode func(Core::System&, u32)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { FuncReturn(system, func(system, static_cast<u32>(Param(system, 0))).raw); } template <ResultCode func(Core::System&, u32, u32)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { FuncReturn( system, func(system, static_cast<u32>(Param(system, 0)), static_cast<u32>(Param(system, 1))).raw); } template <ResultCode func(Core::System&, u32, u64, u64, u64)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { FuncReturn(system, func(system, static_cast<u32>(Param(system, 0)), Param(system, 1), Param(system, 2), Param(system, 3)) .raw); } template <ResultCode func(Core::System&, u32*)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { u32 param = 0; const u32 retval = func(system, ¶m).raw; system.CurrentArmInterface().SetReg(1, param); @@ -65,7 +73,7 @@ void SvcWrap(Core::System& system) { } template <ResultCode func(Core::System&, u32*, u32)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { u32 param_1 = 0; const u32 retval = func(system, ¶m_1, static_cast<u32>(Param(system, 1))).raw; system.CurrentArmInterface().SetReg(1, param_1); @@ -73,7 +81,7 @@ void SvcWrap(Core::System& system) { } template <ResultCode func(Core::System&, u32*, u32*)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { u32 param_1 = 0; u32 param_2 = 0; const u32 retval = func(system, ¶m_1, ¶m_2).raw; @@ -86,7 +94,7 @@ void SvcWrap(Core::System& system) { } template <ResultCode func(Core::System&, u32*, u64)> 
-void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { u32 param_1 = 0; const u32 retval = func(system, ¶m_1, Param(system, 1)).raw; system.CurrentArmInterface().SetReg(1, param_1); @@ -94,7 +102,7 @@ void SvcWrap(Core::System& system) { } template <ResultCode func(Core::System&, u32*, u64, u32)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { u32 param_1 = 0; const u32 retval = func(system, ¶m_1, Param(system, 1), static_cast<u32>(Param(system, 2))).raw; @@ -104,7 +112,7 @@ void SvcWrap(Core::System& system) { } template <ResultCode func(Core::System&, u64*, u32)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { u64 param_1 = 0; const u32 retval = func(system, ¶m_1, static_cast<u32>(Param(system, 1))).raw; @@ -113,12 +121,12 @@ void SvcWrap(Core::System& system) { } template <ResultCode func(Core::System&, u64, u32)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { FuncReturn(system, func(system, Param(system, 0), static_cast<u32>(Param(system, 1))).raw); } template <ResultCode func(Core::System&, u64*, u64)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { u64 param_1 = 0; const u32 retval = func(system, ¶m_1, Param(system, 1)).raw; @@ -127,7 +135,7 @@ void SvcWrap(Core::System& system) { } template <ResultCode func(Core::System&, u64*, u32, u32)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { u64 param_1 = 0; const u32 retval = func(system, ¶m_1, static_cast<u32>(Param(system, 1)), static_cast<u32>(Param(system, 2))) @@ -138,19 +146,19 @@ void SvcWrap(Core::System& system) { } template <ResultCode func(Core::System&, u32, u64)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { FuncReturn(system, func(system, static_cast<u32>(Param(system, 0)), Param(system, 1)).raw); } template <ResultCode func(Core::System&, u32, u32, u64)> -void SvcWrap(Core::System& system) { 
+void SvcWrap64(Core::System& system) { FuncReturn(system, func(system, static_cast<u32>(Param(system, 0)), static_cast<u32>(Param(system, 1)), Param(system, 2)) .raw); } template <ResultCode func(Core::System&, u32, u32*, u64*)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { u32 param_1 = 0; u64 param_2 = 0; const ResultCode retval = func(system, static_cast<u32>(Param(system, 2)), ¶m_1, ¶m_2); @@ -161,54 +169,54 @@ void SvcWrap(Core::System& system) { } template <ResultCode func(Core::System&, u64, u64, u32, u32)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { FuncReturn(system, func(system, Param(system, 0), Param(system, 1), static_cast<u32>(Param(system, 2)), static_cast<u32>(Param(system, 3))) .raw); } template <ResultCode func(Core::System&, u64, u64, u32, u64)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { FuncReturn(system, func(system, Param(system, 0), Param(system, 1), static_cast<u32>(Param(system, 2)), Param(system, 3)) .raw); } template <ResultCode func(Core::System&, u32, u64, u32)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { FuncReturn(system, func(system, static_cast<u32>(Param(system, 0)), Param(system, 1), static_cast<u32>(Param(system, 2))) .raw); } template <ResultCode func(Core::System&, u64, u64, u64)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { FuncReturn(system, func(system, Param(system, 0), Param(system, 1), Param(system, 2)).raw); } template <ResultCode func(Core::System&, u64, u64, u32)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { FuncReturn( system, func(system, Param(system, 0), Param(system, 1), static_cast<u32>(Param(system, 2))).raw); } template <ResultCode func(Core::System&, u32, u64, u64, u32)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { FuncReturn(system, func(system, static_cast<u32>(Param(system, 
0)), Param(system, 1), Param(system, 2), static_cast<u32>(Param(system, 3))) .raw); } template <ResultCode func(Core::System&, u32, u64, u64)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { FuncReturn( system, func(system, static_cast<u32>(Param(system, 0)), Param(system, 1), Param(system, 2)).raw); } template <ResultCode func(Core::System&, u32*, u64, u64, s64)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { u32 param_1 = 0; const u32 retval = func(system, ¶m_1, Param(system, 1), static_cast<u32>(Param(system, 2)), static_cast<s64>(Param(system, 3))) @@ -219,14 +227,14 @@ void SvcWrap(Core::System& system) { } template <ResultCode func(Core::System&, u64, u64, u32, s64)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { FuncReturn(system, func(system, Param(system, 0), Param(system, 1), static_cast<u32>(Param(system, 2)), static_cast<s64>(Param(system, 3))) .raw); } template <ResultCode func(Core::System&, u64*, u64, u64, u64)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { u64 param_1 = 0; const u32 retval = func(system, ¶m_1, Param(system, 1), Param(system, 2), Param(system, 3)).raw; @@ -236,7 +244,7 @@ void SvcWrap(Core::System& system) { } template <ResultCode func(Core::System&, u32*, u64, u64, u64, u32, s32)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { u32 param_1 = 0; const u32 retval = func(system, ¶m_1, Param(system, 1), Param(system, 2), Param(system, 3), static_cast<u32>(Param(system, 4)), static_cast<s32>(Param(system, 5))) @@ -247,7 +255,7 @@ void SvcWrap(Core::System& system) { } template <ResultCode func(Core::System&, u32*, u64, u64, u32)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { u32 param_1 = 0; const u32 retval = func(system, ¶m_1, Param(system, 1), Param(system, 2), static_cast<u32>(Param(system, 3))) @@ -258,7 +266,7 @@ void SvcWrap(Core::System& system) { 
} template <ResultCode func(Core::System&, Handle*, u64, u32, u32)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { u32 param_1 = 0; const u32 retval = func(system, ¶m_1, Param(system, 1), static_cast<u32>(Param(system, 2)), static_cast<u32>(Param(system, 3))) @@ -269,14 +277,14 @@ void SvcWrap(Core::System& system) { } template <ResultCode func(Core::System&, u64, u32, s32, s64)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { FuncReturn(system, func(system, Param(system, 0), static_cast<u32>(Param(system, 1)), static_cast<s32>(Param(system, 2)), static_cast<s64>(Param(system, 3))) .raw); } template <ResultCode func(Core::System&, u64, u32, s32, s32)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { FuncReturn(system, func(system, Param(system, 0), static_cast<u32>(Param(system, 1)), static_cast<s32>(Param(system, 2)), static_cast<s32>(Param(system, 3))) .raw); @@ -286,7 +294,7 @@ void SvcWrap(Core::System& system) { // Function wrappers that return type u32 template <u32 func(Core::System&)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { FuncReturn(system, func(system)); } @@ -294,7 +302,7 @@ void SvcWrap(Core::System& system) { // Function wrappers that return type u64 template <u64 func(Core::System&)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { FuncReturn(system, func(system)); } @@ -302,44 +310,110 @@ void SvcWrap(Core::System& system) { /// Function wrappers that return type void template <void func(Core::System&)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { func(system); } template <void func(Core::System&, u32)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { func(system, static_cast<u32>(Param(system, 0))); } template <void func(Core::System&, u32, u64, u64, u64)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { 
func(system, static_cast<u32>(Param(system, 0)), Param(system, 1), Param(system, 2), Param(system, 3)); } template <void func(Core::System&, s64)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { func(system, static_cast<s64>(Param(system, 0))); } template <void func(Core::System&, u64, s32)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { func(system, Param(system, 0), static_cast<s32>(Param(system, 1))); } template <void func(Core::System&, u64, u64)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { func(system, Param(system, 0), Param(system, 1)); } template <void func(Core::System&, u64, u64, u64)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { func(system, Param(system, 0), Param(system, 1), Param(system, 2)); } template <void func(Core::System&, u32, u64, u64)> -void SvcWrap(Core::System& system) { +void SvcWrap64(Core::System& system) { func(system, static_cast<u32>(Param(system, 0)), Param(system, 1), Param(system, 2)); } +// Used by QueryMemory32 +template <ResultCode func(Core::System&, u32, u32, u32)> +void SvcWrap32(Core::System& system) { + FuncReturn32(system, + func(system, Param32(system, 0), Param32(system, 1), Param32(system, 2)).raw); +} + +// Used by GetInfo32 +template <ResultCode func(Core::System&, u32*, u32*, u32, u32, u32, u32)> +void SvcWrap32(Core::System& system) { + u32 param_1 = 0; + u32 param_2 = 0; + + const u32 retval = func(system, ¶m_1, ¶m_2, Param32(system, 0), Param32(system, 1), + Param32(system, 2), Param32(system, 3)) + .raw; + + system.CurrentArmInterface().SetReg(1, param_1); + system.CurrentArmInterface().SetReg(2, param_2); + FuncReturn(system, retval); +} + +// Used by GetThreadPriority32, ConnectToNamedPort32 +template <ResultCode func(Core::System&, u32*, u32)> +void SvcWrap32(Core::System& system) { + u32 param_1 = 0; + const u32 retval = func(system, ¶m_1, Param32(system, 1)).raw; + 
system.CurrentArmInterface().SetReg(1, param_1); + FuncReturn(system, retval); +} + +// Used by GetThreadId32 +template <ResultCode func(Core::System&, u32*, u32*, u32)> +void SvcWrap32(Core::System& system) { + u32 param_1 = 0; + u32 param_2 = 0; + + const u32 retval = func(system, &param_1, &param_2, Param32(system, 1)).raw; + system.CurrentArmInterface().SetReg(1, param_1); + system.CurrentArmInterface().SetReg(2, param_2); + FuncReturn(system, retval); +} + +// Used by SignalProcessWideKey32 +template <void func(Core::System&, u32, s32)> +void SvcWrap32(Core::System& system) { + func(system, static_cast<u32>(Param(system, 0)), static_cast<s32>(Param(system, 1))); +} + +// Used by SendSyncRequest32 +template <ResultCode func(Core::System&, u32)> +void SvcWrap32(Core::System& system) { + FuncReturn(system, func(system, static_cast<u32>(Param(system, 0))).raw); +} + +// Used by WaitSynchronization32 +template <ResultCode func(Core::System&, u32, u32, s32, u32, Handle*)> +void SvcWrap32(Core::System& system) { + u32 param_1 = 0; + const u32 retval = func(system, Param32(system, 0), Param32(system, 1), Param32(system, 2), + Param32(system, 3), &param_1) + .raw; + system.CurrentArmInterface().SetReg(1, param_1); + FuncReturn(system, retval); +} + } // namespace Kernel diff --git a/src/core/hle/kernel/thread.cpp b/src/core/hle/kernel/thread.cpp index bf850e0b2..83e956036 100644 --- a/src/core/hle/kernel/thread.cpp +++ b/src/core/hle/kernel/thread.cpp @@ -133,15 +133,16 @@ void Thread::CancelWait() { ResumeFromWait(); } -/** - * Resets a thread context, making it ready to be scheduled and run by the CPU - * @param context Thread context to reset - * @param stack_top Address of the top of the stack - * @param entry_point Address of entry point for execution - * @param arg User argument for thread - */ -static void ResetThreadContext(Core::ARM_Interface::ThreadContext& context, VAddr stack_top, - VAddr entry_point, u64 arg) { +static void 
ResetThreadContext32(Core::ARM_Interface::ThreadContext32& context, u32 stack_top, + u32 entry_point, u32 arg) { + context = {}; + context.cpu_registers[0] = arg; + context.cpu_registers[15] = entry_point; + context.cpu_registers[13] = stack_top; +} + +static void ResetThreadContext64(Core::ARM_Interface::ThreadContext64& context, VAddr stack_top, + VAddr entry_point, u64 arg) { context = {}; context.cpu_registers[0] = arg; context.pc = entry_point; @@ -198,9 +199,9 @@ ResultVal<std::shared_ptr<Thread>> Thread::Create(KernelCore& kernel, std::strin thread->owner_process->RegisterThread(thread.get()); - // TODO(peachum): move to ScheduleThread() when scheduler is added so selected core is used - // to initialize the context - ResetThreadContext(thread->context, stack_top, entry_point, arg); + ResetThreadContext32(thread->context_32, static_cast<u32>(stack_top), + static_cast<u32>(entry_point), static_cast<u32>(arg)); + ResetThreadContext64(thread->context_64, stack_top, entry_point, arg); return MakeResult<std::shared_ptr<Thread>>(std::move(thread)); } @@ -213,11 +214,13 @@ void Thread::SetPriority(u32 priority) { } void Thread::SetWaitSynchronizationResult(ResultCode result) { - context.cpu_registers[0] = result.raw; + context_32.cpu_registers[0] = result.raw; + context_64.cpu_registers[0] = result.raw; } void Thread::SetWaitSynchronizationOutput(s32 output) { - context.cpu_registers[1] = output; + context_32.cpu_registers[1] = output; + context_64.cpu_registers[1] = output; } s32 Thread::GetSynchronizationObjectIndex(std::shared_ptr<SynchronizationObject> object) const { diff --git a/src/core/hle/kernel/thread.h b/src/core/hle/kernel/thread.h index 129e7858a..23fdef8a4 100644 --- a/src/core/hle/kernel/thread.h +++ b/src/core/hle/kernel/thread.h @@ -102,7 +102,8 @@ public: using MutexWaitingThreads = std::vector<std::shared_ptr<Thread>>; - using ThreadContext = Core::ARM_Interface::ThreadContext; + using ThreadContext32 = Core::ARM_Interface::ThreadContext32; + 
using ThreadContext64 = Core::ARM_Interface::ThreadContext64; using ThreadSynchronizationObjects = std::vector<std::shared_ptr<SynchronizationObject>>; @@ -273,12 +274,20 @@ public: return status == ThreadStatus::WaitSynch; } - ThreadContext& GetContext() { - return context; + ThreadContext32& GetContext32() { + return context_32; } - const ThreadContext& GetContext() const { - return context; + const ThreadContext32& GetContext32() const { + return context_32; + } + + ThreadContext64& GetContext64() { + return context_64; + } + + const ThreadContext64& GetContext64() const { + return context_64; } ThreadStatus GetStatus() const { @@ -466,7 +475,8 @@ private: void AdjustSchedulingOnPriority(u32 old_priority); void AdjustSchedulingOnAffinity(u64 old_affinity_mask, s32 old_core); - Core::ARM_Interface::ThreadContext context{}; + ThreadContext32 context_32{}; + ThreadContext64 context_64{}; u64 thread_id = 0; diff --git a/src/core/hle/service/am/am.cpp b/src/core/hle/service/am/am.cpp index cc978713b..d1bf13c89 100644 --- a/src/core/hle/service/am/am.cpp +++ b/src/core/hle/service/am/am.cpp @@ -607,7 +607,7 @@ ICommonStateGetter::ICommonStateGetter(Core::System& system, {40, nullptr, "GetCradleFwVersion"}, {50, nullptr, "IsVrModeEnabled"}, {51, nullptr, "SetVrModeEnabled"}, - {52, nullptr, "SwitchLcdBacklight"}, + {52, &ICommonStateGetter::SetLcdBacklighOffEnabled, "SetLcdBacklighOffEnabled"}, {53, nullptr, "BeginVrModeEx"}, {54, nullptr, "EndVrModeEx"}, {55, nullptr, "IsInControllerFirmwareUpdateSection"}, @@ -636,7 +636,6 @@ void ICommonStateGetter::GetBootMode(Kernel::HLERequestContext& ctx) { IPC::ResponseBuilder rb{ctx, 3}; rb.Push(RESULT_SUCCESS); - rb.Push<u8>(static_cast<u8>(Service::PM::SystemBootMode::Normal)); // Normal boot mode } @@ -660,6 +659,7 @@ void ICommonStateGetter::ReceiveMessage(Kernel::HLERequestContext& ctx) { rb.PushEnum<AppletMessageQueue::AppletMessage>(message); return; } + rb.Push(RESULT_SUCCESS); 
rb.PushEnum<AppletMessageQueue::AppletMessage>(message); } @@ -672,6 +672,17 @@ void ICommonStateGetter::GetCurrentFocusState(Kernel::HLERequestContext& ctx) { rb.Push(static_cast<u8>(FocusState::InFocus)); } +void ICommonStateGetter::SetLcdBacklighOffEnabled(Kernel::HLERequestContext& ctx) { + IPC::RequestParser rp{ctx}; + const auto is_lcd_backlight_off_enabled = rp.Pop<bool>(); + + LOG_WARNING(Service_AM, "(STUBBED) called. is_lcd_backlight_off_enabled={}", + is_lcd_backlight_off_enabled); + + IPC::ResponseBuilder rb{ctx, 2}; + rb.Push(RESULT_SUCCESS); +} + void ICommonStateGetter::GetDefaultDisplayResolutionChangeEvent(Kernel::HLERequestContext& ctx) { LOG_DEBUG(Service_AM, "called"); diff --git a/src/core/hle/service/am/am.h b/src/core/hle/service/am/am.h index 0b9a4332d..0843de781 100644 --- a/src/core/hle/service/am/am.h +++ b/src/core/hle/service/am/am.h @@ -182,6 +182,7 @@ private: void GetOperationMode(Kernel::HLERequestContext& ctx); void GetPerformanceMode(Kernel::HLERequestContext& ctx); void GetBootMode(Kernel::HLERequestContext& ctx); + void SetLcdBacklighOffEnabled(Kernel::HLERequestContext& ctx); void GetDefaultDisplayResolution(Kernel::HLERequestContext& ctx); void SetCpuBoostMode(Kernel::HLERequestContext& ctx); diff --git a/src/core/hle/service/am/applets/web_browser.cpp b/src/core/hle/service/am/applets/web_browser.cpp index 12443c910..9f30e167d 100644 --- a/src/core/hle/service/am/applets/web_browser.cpp +++ b/src/core/hle/service/am/applets/web_browser.cpp @@ -254,6 +254,12 @@ void WebBrowser::Execute() { if (status != RESULT_SUCCESS) { complete = true; + + // This is a workaround in order not to softlock yuzu when an error happens during the + // webapplet init. 
In order to avoid an svcBreak, the status is set to RESULT_SUCCESS + Finalize(); + status = RESULT_SUCCESS; + return; } diff --git a/src/core/hle/service/hid/controllers/npad.cpp b/src/core/hle/service/hid/controllers/npad.cpp index 15c09f04c..c1e32b28c 100644 --- a/src/core/hle/service/hid/controllers/npad.cpp +++ b/src/core/hle/service/hid/controllers/npad.cpp @@ -287,13 +287,13 @@ void Controller_NPad::RequestPadStateUpdate(u32 npad_id) { analog_state[static_cast<std::size_t>(JoystickId::Joystick_Left)]->GetAnalogDirectionStatus( Input::AnalogDirection::DOWN)); - pad_state.r_stick_up.Assign(analog_state[static_cast<std::size_t>(JoystickId::Joystick_Right)] - ->GetAnalogDirectionStatus(Input::AnalogDirection::RIGHT)); - pad_state.r_stick_left.Assign(analog_state[static_cast<std::size_t>(JoystickId::Joystick_Right)] - ->GetAnalogDirectionStatus(Input::AnalogDirection::LEFT)); pad_state.r_stick_right.Assign( analog_state[static_cast<std::size_t>(JoystickId::Joystick_Right)] - ->GetAnalogDirectionStatus(Input::AnalogDirection::UP)); + ->GetAnalogDirectionStatus(Input::AnalogDirection::RIGHT)); + pad_state.r_stick_left.Assign(analog_state[static_cast<std::size_t>(JoystickId::Joystick_Right)] + ->GetAnalogDirectionStatus(Input::AnalogDirection::LEFT)); + pad_state.r_stick_up.Assign(analog_state[static_cast<std::size_t>(JoystickId::Joystick_Right)] + ->GetAnalogDirectionStatus(Input::AnalogDirection::UP)); pad_state.r_stick_down.Assign(analog_state[static_cast<std::size_t>(JoystickId::Joystick_Right)] ->GetAnalogDirectionStatus(Input::AnalogDirection::DOWN)); diff --git a/src/core/hle/service/nvflinger/nvflinger.cpp b/src/core/hle/service/nvflinger/nvflinger.cpp index 134152210..437bc5dee 100644 --- a/src/core/hle/service/nvflinger/nvflinger.cpp +++ b/src/core/hle/service/nvflinger/nvflinger.cpp @@ -191,8 +191,6 @@ void NVFlinger::Compose() { // Search for a queued buffer and acquire it auto buffer = buffer_queue.AcquireBuffer(); - MicroProfileFlip(); - if (!buffer) { 
continue; } @@ -206,6 +204,8 @@ void NVFlinger::Compose() { gpu.WaitFence(fence.id, fence.value); } + MicroProfileFlip(); + // Now send the buffer to the GPU for drawing. // TODO(Subv): Support more than just disp0. The display device selection is probably based // on which display we're drawing (Default, Internal, External, etc) diff --git a/src/core/hle/service/set/set.cpp b/src/core/hle/service/set/set.cpp index 5bcc0b588..9e12c76fc 100644 --- a/src/core/hle/service/set/set.cpp +++ b/src/core/hle/service/set/set.cpp @@ -111,6 +111,14 @@ void SET::GetLanguageCode(Kernel::HLERequestContext& ctx) { rb.PushEnum(available_language_codes[Settings::values.language_index]); } +void SET::GetRegionCode(Kernel::HLERequestContext& ctx) { + LOG_DEBUG(Service_SET, "called"); + + IPC::ResponseBuilder rb{ctx, 3}; + rb.Push(RESULT_SUCCESS); + rb.Push(Settings::values.region_index); +} + SET::SET() : ServiceFramework("set") { // clang-format off static const FunctionInfo functions[] = { @@ -118,7 +126,7 @@ SET::SET() : ServiceFramework("set") { {1, &SET::GetAvailableLanguageCodes, "GetAvailableLanguageCodes"}, {2, &SET::MakeLanguageCode, "MakeLanguageCode"}, {3, &SET::GetAvailableLanguageCodeCount, "GetAvailableLanguageCodeCount"}, - {4, nullptr, "GetRegionCode"}, + {4, &SET::GetRegionCode, "GetRegionCode"}, {5, &SET::GetAvailableLanguageCodes2, "GetAvailableLanguageCodes2"}, {6, &SET::GetAvailableLanguageCodeCount2, "GetAvailableLanguageCodeCount2"}, {7, nullptr, "GetKeyCodeMap"}, diff --git a/src/core/hle/service/set/set.h b/src/core/hle/service/set/set.h index b154e08aa..6084b345d 100644 --- a/src/core/hle/service/set/set.h +++ b/src/core/hle/service/set/set.h @@ -43,6 +43,7 @@ private: void GetAvailableLanguageCodeCount(Kernel::HLERequestContext& ctx); void GetAvailableLanguageCodeCount2(Kernel::HLERequestContext& ctx); void GetQuestFlag(Kernel::HLERequestContext& ctx); + void GetRegionCode(Kernel::HLERequestContext& ctx); }; } // namespace Service::Set diff --git 
a/src/core/hle/service/sm/controller.cpp b/src/core/hle/service/sm/controller.cpp index c45b285f8..9cca84b31 100644 --- a/src/core/hle/service/sm/controller.cpp +++ b/src/core/hle/service/sm/controller.cpp @@ -44,7 +44,7 @@ void Controller::QueryPointerBufferSize(Kernel::HLERequestContext& ctx) { IPC::ResponseBuilder rb{ctx, 3}; rb.Push(RESULT_SUCCESS); - rb.Push<u16>(0x500); + rb.Push<u16>(0x1000); } Controller::Controller() : ServiceFramework("IpcController") { diff --git a/src/core/hle/service/time/time_zone_content_manager.cpp b/src/core/hle/service/time/time_zone_content_manager.cpp index 57b1a2bca..78d4acd95 100644 --- a/src/core/hle/service/time/time_zone_content_manager.cpp +++ b/src/core/hle/service/time/time_zone_content_manager.cpp @@ -53,7 +53,7 @@ static std::vector<std::string> BuildLocationNameCache(Core::System& system) { return {}; } - std::vector<char> raw_data(binary_list->GetSize()); + std::vector<char> raw_data(binary_list->GetSize() + 1); binary_list->ReadBytes<char>(raw_data.data(), binary_list->GetSize()); std::stringstream data_stream{raw_data.data()}; diff --git a/src/core/loader/deconstructed_rom_directory.cpp b/src/core/loader/deconstructed_rom_directory.cpp index d19c3623c..53559e8b1 100644 --- a/src/core/loader/deconstructed_rom_directory.cpp +++ b/src/core/loader/deconstructed_rom_directory.cpp @@ -129,12 +129,6 @@ AppLoader_DeconstructedRomDirectory::LoadResult AppLoader_DeconstructedRomDirect } metadata.Print(); - const FileSys::ProgramAddressSpaceType arch_bits{metadata.GetAddressSpaceType()}; - if (arch_bits == FileSys::ProgramAddressSpaceType::Is32Bit || - arch_bits == FileSys::ProgramAddressSpaceType::Is32BitNoMap) { - return {ResultStatus::Error32BitISA, {}}; - } - if (process.LoadFromMetadata(metadata).IsError()) { return {ResultStatus::ErrorUnableToParseKernelMetadata, {}}; } diff --git a/src/core/reporter.cpp b/src/core/reporter.cpp index f95eee3b1..85ac81ef7 100644 --- a/src/core/reporter.cpp +++ b/src/core/reporter.cpp @@ 
-111,7 +111,7 @@ json GetProcessorStateDataAuto(Core::System& system) { const auto& vm_manager{process->VMManager()}; auto& arm{system.CurrentArmInterface()}; - Core::ARM_Interface::ThreadContext context{}; + Core::ARM_Interface::ThreadContext64 context{}; arm.SaveContext(context); return GetProcessorStateData(process->Is64BitProcess() ? "AArch64" : "AArch32", diff --git a/src/core/settings.cpp b/src/core/settings.cpp index d1fc94060..c1282cb80 100644 --- a/src/core/settings.cpp +++ b/src/core/settings.cpp @@ -86,6 +86,7 @@ void LogSettings() { LogSetting("System_RngSeed", Settings::values.rng_seed.value_or(0)); LogSetting("System_CurrentUser", Settings::values.current_user); LogSetting("System_LanguageIndex", Settings::values.language_index); + LogSetting("System_RegionIndex", Settings::values.region_index); LogSetting("Core_UseMultiCore", Settings::values.use_multi_core); LogSetting("Renderer_UseResolutionFactor", Settings::values.resolution_factor); LogSetting("Renderer_UseFrameLimit", Settings::values.use_frame_limit); @@ -94,6 +95,7 @@ void LogSettings() { LogSetting("Renderer_UseAccurateGpuEmulation", Settings::values.use_accurate_gpu_emulation); LogSetting("Renderer_UseAsynchronousGpuEmulation", Settings::values.use_asynchronous_gpu_emulation); + LogSetting("Renderer_UseVsync", Settings::values.use_vsync); LogSetting("Audio_OutputEngine", Settings::values.sink_id); LogSetting("Audio_EnableAudioStretching", Settings::values.enable_audio_stretching); LogSetting("Audio_OutputDevice", Settings::values.audio_device_id); diff --git a/src/core/settings.h b/src/core/settings.h index f837d3fbc..79ec01731 100644 --- a/src/core/settings.h +++ b/src/core/settings.h @@ -387,6 +387,8 @@ struct Values { s32 current_user; s32 language_index; + s32 region_index; + s32 sound_index; // Controls std::array<PlayerInput, 10> players; @@ -430,11 +432,13 @@ struct Values { float resolution_factor; int aspect_ratio; + int max_anisotropy; bool use_frame_limit; u16 frame_limit; bool 
use_disk_shader_cache; bool use_accurate_gpu_emulation; bool use_asynchronous_gpu_emulation; + bool use_vsync; bool force_30fps_mode; float bg_red; diff --git a/src/core/telemetry_session.cpp b/src/core/telemetry_session.cpp index 0e72d31cd..0f3685d1c 100644 --- a/src/core/telemetry_session.cpp +++ b/src/core/telemetry_session.cpp @@ -188,6 +188,7 @@ void TelemetrySession::AddInitialInfo(Loader::AppLoader& app_loader) { Settings::values.use_accurate_gpu_emulation); AddField(field_type, "Renderer_UseAsynchronousGpuEmulation", Settings::values.use_asynchronous_gpu_emulation); + AddField(field_type, "Renderer_UseVsync", Settings::values.use_vsync); AddField(field_type, "System_UseDockedMode", Settings::values.use_docked_mode); } diff --git a/src/input_common/analog_from_button.cpp b/src/input_common/analog_from_button.cpp index e1a260762..6cabdaa3c 100755 --- a/src/input_common/analog_from_button.cpp +++ b/src/input_common/analog_from_button.cpp @@ -34,6 +34,20 @@ public: y * coef * (x == 0 ? 
1.0f : SQRT_HALF)); } + bool GetAnalogDirectionStatus(Input::AnalogDirection direction) const override { + switch (direction) { + case Input::AnalogDirection::RIGHT: + return right->GetStatus(); + case Input::AnalogDirection::LEFT: + return left->GetStatus(); + case Input::AnalogDirection::UP: + return up->GetStatus(); + case Input::AnalogDirection::DOWN: + return down->GetStatus(); + } + return false; + } + private: Button up; Button down; diff --git a/src/input_common/udp/client.cpp b/src/input_common/udp/client.cpp index 2228571a6..da5227058 100644 --- a/src/input_common/udp/client.cpp +++ b/src/input_common/udp/client.cpp @@ -32,8 +32,16 @@ public: SocketCallback callback) : callback(std::move(callback)), timer(io_service), socket(io_service, udp::endpoint(udp::v4(), 0)), client_id(client_id), - pad_index(pad_index), - send_endpoint(udp::endpoint(boost::asio::ip::make_address_v4(host), port)) {} + pad_index(pad_index) { + boost::system::error_code ec{}; + auto ipv4 = boost::asio::ip::make_address_v4(host, ec); + if (ec.value() != boost::system::errc::success) { + LOG_ERROR(Input, "Invalid IPv4 address \"{}\" provided to socket", host); + ipv4 = boost::asio::ip::address_v4{}; + } + + send_endpoint = {udp::endpoint(ipv4, port)}; + } void Stop() { io_service.stop(); @@ -85,17 +93,18 @@ private: } void HandleSend(const boost::system::error_code& error) { + boost::system::error_code _ignored{}; // Send a request for getting port info for the pad Request::PortInfo port_info{1, {pad_index, 0, 0, 0}}; const auto port_message = Request::Create(port_info, client_id); std::memcpy(&send_buffer1, &port_message, PORT_INFO_SIZE); - socket.send_to(boost::asio::buffer(send_buffer1), send_endpoint); + socket.send_to(boost::asio::buffer(send_buffer1), send_endpoint, {}, _ignored); // Send a request for getting pad data for the pad Request::PadData pad_data{Request::PadData::Flags::Id, pad_index, EMPTY_MAC_ADDRESS}; const auto pad_message = Request::Create(pad_data, client_id); 
std::memcpy(send_buffer2.data(), &pad_message, PAD_DATA_SIZE); - socket.send_to(boost::asio::buffer(send_buffer2), send_endpoint); + socket.send_to(boost::asio::buffer(send_buffer2), send_endpoint, {}, _ignored); StartSend(timer.expiry()); } diff --git a/src/input_common/udp/protocol.cpp b/src/input_common/udp/protocol.cpp index a982ac49d..5e50bd612 100644 --- a/src/input_common/udp/protocol.cpp +++ b/src/input_common/udp/protocol.cpp @@ -31,7 +31,6 @@ namespace Response { */ std::optional<Type> Validate(u8* data, std::size_t size) { if (size < sizeof(Header)) { - LOG_DEBUG(Input, "Invalid UDP packet received"); return std::nullopt; } Header header{}; diff --git a/src/input_common/udp/udp.cpp b/src/input_common/udp/udp.cpp index ca99cc22f..8c6ef1394 100644 --- a/src/input_common/udp/udp.cpp +++ b/src/input_common/udp/udp.cpp @@ -3,6 +3,7 @@ // Refer to the license.txt file included. #include <mutex> +#include <optional> #include <tuple> #include "common/param_package.h" @@ -44,7 +45,7 @@ public: std::unique_ptr<Input::TouchDevice> Create(const Common::ParamPackage& params) override { { std::lock_guard guard(status->update_mutex); - status->touch_calibration.emplace(); + status->touch_calibration = DeviceStatus::CalibrationData{}; // These default values work well for DS4 but probably not other touch inputs status->touch_calibration->min_x = params.Get("min_x", 100); status->touch_calibration->min_y = params.Get("min_y", 50); diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 4b0c6346f..91df062d7 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -2,6 +2,8 @@ add_library(video_core STATIC buffer_cache/buffer_block.h buffer_cache/buffer_cache.h buffer_cache/map_interval.h + dirty_flags.cpp + dirty_flags.h dma_pusher.cpp dma_pusher.h engines/const_buffer_engine_interface.h @@ -63,14 +65,12 @@ add_library(video_core STATIC renderer_opengl/gl_shader_decompiler.h renderer_opengl/gl_shader_disk_cache.cpp 
renderer_opengl/gl_shader_disk_cache.h - renderer_opengl/gl_shader_gen.cpp - renderer_opengl/gl_shader_gen.h renderer_opengl/gl_shader_manager.cpp renderer_opengl/gl_shader_manager.h renderer_opengl/gl_shader_util.cpp renderer_opengl/gl_shader_util.h - renderer_opengl/gl_state.cpp - renderer_opengl/gl_state.h + renderer_opengl/gl_state_tracker.cpp + renderer_opengl/gl_state_tracker.h renderer_opengl/gl_stream_buffer.cpp renderer_opengl/gl_stream_buffer.h renderer_opengl/gl_texture_cache.cpp @@ -116,8 +116,6 @@ add_library(video_core STATIC shader/ast.h shader/compiler_settings.cpp shader/compiler_settings.h - shader/const_buffer_locker.cpp - shader/const_buffer_locker.h shader/control_flow.cpp shader/control_flow.h shader/decode.cpp @@ -126,9 +124,13 @@ add_library(video_core STATIC shader/node_helper.cpp shader/node_helper.h shader/node.h + shader/registry.cpp + shader/registry.h shader/shader_ir.cpp shader/shader_ir.h shader/track.cpp + shader/transform_feedback.cpp + shader/transform_feedback.h surface.cpp surface.h texture_cache/format_lookup_table.cpp @@ -198,6 +200,8 @@ if (ENABLE_VULKAN) renderer_vulkan/vk_shader_util.h renderer_vulkan/vk_staging_buffer_pool.cpp renderer_vulkan/vk_staging_buffer_pool.h + renderer_vulkan/vk_state_tracker.cpp + renderer_vulkan/vk_state_tracker.h renderer_vulkan/vk_stream_buffer.cpp renderer_vulkan/vk_stream_buffer.h renderer_vulkan/vk_swapchain.cpp diff --git a/src/video_core/dirty_flags.cpp b/src/video_core/dirty_flags.cpp new file mode 100644 index 000000000..e16075993 --- /dev/null +++ b/src/video_core/dirty_flags.cpp @@ -0,0 +1,38 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include <array> +#include <cstddef> + +#include "common/common_types.h" +#include "video_core/dirty_flags.h" + +#define OFF(field_name) MAXWELL3D_REG_INDEX(field_name) +#define NUM(field_name) (sizeof(::Tegra::Engines::Maxwell3D::Regs::field_name) / sizeof(u32)) + +namespace VideoCommon::Dirty { + +using Tegra::Engines::Maxwell3D; + +void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables) { + static constexpr std::size_t num_per_rt = NUM(rt[0]); + static constexpr std::size_t begin = OFF(rt); + static constexpr std::size_t num = num_per_rt * Maxwell3D::Regs::NumRenderTargets; + for (std::size_t rt = 0; rt < Maxwell3D::Regs::NumRenderTargets; ++rt) { + FillBlock(tables[0], begin + rt * num_per_rt, num_per_rt, ColorBuffer0 + rt); + } + FillBlock(tables[1], begin, num, RenderTargets); + + static constexpr std::array zeta_flags{ZetaBuffer, RenderTargets}; + for (std::size_t i = 0; i < std::size(zeta_flags); ++i) { + const u8 flag = zeta_flags[i]; + auto& table = tables[i]; + table[OFF(zeta_enable)] = flag; + table[OFF(zeta_width)] = flag; + table[OFF(zeta_height)] = flag; + FillBlock(table, OFF(zeta), NUM(zeta), flag); + } +} + +} // namespace VideoCommon::Dirty diff --git a/src/video_core/dirty_flags.h b/src/video_core/dirty_flags.h new file mode 100644 index 000000000..3f6c1d83a --- /dev/null +++ b/src/video_core/dirty_flags.h @@ -0,0 +1,49 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include <algorithm> +#include <cstddef> +#include <iterator> + +#include "common/common_types.h" +#include "video_core/engines/maxwell_3d.h" + +namespace VideoCommon::Dirty { + +enum : u8 { + NullEntry = 0, + + RenderTargets, + ColorBuffer0, + ColorBuffer1, + ColorBuffer2, + ColorBuffer3, + ColorBuffer4, + ColorBuffer5, + ColorBuffer6, + ColorBuffer7, + ZetaBuffer, + + LastCommonEntry, +}; + +template <typename Integer> +void FillBlock(Tegra::Engines::Maxwell3D::DirtyState::Table& table, std::size_t begin, + std::size_t num, Integer dirty_index) { + const auto it = std::begin(table) + begin; + std::fill(it, it + num, static_cast<u8>(dirty_index)); +} + +template <typename Integer1, typename Integer2> +void FillBlock(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables, std::size_t begin, + std::size_t num, Integer1 index_a, Integer2 index_b) { + FillBlock(tables[0], begin, num, index_a); + FillBlock(tables[1], begin, num, index_b); +} + +void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables); + +} // namespace VideoCommon::Dirty diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp index 0094fd715..713c14182 100644 --- a/src/video_core/dma_pusher.cpp +++ b/src/video_core/dma_pusher.cpp @@ -22,7 +22,7 @@ void DmaPusher::DispatchCalls() { MICROPROFILE_SCOPE(DispatchCalls); // On entering GPU code, assume all memory may be touched by the ARM core. 
- gpu.Maxwell3D().dirty.OnMemoryWrite(); + gpu.Maxwell3D().OnMemoryWrite(); dma_pushbuffer_subindex = 0; diff --git a/src/video_core/engines/const_buffer_engine_interface.h b/src/video_core/engines/const_buffer_engine_interface.h index d56a47710..724ee0fd6 100644 --- a/src/video_core/engines/const_buffer_engine_interface.h +++ b/src/video_core/engines/const_buffer_engine_interface.h @@ -16,11 +16,12 @@ namespace Tegra::Engines { struct SamplerDescriptor { union { - BitField<0, 20, Tegra::Shader::TextureType> texture_type; - BitField<20, 1, u32> is_array; - BitField<21, 1, u32> is_buffer; - BitField<22, 1, u32> is_shadow; - u32 raw{}; + u32 raw = 0; + BitField<0, 2, Tegra::Shader::TextureType> texture_type; + BitField<2, 3, Tegra::Texture::ComponentType> component_type; + BitField<5, 1, u32> is_array; + BitField<6, 1, u32> is_buffer; + BitField<7, 1, u32> is_shadow; }; bool operator==(const SamplerDescriptor& rhs) const noexcept { @@ -31,68 +32,48 @@ struct SamplerDescriptor { return !operator==(rhs); } - static SamplerDescriptor FromTicTexture(Tegra::Texture::TextureType tic_texture_type) { + static SamplerDescriptor FromTIC(const Tegra::Texture::TICEntry& tic) { + using Tegra::Shader::TextureType; SamplerDescriptor result; - switch (tic_texture_type) { + + // This is going to be used to determine the shading language type. + // Because of that we don't care about all component types on color textures. 
+ result.component_type.Assign(tic.r_type.Value()); + + switch (tic.texture_type.Value()) { case Tegra::Texture::TextureType::Texture1D: - result.texture_type.Assign(Tegra::Shader::TextureType::Texture1D); - result.is_array.Assign(0); - result.is_buffer.Assign(0); - result.is_shadow.Assign(0); + result.texture_type.Assign(TextureType::Texture1D); return result; case Tegra::Texture::TextureType::Texture2D: - result.texture_type.Assign(Tegra::Shader::TextureType::Texture2D); - result.is_array.Assign(0); - result.is_buffer.Assign(0); - result.is_shadow.Assign(0); + result.texture_type.Assign(TextureType::Texture2D); return result; case Tegra::Texture::TextureType::Texture3D: - result.texture_type.Assign(Tegra::Shader::TextureType::Texture3D); - result.is_array.Assign(0); - result.is_buffer.Assign(0); - result.is_shadow.Assign(0); + result.texture_type.Assign(TextureType::Texture3D); return result; case Tegra::Texture::TextureType::TextureCubemap: - result.texture_type.Assign(Tegra::Shader::TextureType::TextureCube); - result.is_array.Assign(0); - result.is_buffer.Assign(0); - result.is_shadow.Assign(0); + result.texture_type.Assign(TextureType::TextureCube); return result; case Tegra::Texture::TextureType::Texture1DArray: - result.texture_type.Assign(Tegra::Shader::TextureType::Texture1D); + result.texture_type.Assign(TextureType::Texture1D); result.is_array.Assign(1); - result.is_buffer.Assign(0); - result.is_shadow.Assign(0); return result; case Tegra::Texture::TextureType::Texture2DArray: - result.texture_type.Assign(Tegra::Shader::TextureType::Texture2D); + result.texture_type.Assign(TextureType::Texture2D); result.is_array.Assign(1); - result.is_buffer.Assign(0); - result.is_shadow.Assign(0); return result; case Tegra::Texture::TextureType::Texture1DBuffer: - result.texture_type.Assign(Tegra::Shader::TextureType::Texture1D); - result.is_array.Assign(0); + result.texture_type.Assign(TextureType::Texture1D); result.is_buffer.Assign(1); - result.is_shadow.Assign(0); 
return result; case Tegra::Texture::TextureType::Texture2DNoMipmap: - result.texture_type.Assign(Tegra::Shader::TextureType::Texture2D); - result.is_array.Assign(0); - result.is_buffer.Assign(0); - result.is_shadow.Assign(0); + result.texture_type.Assign(TextureType::Texture2D); return result; case Tegra::Texture::TextureType::TextureCubeArray: - result.texture_type.Assign(Tegra::Shader::TextureType::TextureCube); + result.texture_type.Assign(TextureType::TextureCube); result.is_array.Assign(1); - result.is_buffer.Assign(0); - result.is_shadow.Assign(0); return result; default: - result.texture_type.Assign(Tegra::Shader::TextureType::Texture2D); - result.is_array.Assign(0); - result.is_buffer.Assign(0); - result.is_shadow.Assign(0); + result.texture_type.Assign(TextureType::Texture2D); return result; } } diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp index 4b824aa4e..368c75a66 100644 --- a/src/video_core/engines/kepler_compute.cpp +++ b/src/video_core/engines/kepler_compute.cpp @@ -39,7 +39,7 @@ void KeplerCompute::CallMethod(const GPU::MethodCall& method_call) { const bool is_last_call = method_call.IsLastCall(); upload_state.ProcessData(method_call.argument, is_last_call); if (is_last_call) { - system.GPU().Maxwell3D().dirty.OnMemoryWrite(); + system.GPU().Maxwell3D().OnMemoryWrite(); } break; } @@ -89,7 +89,7 @@ SamplerDescriptor KeplerCompute::AccessBindlessSampler(ShaderType stage, u64 con const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)}; const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle); - SamplerDescriptor result = SamplerDescriptor::FromTicTexture(tex_info.tic.texture_type.Value()); + SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic); result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value()); return result; } @@ -119,14 +119,6 @@ Texture::TICEntry KeplerCompute::GetTICEntry(u32 tic_index) const { Texture::TICEntry tic_entry; 
memory_manager.ReadBlockUnsafe(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry)); - const auto r_type{tic_entry.r_type.Value()}; - const auto g_type{tic_entry.g_type.Value()}; - const auto b_type{tic_entry.b_type.Value()}; - const auto a_type{tic_entry.a_type.Value()}; - - // TODO(Subv): Different data types for separate components are not supported - DEBUG_ASSERT(r_type == g_type && r_type == b_type && r_type == a_type); - return tic_entry; } diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp index fa4a7c5c1..597872e43 100644 --- a/src/video_core/engines/kepler_memory.cpp +++ b/src/video_core/engines/kepler_memory.cpp @@ -34,7 +34,7 @@ void KeplerMemory::CallMethod(const GPU::MethodCall& method_call) { const bool is_last_call = method_call.IsLastCall(); upload_state.ProcessData(method_call.argument, is_last_call); if (is_last_call) { - system.GPU().Maxwell3D().dirty.OnMemoryWrite(); + system.GPU().Maxwell3D().OnMemoryWrite(); } break; } diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index b28de1092..ba63b44b4 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -26,7 +26,8 @@ Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& raste MemoryManager& memory_manager) : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager}, macro_interpreter{*this}, upload_state{memory_manager, regs.upload} { - InitDirtySettings(); + dirty.flags.flip(); + InitializeRegisterDefaults(); } @@ -75,8 +76,8 @@ void Maxwell3D::InitializeRegisterDefaults() { regs.stencil_back_mask = 0xFFFFFFFF; regs.depth_test_func = Regs::ComparisonOp::Always; - regs.cull.front_face = Regs::Cull::FrontFace::CounterClockWise; - regs.cull.cull_face = Regs::Cull::CullFace::Back; + regs.front_face = Regs::FrontFace::CounterClockWise; + regs.cull_face = Regs::CullFace::Back; // TODO(Rodrigo): Most games do not set a point size. 
I think this is a case of a // register carrying a default value. Assume it's OpenGL's default (1). @@ -95,7 +96,9 @@ void Maxwell3D::InitializeRegisterDefaults() { regs.rasterize_enable = 1; regs.rt_separate_frag_data = 1; regs.framebuffer_srgb = 1; - regs.cull.front_face = Maxwell3D::Regs::Cull::FrontFace::ClockWise; + regs.front_face = Maxwell3D::Regs::FrontFace::ClockWise; + + shadow_state = regs; mme_inline[MAXWELL3D_REG_INDEX(draw.vertex_end_gl)] = true; mme_inline[MAXWELL3D_REG_INDEX(draw.vertex_begin_gl)] = true; @@ -103,164 +106,6 @@ void Maxwell3D::InitializeRegisterDefaults() { mme_inline[MAXWELL3D_REG_INDEX(index_array.count)] = true; } -#define DIRTY_REGS_POS(field_name) static_cast<u8>(offsetof(Maxwell3D::DirtyRegs, field_name)) - -void Maxwell3D::InitDirtySettings() { - const auto set_block = [this](std::size_t start, std::size_t range, u8 position) { - const auto start_itr = dirty_pointers.begin() + start; - const auto end_itr = start_itr + range; - std::fill(start_itr, end_itr, position); - }; - dirty.regs.fill(true); - - // Init Render Targets - constexpr u32 registers_per_rt = sizeof(regs.rt[0]) / sizeof(u32); - constexpr u32 rt_start_reg = MAXWELL3D_REG_INDEX(rt); - constexpr u32 rt_end_reg = rt_start_reg + registers_per_rt * 8; - u8 rt_dirty_reg = DIRTY_REGS_POS(render_target); - for (u32 rt_reg = rt_start_reg; rt_reg < rt_end_reg; rt_reg += registers_per_rt) { - set_block(rt_reg, registers_per_rt, rt_dirty_reg); - ++rt_dirty_reg; - } - constexpr u32 depth_buffer_flag = DIRTY_REGS_POS(depth_buffer); - dirty_pointers[MAXWELL3D_REG_INDEX(zeta_enable)] = depth_buffer_flag; - dirty_pointers[MAXWELL3D_REG_INDEX(zeta_width)] = depth_buffer_flag; - dirty_pointers[MAXWELL3D_REG_INDEX(zeta_height)] = depth_buffer_flag; - constexpr u32 registers_in_zeta = sizeof(regs.zeta) / sizeof(u32); - constexpr u32 zeta_reg = MAXWELL3D_REG_INDEX(zeta); - set_block(zeta_reg, registers_in_zeta, depth_buffer_flag); - - // Init Vertex Arrays - constexpr u32 
vertex_array_start = MAXWELL3D_REG_INDEX(vertex_array); - constexpr u32 vertex_array_size = sizeof(regs.vertex_array[0]) / sizeof(u32); - constexpr u32 vertex_array_end = vertex_array_start + vertex_array_size * Regs::NumVertexArrays; - u8 va_dirty_reg = DIRTY_REGS_POS(vertex_array); - u8 vi_dirty_reg = DIRTY_REGS_POS(vertex_instance); - for (u32 vertex_reg = vertex_array_start; vertex_reg < vertex_array_end; - vertex_reg += vertex_array_size) { - set_block(vertex_reg, 3, va_dirty_reg); - // The divisor concerns vertex array instances - dirty_pointers[static_cast<std::size_t>(vertex_reg) + 3] = vi_dirty_reg; - ++va_dirty_reg; - ++vi_dirty_reg; - } - constexpr u32 vertex_limit_start = MAXWELL3D_REG_INDEX(vertex_array_limit); - constexpr u32 vertex_limit_size = sizeof(regs.vertex_array_limit[0]) / sizeof(u32); - constexpr u32 vertex_limit_end = vertex_limit_start + vertex_limit_size * Regs::NumVertexArrays; - va_dirty_reg = DIRTY_REGS_POS(vertex_array); - for (u32 vertex_reg = vertex_limit_start; vertex_reg < vertex_limit_end; - vertex_reg += vertex_limit_size) { - set_block(vertex_reg, vertex_limit_size, va_dirty_reg); - va_dirty_reg++; - } - constexpr u32 vertex_instance_start = MAXWELL3D_REG_INDEX(instanced_arrays); - constexpr u32 vertex_instance_size = - sizeof(regs.instanced_arrays.is_instanced[0]) / sizeof(u32); - constexpr u32 vertex_instance_end = - vertex_instance_start + vertex_instance_size * Regs::NumVertexArrays; - vi_dirty_reg = DIRTY_REGS_POS(vertex_instance); - for (u32 vertex_reg = vertex_instance_start; vertex_reg < vertex_instance_end; - vertex_reg += vertex_instance_size) { - set_block(vertex_reg, vertex_instance_size, vi_dirty_reg); - vi_dirty_reg++; - } - set_block(MAXWELL3D_REG_INDEX(vertex_attrib_format), regs.vertex_attrib_format.size(), - DIRTY_REGS_POS(vertex_attrib_format)); - - // Init Shaders - constexpr u32 shader_registers_count = - sizeof(regs.shader_config[0]) * Regs::MaxShaderProgram / sizeof(u32); - 
set_block(MAXWELL3D_REG_INDEX(shader_config[0]), shader_registers_count, - DIRTY_REGS_POS(shaders)); - - // State - - // Viewport - constexpr u8 viewport_dirty_reg = DIRTY_REGS_POS(viewport); - constexpr u32 viewport_start = MAXWELL3D_REG_INDEX(viewports); - constexpr u32 viewport_size = sizeof(regs.viewports) / sizeof(u32); - set_block(viewport_start, viewport_size, viewport_dirty_reg); - constexpr u32 view_volume_start = MAXWELL3D_REG_INDEX(view_volume_clip_control); - constexpr u32 view_volume_size = sizeof(regs.view_volume_clip_control) / sizeof(u32); - set_block(view_volume_start, view_volume_size, viewport_dirty_reg); - - // Viewport transformation - constexpr u32 viewport_trans_start = MAXWELL3D_REG_INDEX(viewport_transform); - constexpr u32 viewport_trans_size = sizeof(regs.viewport_transform) / sizeof(u32); - set_block(viewport_trans_start, viewport_trans_size, DIRTY_REGS_POS(viewport_transform)); - - // Cullmode - constexpr u32 cull_mode_start = MAXWELL3D_REG_INDEX(cull); - constexpr u32 cull_mode_size = sizeof(regs.cull) / sizeof(u32); - set_block(cull_mode_start, cull_mode_size, DIRTY_REGS_POS(cull_mode)); - - // Screen y control - dirty_pointers[MAXWELL3D_REG_INDEX(screen_y_control)] = DIRTY_REGS_POS(screen_y_control); - - // Primitive Restart - constexpr u32 primitive_restart_start = MAXWELL3D_REG_INDEX(primitive_restart); - constexpr u32 primitive_restart_size = sizeof(regs.primitive_restart) / sizeof(u32); - set_block(primitive_restart_start, primitive_restart_size, DIRTY_REGS_POS(primitive_restart)); - - // Depth Test - constexpr u8 depth_test_dirty_reg = DIRTY_REGS_POS(depth_test); - dirty_pointers[MAXWELL3D_REG_INDEX(depth_test_enable)] = depth_test_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(depth_write_enabled)] = depth_test_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(depth_test_func)] = depth_test_dirty_reg; - - // Stencil Test - constexpr u32 stencil_test_dirty_reg = DIRTY_REGS_POS(stencil_test); - 
dirty_pointers[MAXWELL3D_REG_INDEX(stencil_enable)] = stencil_test_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_func_func)] = stencil_test_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_func_ref)] = stencil_test_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_func_mask)] = stencil_test_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_op_fail)] = stencil_test_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_op_zfail)] = stencil_test_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_op_zpass)] = stencil_test_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_mask)] = stencil_test_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(stencil_two_side_enable)] = stencil_test_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_func_func)] = stencil_test_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_func_ref)] = stencil_test_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_func_mask)] = stencil_test_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_op_fail)] = stencil_test_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_op_zfail)] = stencil_test_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_op_zpass)] = stencil_test_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_mask)] = stencil_test_dirty_reg; - - // Color Mask - constexpr u8 color_mask_dirty_reg = DIRTY_REGS_POS(color_mask); - dirty_pointers[MAXWELL3D_REG_INDEX(color_mask_common)] = color_mask_dirty_reg; - set_block(MAXWELL3D_REG_INDEX(color_mask), sizeof(regs.color_mask) / sizeof(u32), - color_mask_dirty_reg); - // Blend State - constexpr u8 blend_state_dirty_reg = DIRTY_REGS_POS(blend_state); - set_block(MAXWELL3D_REG_INDEX(blend_color), sizeof(regs.blend_color) / sizeof(u32), - blend_state_dirty_reg); - dirty_pointers[MAXWELL3D_REG_INDEX(independent_blend_enable)] = blend_state_dirty_reg; - set_block(MAXWELL3D_REG_INDEX(blend), 
sizeof(regs.blend) / sizeof(u32), blend_state_dirty_reg); - set_block(MAXWELL3D_REG_INDEX(independent_blend), sizeof(regs.independent_blend) / sizeof(u32), - blend_state_dirty_reg); - - // Scissor State - constexpr u8 scissor_test_dirty_reg = DIRTY_REGS_POS(scissor_test); - set_block(MAXWELL3D_REG_INDEX(scissor_test), sizeof(regs.scissor_test) / sizeof(u32), - scissor_test_dirty_reg); - - // Polygon Offset - constexpr u8 polygon_offset_dirty_reg = DIRTY_REGS_POS(polygon_offset); - dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_fill_enable)] = polygon_offset_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_line_enable)] = polygon_offset_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_point_enable)] = polygon_offset_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_units)] = polygon_offset_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_factor)] = polygon_offset_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_clamp)] = polygon_offset_dirty_reg; - - // Depth bounds - constexpr u8 depth_bounds_values_dirty_reg = DIRTY_REGS_POS(depth_bounds_values); - dirty_pointers[MAXWELL3D_REG_INDEX(depth_bounds[0])] = depth_bounds_values_dirty_reg; - dirty_pointers[MAXWELL3D_REG_INDEX(depth_bounds[1])] = depth_bounds_values_dirty_reg; -} - void Maxwell3D::CallMacroMethod(u32 method, std::size_t num_parameters, const u32* parameters) { // Reset the current macro. 
executing_macro = 0; @@ -317,31 +162,34 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { ASSERT_MSG(method < Regs::NUM_REGS, "Invalid Maxwell3D register, increase the size of the Regs structure"); - if (regs.reg_array[method] != method_call.argument) { - regs.reg_array[method] = method_call.argument; - const std::size_t dirty_reg = dirty_pointers[method]; - if (dirty_reg) { - dirty.regs[dirty_reg] = true; - if (dirty_reg >= DIRTY_REGS_POS(vertex_array) && - dirty_reg < DIRTY_REGS_POS(vertex_array_buffers)) { - dirty.vertex_array_buffers = true; - } else if (dirty_reg >= DIRTY_REGS_POS(vertex_instance) && - dirty_reg < DIRTY_REGS_POS(vertex_instances)) { - dirty.vertex_instances = true; - } else if (dirty_reg >= DIRTY_REGS_POS(render_target) && - dirty_reg < DIRTY_REGS_POS(render_settings)) { - dirty.render_settings = true; - } + u32 arg = method_call.argument; + // Keep track of the register value in shadow_state when requested. + if (shadow_state.shadow_ram_control == Regs::ShadowRamControl::Track || + shadow_state.shadow_ram_control == Regs::ShadowRamControl::TrackWithFilter) { + shadow_state.reg_array[method] = arg; + } else if (shadow_state.shadow_ram_control == Regs::ShadowRamControl::Replay) { + arg = shadow_state.reg_array[method]; + } + + if (regs.reg_array[method] != arg) { + regs.reg_array[method] = arg; + + for (const auto& table : dirty.tables) { + dirty.flags[table[method]] = true; } } switch (method) { + case MAXWELL3D_REG_INDEX(shadow_ram_control): { + shadow_state.shadow_ram_control = static_cast<Regs::ShadowRamControl>(method_call.argument); + break; + } case MAXWELL3D_REG_INDEX(macros.data): { - ProcessMacroUpload(method_call.argument); + ProcessMacroUpload(arg); break; } case MAXWELL3D_REG_INDEX(macros.bind): { - ProcessMacroBind(method_call.argument); + ProcessMacroBind(arg); break; } case MAXWELL3D_REG_INDEX(firmware[4]): { @@ -417,9 +265,9 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { } case 
MAXWELL3D_REG_INDEX(data_upload): { const bool is_last_call = method_call.IsLastCall(); - upload_state.ProcessData(method_call.argument, is_last_call); + upload_state.ProcessData(arg, is_last_call); if (is_last_call) { - dirty.OnMemoryWrite(); + OnMemoryWrite(); } break; } @@ -727,7 +575,7 @@ void Maxwell3D::FinishCBData() { const u32 id = cb_data_state.id; memory_manager.WriteBlock(address, cb_data_state.buffer[id].data(), size); - dirty.OnMemoryWrite(); + OnMemoryWrite(); cb_data_state.id = null_cb_data; cb_data_state.current = null_cb_data; @@ -805,7 +653,7 @@ SamplerDescriptor Maxwell3D::AccessBindlessSampler(ShaderType stage, u64 const_b const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)}; const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle); - SamplerDescriptor result = SamplerDescriptor::FromTicTexture(tex_info.tic.texture_type.Value()); + SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic); result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value()); return result; } diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 26939be3f..d24c9f657 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -6,6 +6,7 @@ #include <array> #include <bitset> +#include <limits> #include <optional> #include <type_traits> #include <unordered_map> @@ -66,6 +67,7 @@ public: static constexpr std::size_t NumVaryings = 31; static constexpr std::size_t NumImages = 8; // TODO(Rodrigo): Investigate this number static constexpr std::size_t NumClipDistances = 8; + static constexpr std::size_t NumTransformFeedbackBuffers = 4; static constexpr std::size_t MaxShaderProgram = 6; static constexpr std::size_t MaxShaderStage = 5; // Maximum number of const buffers per shader stage. 
@@ -431,21 +433,15 @@ public: GeneratedPrimitives = 0x1F, }; - struct Cull { - enum class FrontFace : u32 { - ClockWise = 0x0900, - CounterClockWise = 0x0901, - }; - - enum class CullFace : u32 { - Front = 0x0404, - Back = 0x0405, - FrontAndBack = 0x0408, - }; + enum class FrontFace : u32 { + ClockWise = 0x0900, + CounterClockWise = 0x0901, + }; - u32 enabled; - FrontFace front_face; - CullFace cull_face; + enum class CullFace : u32 { + Front = 0x0404, + Back = 0x0405, + FrontAndBack = 0x0408, }; struct Blend { @@ -529,6 +525,23 @@ public: FractionalEven = 2, }; + enum class PolygonMode : u32 { + Point = 0x1b00, + Line = 0x1b01, + Fill = 0x1b02, + }; + + enum class ShadowRamControl : u32 { + // write value to shadow ram + Track = 0, + // write value to shadow ram ( with validation ??? ) + TrackWithFilter = 1, + // only write to real hw register + Passthrough = 2, + // write value from shadow ram to real hw register + Replay = 3, + }; + struct RenderTargetConfig { u32 address_high; u32 address_low; @@ -542,7 +555,7 @@ public: BitField<12, 1, InvMemoryLayout> type; } memory_layout; union { - BitField<0, 16, u32> array_mode; + BitField<0, 16, u32> layers; BitField<16, 1, u32> volume; }; u32 layer_stride; @@ -574,7 +587,7 @@ public: f32 translate_z; INSERT_UNION_PADDING_WORDS(2); - Common::Rectangle<s32> GetRect() const { + Common::Rectangle<f32> GetRect() const { return { GetX(), // left GetY() + GetHeight(), // top @@ -583,20 +596,20 @@ public: }; }; - s32 GetX() const { - return static_cast<s32>(std::max(0.0f, translate_x - std::fabs(scale_x))); + f32 GetX() const { + return std::max(0.0f, translate_x - std::fabs(scale_x)); } - s32 GetY() const { - return static_cast<s32>(std::max(0.0f, translate_y - std::fabs(scale_y))); + f32 GetY() const { + return std::max(0.0f, translate_y - std::fabs(scale_y)); } - s32 GetWidth() const { - return static_cast<s32>(translate_x + std::fabs(scale_x)) - GetX(); + f32 GetWidth() const { + return translate_x + std::fabs(scale_x) - 
GetX(); } - s32 GetHeight() const { - return static_cast<s32>(translate_y + std::fabs(scale_y)) - GetY(); + f32 GetHeight() const { + return translate_y + std::fabs(scale_y) - GetY(); } }; @@ -626,6 +639,29 @@ public: float depth_range_far; }; + struct TransformFeedbackBinding { + u32 buffer_enable; + u32 address_high; + u32 address_low; + s32 buffer_size; + s32 buffer_offset; + INSERT_UNION_PADDING_WORDS(3); + + GPUVAddr Address() const { + return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | + address_low); + } + }; + static_assert(sizeof(TransformFeedbackBinding) == 32); + + struct TransformFeedbackLayout { + u32 stream; + u32 varying_count; + u32 stride; + INSERT_UNION_PADDING_WORDS(1); + }; + static_assert(sizeof(TransformFeedbackLayout) == 16); + bool IsShaderConfigEnabled(std::size_t index) const { // The VertexB is always enabled. if (index == static_cast<std::size_t>(Regs::ShaderProgram::VertexB)) { @@ -634,6 +670,10 @@ public: return shader_config[index].enable != 0; } + bool IsShaderConfigEnabled(Regs::ShaderProgram type) const { + return IsShaderConfigEnabled(static_cast<std::size_t>(type)); + } + union { struct { INSERT_UNION_PADDING_WORDS(0x45); @@ -645,7 +685,9 @@ public: u32 bind; } macros; - INSERT_UNION_PADDING_WORDS(0x17); + ShadowRamControl shadow_ram_control; + + INSERT_UNION_PADDING_WORDS(0x16); Upload::Registers upload; struct { @@ -682,7 +724,13 @@ public: u32 rasterize_enable; - INSERT_UNION_PADDING_WORDS(0xF1); + std::array<TransformFeedbackBinding, NumTransformFeedbackBuffers> tfb_bindings; + + INSERT_UNION_PADDING_WORDS(0xC0); + + std::array<TransformFeedbackLayout, NumTransformFeedbackBuffers> tfb_layouts; + + INSERT_UNION_PADDING_WORDS(0x1); u32 tfb_enabled; @@ -710,7 +758,12 @@ public: s32 clear_stencil; - INSERT_UNION_PADDING_WORDS(0x7); + INSERT_UNION_PADDING_WORDS(0x2); + + PolygonMode polygon_mode_front; + PolygonMode polygon_mode_back; + + INSERT_UNION_PADDING_WORDS(0x3); u32 polygon_offset_point_enable; u32 
polygon_offset_line_enable; @@ -769,7 +822,11 @@ public: BitField<12, 4, u32> viewport; } clear_flags; - INSERT_UNION_PADDING_WORDS(0x19); + INSERT_UNION_PADDING_WORDS(0x10); + + u32 fill_rectangle; + + INSERT_UNION_PADDING_WORDS(0x8); std::array<VertexAttribute, NumVertexAttributes> vertex_attrib_format; @@ -800,8 +857,12 @@ public: u32 zeta_width; u32 zeta_height; + union { + BitField<0, 16, u32> zeta_layers; + BitField<16, 1, u32> zeta_volume; + }; - INSERT_UNION_PADDING_WORDS(0x27); + INSERT_UNION_PADDING_WORDS(0x26); u32 depth_test_enable; @@ -868,16 +929,7 @@ public: INSERT_UNION_PADDING_WORDS(0x35); - union { - BitField<0, 1, u32> c0; - BitField<1, 1, u32> c1; - BitField<2, 1, u32> c2; - BitField<3, 1, u32> c3; - BitField<4, 1, u32> c4; - BitField<5, 1, u32> c5; - BitField<6, 1, u32> c6; - BitField<7, 1, u32> c7; - } clip_distance_enabled; + u32 clip_distance_enabled; u32 samplecnt_enable; @@ -1056,7 +1108,9 @@ public: INSERT_UNION_PADDING_WORDS(1); - Cull cull; + u32 cull_test_enabled; + FrontFace front_face; + CullFace cull_face; u32 pixel_center_integer; @@ -1195,7 +1249,11 @@ public: u32 tex_cb_index; - INSERT_UNION_PADDING_WORDS(0x395); + INSERT_UNION_PADDING_WORDS(0x7D); + + std::array<std::array<u8, 128>, NumTransformFeedbackBuffers> tfb_varying_locs; + + INSERT_UNION_PADDING_WORDS(0x298); struct { /// Compressed address of a buffer that holds information about bound SSBOs. 
@@ -1218,7 +1276,12 @@ public: }; std::array<u32, NUM_REGS> reg_array; }; - } regs{}; + }; + + Regs regs{}; + + /// Store temporary hw register values, used by some calls to restore state after a operation + Regs shadow_state; static_assert(sizeof(Regs) == Regs::NUM_REGS * sizeof(u32), "Maxwell3D Regs has wrong size"); static_assert(std::is_trivially_copyable_v<Regs>, "Maxwell3D Regs must be trivially copyable"); @@ -1234,79 +1297,6 @@ public: State state{}; - struct DirtyRegs { - static constexpr std::size_t NUM_REGS = 256; - static_assert(NUM_REGS - 1 <= std::numeric_limits<u8>::max()); - - union { - struct { - bool null_dirty; - - // Vertex Attributes - bool vertex_attrib_format; - - // Vertex Arrays - std::array<bool, 32> vertex_array; - - bool vertex_array_buffers; - - // Vertex Instances - std::array<bool, 32> vertex_instance; - - bool vertex_instances; - - // Render Targets - std::array<bool, 8> render_target; - bool depth_buffer; - - bool render_settings; - - // Shaders - bool shaders; - - // Rasterizer State - bool viewport; - bool clip_coefficient; - bool cull_mode; - bool primitive_restart; - bool depth_test; - bool stencil_test; - bool blend_state; - bool scissor_test; - bool transform_feedback; - bool color_mask; - bool polygon_offset; - bool depth_bounds_values; - - // Complementary - bool viewport_transform; - bool screen_y_control; - - bool memory_general; - }; - std::array<bool, NUM_REGS> regs; - }; - - void ResetVertexArrays() { - vertex_array.fill(true); - vertex_array_buffers = true; - } - - void ResetRenderTargets() { - depth_buffer = true; - render_target.fill(true); - render_settings = true; - } - - void OnMemoryWrite() { - shaders = true; - memory_general = true; - ResetRenderTargets(); - ResetVertexArrays(); - } - - } dirty{}; - /// Reads a register value located at the input method address u32 GetRegisterValue(u32 method) const; @@ -1352,6 +1342,11 @@ public: return execute_on; } + /// Notify a memory write has happened. 
+ void OnMemoryWrite() { + dirty.flags |= dirty.on_write_stores; + } + enum class MMEDrawMode : u32 { Undefined, Array, @@ -1367,6 +1362,16 @@ public: u32 gl_end_count{}; } mme_draw; + struct DirtyState { + using Flags = std::bitset<std::numeric_limits<u8>::max()>; + using Table = std::array<u8, Regs::NUM_REGS>; + using Tables = std::array<Table, 2>; + + Flags flags; + Flags on_write_stores; + Tables tables{}; + } dirty; + private: void InitializeRegisterDefaults(); @@ -1413,8 +1418,6 @@ private: /// Retrieves information about a specific TSC entry from the TSC buffer. Texture::TSCEntry GetTSCEntry(u32 tsc_index) const; - void InitDirtySettings(); - /** * Call a macro on this engine. * @param method Method to call @@ -1473,6 +1476,7 @@ private: "Field " #field_name " has invalid position") ASSERT_REG_POSITION(macros, 0x45); +ASSERT_REG_POSITION(shadow_ram_control, 0x49); ASSERT_REG_POSITION(upload, 0x60); ASSERT_REG_POSITION(exec_upload, 0x6C); ASSERT_REG_POSITION(data_upload, 0x6D); @@ -1481,6 +1485,8 @@ ASSERT_REG_POSITION(tess_mode, 0xC8); ASSERT_REG_POSITION(tess_level_outer, 0xC9); ASSERT_REG_POSITION(tess_level_inner, 0xCD); ASSERT_REG_POSITION(rasterize_enable, 0xDF); +ASSERT_REG_POSITION(tfb_bindings, 0xE0); +ASSERT_REG_POSITION(tfb_layouts, 0x1C0); ASSERT_REG_POSITION(tfb_enabled, 0x1D1); ASSERT_REG_POSITION(rt, 0x200); ASSERT_REG_POSITION(viewport_transform, 0x280); @@ -1490,6 +1496,8 @@ ASSERT_REG_POSITION(depth_mode, 0x35F); ASSERT_REG_POSITION(clear_color[0], 0x360); ASSERT_REG_POSITION(clear_depth, 0x364); ASSERT_REG_POSITION(clear_stencil, 0x368); +ASSERT_REG_POSITION(polygon_mode_front, 0x36B); +ASSERT_REG_POSITION(polygon_mode_back, 0x36C); ASSERT_REG_POSITION(polygon_offset_point_enable, 0x370); ASSERT_REG_POSITION(polygon_offset_line_enable, 0x371); ASSERT_REG_POSITION(polygon_offset_fill_enable, 0x372); @@ -1503,10 +1511,12 @@ ASSERT_REG_POSITION(rt_separate_frag_data, 0x3EB); ASSERT_REG_POSITION(depth_bounds, 0x3E7); ASSERT_REG_POSITION(zeta, 
0x3F8); ASSERT_REG_POSITION(clear_flags, 0x43E); +ASSERT_REG_POSITION(fill_rectangle, 0x44F); ASSERT_REG_POSITION(vertex_attrib_format, 0x458); ASSERT_REG_POSITION(rt_control, 0x487); ASSERT_REG_POSITION(zeta_width, 0x48a); ASSERT_REG_POSITION(zeta_height, 0x48b); +ASSERT_REG_POSITION(zeta_layers, 0x48c); ASSERT_REG_POSITION(depth_test_enable, 0x4B3); ASSERT_REG_POSITION(independent_blend_enable, 0x4B9); ASSERT_REG_POSITION(depth_write_enabled, 0x4BA); @@ -1556,7 +1566,9 @@ ASSERT_REG_POSITION(index_array, 0x5F2); ASSERT_REG_POSITION(polygon_offset_clamp, 0x61F); ASSERT_REG_POSITION(instanced_arrays, 0x620); ASSERT_REG_POSITION(vp_point_size, 0x644); -ASSERT_REG_POSITION(cull, 0x646); +ASSERT_REG_POSITION(cull_test_enabled, 0x646); +ASSERT_REG_POSITION(front_face, 0x647); +ASSERT_REG_POSITION(cull_face, 0x648); ASSERT_REG_POSITION(pixel_center_integer, 0x649); ASSERT_REG_POSITION(viewport_transform_enabled, 0x64B); ASSERT_REG_POSITION(view_volume_clip_control, 0x64F); @@ -1573,6 +1585,7 @@ ASSERT_REG_POSITION(firmware, 0x8C0); ASSERT_REG_POSITION(const_buffer, 0x8E0); ASSERT_REG_POSITION(cb_bind[0], 0x904); ASSERT_REG_POSITION(tex_cb_index, 0x982); +ASSERT_REG_POSITION(tfb_varying_locs, 0xA00); ASSERT_REG_POSITION(ssbo_info, 0xD18); ASSERT_REG_POSITION(tex_info_buffers.address[0], 0xD2A); ASSERT_REG_POSITION(tex_info_buffers.size[0], 0xD2F); diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index ad8453c5f..c2610f992 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -57,7 +57,7 @@ void MaxwellDMA::HandleCopy() { } // All copies here update the main memory, so mark all rasterizer states as invalid. 
- system.GPU().Maxwell3D().dirty.OnMemoryWrite(); + system.GPU().Maxwell3D().OnMemoryWrite(); if (regs.exec.is_dst_linear && regs.exec.is_src_linear) { // When the enable_2d bit is disabled, the copy is performed as if we were copying a 1D diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index c9bc83cd7..49dc5abe0 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -82,6 +82,10 @@ union Attribute { Position = 7, Attribute_0 = 8, Attribute_31 = 39, + FrontColor = 40, + FrontSecondaryColor = 41, + BackColor = 42, + BackSecondaryColor = 43, ClipDistances0123 = 44, ClipDistances4567 = 45, PointCoord = 46, @@ -89,6 +93,8 @@ union Attribute { // shader, and a tuple of (TessCoord.x, TessCoord.y, TessCoord.z, ~) when inside a Tess Eval // shader. TessCoordInstanceIDVertexID = 47, + TexCoord_0 = 48, + TexCoord_7 = 55, // This attribute contains a tuple of (Unk, Unk, Unk, gl_FrontFacing) when inside a fragment // shader. It is unknown what the other values contain. 
FrontFacing = 63, @@ -911,14 +917,9 @@ union Instruction { } fadd32i; union { - BitField<20, 8, u64> shift_position; - BitField<28, 8, u64> shift_length; - BitField<48, 1, u64> negate_b; - BitField<49, 1, u64> negate_a; - - u64 GetLeftShiftValue() const { - return 32 - (shift_position + shift_length); - } + BitField<40, 1, u64> brev; + BitField<47, 1, u64> rd_cc; + BitField<48, 1, u64> is_signed; } bfe; union { diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 7d7137109..e8f763ce9 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -140,71 +140,6 @@ void GPU::FlushCommands() { renderer.Rasterizer().FlushCommands(); } -u32 RenderTargetBytesPerPixel(RenderTargetFormat format) { - ASSERT(format != RenderTargetFormat::NONE); - - switch (format) { - case RenderTargetFormat::RGBA32_FLOAT: - case RenderTargetFormat::RGBA32_UINT: - return 16; - case RenderTargetFormat::RGBA16_UINT: - case RenderTargetFormat::RGBA16_UNORM: - case RenderTargetFormat::RGBA16_FLOAT: - case RenderTargetFormat::RGBX16_FLOAT: - case RenderTargetFormat::RG32_FLOAT: - case RenderTargetFormat::RG32_UINT: - return 8; - case RenderTargetFormat::RGBA8_UNORM: - case RenderTargetFormat::RGBA8_SNORM: - case RenderTargetFormat::RGBA8_SRGB: - case RenderTargetFormat::RGBA8_UINT: - case RenderTargetFormat::RGB10_A2_UNORM: - case RenderTargetFormat::BGRA8_UNORM: - case RenderTargetFormat::BGRA8_SRGB: - case RenderTargetFormat::RG16_UNORM: - case RenderTargetFormat::RG16_SNORM: - case RenderTargetFormat::RG16_UINT: - case RenderTargetFormat::RG16_SINT: - case RenderTargetFormat::RG16_FLOAT: - case RenderTargetFormat::R32_FLOAT: - case RenderTargetFormat::R11G11B10_FLOAT: - case RenderTargetFormat::R32_UINT: - return 4; - case RenderTargetFormat::R16_UNORM: - case RenderTargetFormat::R16_SNORM: - case RenderTargetFormat::R16_UINT: - case RenderTargetFormat::R16_SINT: - case RenderTargetFormat::R16_FLOAT: - case RenderTargetFormat::RG8_UNORM: - case RenderTargetFormat::RG8_SNORM: 
- return 2; - case RenderTargetFormat::R8_UNORM: - case RenderTargetFormat::R8_UINT: - return 1; - default: - UNIMPLEMENTED_MSG("Unimplemented render target format {}", static_cast<u32>(format)); - return 1; - } -} - -u32 DepthFormatBytesPerPixel(DepthFormat format) { - switch (format) { - case DepthFormat::Z32_S8_X24_FLOAT: - return 8; - case DepthFormat::Z32_FLOAT: - case DepthFormat::S8_Z24_UNORM: - case DepthFormat::Z24_X8_UNORM: - case DepthFormat::Z24_S8_UNORM: - case DepthFormat::Z24_C8_UNORM: - return 4; - case DepthFormat::Z16_UNORM: - return 2; - default: - UNIMPLEMENTED_MSG("Unimplemented Depth format {}", static_cast<u32>(format)); - return 1; - } -} - // Note that, traditionally, methods are treated as 4-byte addressable locations, and hence // their numbers are written down multiplied by 4 in Docs. Here we are not multiply by 4. // So the values you see in docs might be multiplied by 4. diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index 07727210c..64acb17df 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -39,6 +39,7 @@ enum class RenderTargetFormat : u32 { RGBA32_FLOAT = 0xC0, RGBA32_UINT = 0xC2, RGBA16_UNORM = 0xC6, + RGBA16_SNORM = 0xC7, RGBA16_UINT = 0xC9, RGBA16_FLOAT = 0xCA, RG32_FLOAT = 0xCB, @@ -57,6 +58,7 @@ enum class RenderTargetFormat : u32 { RG16_UINT = 0xDD, RG16_FLOAT = 0xDE, R11G11B10_FLOAT = 0xE0, + R32_SINT = 0xE3, R32_UINT = 0xE4, R32_FLOAT = 0xE5, B5G6R5_UNORM = 0xE8, @@ -82,12 +84,6 @@ enum class DepthFormat : u32 { Z32_S8_X24_FLOAT = 0x19, }; -/// Returns the number of bytes per pixel of each rendertarget format. -u32 RenderTargetBytesPerPixel(RenderTargetFormat format); - -/// Returns the number of bytes per pixel of each depth format. 
-u32 DepthFormatBytesPerPixel(DepthFormat format); - struct CommandListHeader; class DebugContext; diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp index 2cdf1aa7f..b1088af3d 100644 --- a/src/video_core/gpu_thread.cpp +++ b/src/video_core/gpu_thread.cpp @@ -5,7 +5,7 @@ #include "common/assert.h" #include "common/microprofile.h" #include "core/core.h" -#include "core/frontend/scope_acquire_window_context.h" +#include "core/frontend/scope_acquire_context.h" #include "video_core/dma_pusher.h" #include "video_core/gpu.h" #include "video_core/gpu_thread.h" @@ -27,7 +27,7 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p return; } - Core::Frontend::ScopeAcquireWindowContext acquire_context{renderer.GetRenderWindow()}; + Core::Frontend::ScopeAcquireContext acquire_context{renderer.GetRenderWindow()}; CommandDataContainer next; while (state.is_running) { diff --git a/src/video_core/guest_driver.cpp b/src/video_core/guest_driver.cpp index 6adef459e..f058f2744 100644 --- a/src/video_core/guest_driver.cpp +++ b/src/video_core/guest_driver.cpp @@ -4,13 +4,15 @@ #include <algorithm> #include <limits> +#include <vector> +#include "common/common_types.h" #include "video_core/guest_driver.h" namespace VideoCore { -void GuestDriverProfile::DeduceTextureHandlerSize(std::vector<u32>&& bound_offsets) { - if (texture_handler_size_deduced) { +void GuestDriverProfile::DeduceTextureHandlerSize(std::vector<u32> bound_offsets) { + if (texture_handler_size) { return; } const std::size_t size = bound_offsets.size(); @@ -29,7 +31,6 @@ void GuestDriverProfile::DeduceTextureHandlerSize(std::vector<u32>&& bound_offse if (min_val > 2) { return; } - texture_handler_size_deduced = true; texture_handler_size = min_texture_handler_size * min_val; } diff --git a/src/video_core/guest_driver.h b/src/video_core/guest_driver.h index fc1917347..99450777e 100644 --- a/src/video_core/guest_driver.h +++ b/src/video_core/guest_driver.h @@ -4,6 +4,7 @@ 
#pragma once +#include <optional> #include <vector> #include "common/common_types.h" @@ -17,25 +18,29 @@ namespace VideoCore { */ class GuestDriverProfile { public: - void DeduceTextureHandlerSize(std::vector<u32>&& bound_offsets); + explicit GuestDriverProfile() = default; + explicit GuestDriverProfile(std::optional<u32> texture_handler_size) + : texture_handler_size{texture_handler_size} {} + + void DeduceTextureHandlerSize(std::vector<u32> bound_offsets); u32 GetTextureHandlerSize() const { - return texture_handler_size; + return texture_handler_size.value_or(default_texture_handler_size); } - bool TextureHandlerSizeKnown() const { - return texture_handler_size_deduced; + bool IsTextureHandlerSizeKnown() const { + return texture_handler_size.has_value(); } private: // Minimum size of texture handler any driver can use. static constexpr u32 min_texture_handler_size = 4; - // This goes with Vulkan and OpenGL standards but Nvidia GPUs can easily - // use 4 bytes instead. Thus, certain drivers may squish the size. + + // This goes with Vulkan and OpenGL standards but Nvidia GPUs can easily use 4 bytes instead. + // Thus, certain drivers may squish the size. static constexpr u32 default_texture_handler_size = 8; - u32 texture_handler_size = default_texture_handler_size; - bool texture_handler_size_deduced = false; + std::optional<u32> texture_handler_size = default_texture_handler_size; }; } // namespace VideoCore diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index aea010087..073bdb491 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -174,7 +174,7 @@ private: /// End of address space, based on address space in bits. 
static constexpr GPUVAddr address_space_end{1ULL << address_space_width}; - Common::PageTable page_table{page_bits}; + Common::BackingPageTable page_table{page_bits}; VMAMap vma_map; VideoCore::RasterizerInterface& rasterizer; diff --git a/src/video_core/morton.cpp b/src/video_core/morton.cpp index 2f2fe6859..6d522c318 100644 --- a/src/video_core/morton.cpp +++ b/src/video_core/morton.cpp @@ -51,6 +51,7 @@ static constexpr ConversionArray morton_to_linear_fns = { MortonCopy<true, PixelFormat::R8UI>, MortonCopy<true, PixelFormat::RGBA16F>, MortonCopy<true, PixelFormat::RGBA16U>, + MortonCopy<true, PixelFormat::RGBA16S>, MortonCopy<true, PixelFormat::RGBA16UI>, MortonCopy<true, PixelFormat::R11FG11FB10F>, MortonCopy<true, PixelFormat::RGBA32UI>, @@ -85,6 +86,7 @@ static constexpr ConversionArray morton_to_linear_fns = { MortonCopy<true, PixelFormat::RG32UI>, MortonCopy<true, PixelFormat::RGBX16F>, MortonCopy<true, PixelFormat::R32UI>, + MortonCopy<true, PixelFormat::R32I>, MortonCopy<true, PixelFormat::ASTC_2D_8X8>, MortonCopy<true, PixelFormat::ASTC_2D_8X5>, MortonCopy<true, PixelFormat::ASTC_2D_5X4>, @@ -130,6 +132,7 @@ static constexpr ConversionArray linear_to_morton_fns = { MortonCopy<false, PixelFormat::R8U>, MortonCopy<false, PixelFormat::R8UI>, MortonCopy<false, PixelFormat::RGBA16F>, + MortonCopy<false, PixelFormat::RGBA16S>, MortonCopy<false, PixelFormat::RGBA16U>, MortonCopy<false, PixelFormat::RGBA16UI>, MortonCopy<false, PixelFormat::R11FG11FB10F>, @@ -166,6 +169,7 @@ static constexpr ConversionArray linear_to_morton_fns = { MortonCopy<false, PixelFormat::RG32UI>, MortonCopy<false, PixelFormat::RGBX16F>, MortonCopy<false, PixelFormat::R32UI>, + MortonCopy<false, PixelFormat::R32I>, nullptr, nullptr, nullptr, diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index f18eaf4bc..1a68e3caa 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -25,7 +25,6 @@ constexpr 
std::size_t NumQueryTypes = 1; enum class LoadCallbackStage { Prepare, - Decompile, Build, Complete, }; @@ -89,6 +88,9 @@ public: virtual void LoadDiskResources(const std::atomic_bool& stop_loading = false, const DiskResourceLoadCallback& callback = {}) {} + /// Initializes renderer dirty flags + virtual void SetupDirtyFlags() {} + /// Grant access to the Guest Driver Profile for recording/obtaining info on the guest driver. GuestDriverProfile& AccessGuestDriverProfile() { return guest_driver_profile; diff --git a/src/video_core/renderer_base.h b/src/video_core/renderer_base.h index af1bebc4f..5ec99a126 100644 --- a/src/video_core/renderer_base.h +++ b/src/video_core/renderer_base.h @@ -35,15 +35,19 @@ public: explicit RendererBase(Core::Frontend::EmuWindow& window); virtual ~RendererBase(); - /// Swap buffers (render frame) - virtual void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) = 0; - /// Initialize the renderer virtual bool Init() = 0; /// Shutdown the renderer virtual void ShutDown() = 0; + /// Finalize rendering the guest frame and draw into the presentation texture + virtual void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) = 0; + + /// Draws the latest frame to the window waiting timeout_ms for a frame to arrive (Renderer + /// specific implementation) + virtual void TryPresent(int timeout_ms) = 0; + // Getter/setter functions: // ------------------------ diff --git a/src/video_core/renderer_opengl/gl_framebuffer_cache.cpp b/src/video_core/renderer_opengl/gl_framebuffer_cache.cpp index 874ed3c6e..b8a512cb6 100644 --- a/src/video_core/renderer_opengl/gl_framebuffer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_framebuffer_cache.cpp @@ -11,7 +11,6 @@ #include "common/common_types.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_opengl/gl_framebuffer_cache.h" -#include "video_core/renderer_opengl/gl_state.h" namespace OpenGL { @@ -36,8 +35,7 @@ OGLFramebuffer 
FramebufferCacheOpenGL::CreateFramebuffer(const FramebufferCacheK framebuffer.Create(); // TODO(Rodrigo): Use DSA here after Nvidia fixes their framebuffer DSA bugs. - local_state.draw.draw_framebuffer = framebuffer.handle; - local_state.ApplyFramebufferState(); + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer.handle); if (key.zeta) { const bool stencil = key.zeta->GetSurfaceParams().type == SurfaceType::DepthStencil; diff --git a/src/video_core/renderer_opengl/gl_framebuffer_cache.h b/src/video_core/renderer_opengl/gl_framebuffer_cache.h index 02ec80ae9..8f698fee0 100644 --- a/src/video_core/renderer_opengl/gl_framebuffer_cache.h +++ b/src/video_core/renderer_opengl/gl_framebuffer_cache.h @@ -13,7 +13,6 @@ #include "common/common_types.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_opengl/gl_resource_manager.h" -#include "video_core/renderer_opengl/gl_state.h" #include "video_core/renderer_opengl/gl_texture_cache.h" namespace OpenGL { @@ -63,7 +62,6 @@ public: private: OGLFramebuffer CreateFramebuffer(const FramebufferCacheKey& key); - OpenGLState local_state; std::unordered_map<FramebufferCacheKey, OGLFramebuffer> cache; }; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index e1965fb21..826eee7df 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -28,7 +28,6 @@ #include "video_core/renderer_opengl/gl_query_cache.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_shader_cache.h" -#include "video_core/renderer_opengl/gl_shader_gen.h" #include "video_core/renderer_opengl/maxwell_to_gl.h" #include "video_core/renderer_opengl/renderer_opengl.h" @@ -36,6 +35,7 @@ namespace OpenGL { using Maxwell = Tegra::Engines::Maxwell3D::Regs; +using Tegra::Engines::ShaderType; using VideoCore::Surface::PixelFormat; using VideoCore::Surface::SurfaceTarget; using 
VideoCore::Surface::SurfaceType; @@ -54,10 +54,11 @@ MICROPROFILE_DEFINE(OpenGL_PrimitiveAssembly, "OpenGL", "Prim Asmbl", MP_RGB(255 namespace { +constexpr std::size_t NumSupportedVertexAttributes = 16; + template <typename Engine, typename Entry> Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry, - Tegra::Engines::ShaderType shader_type, - std::size_t index = 0) { + ShaderType shader_type, std::size_t index = 0) { if (entry.IsBindless()) { const Tegra::Texture::TextureHandle tex_handle = engine.AccessConstBuffer32(shader_type, entry.GetBuffer(), entry.GetOffset()); @@ -74,7 +75,7 @@ Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry } std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer, - const GLShader::ConstBufferEntry& entry) { + const ConstBufferEntry& entry) { if (!entry.IsIndirect()) { return entry.GetSize(); } @@ -88,18 +89,19 @@ std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer, return buffer.size; } +void oglEnable(GLenum cap, bool state) { + (state ? 
glEnable : glDisable)(cap); +} + } // Anonymous namespace RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, - ScreenInfo& info) - : RasterizerAccelerated{system.Memory()}, texture_cache{system, *this, device}, + ScreenInfo& info, GLShader::ProgramManager& program_manager, + StateTracker& state_tracker) + : RasterizerAccelerated{system.Memory()}, texture_cache{system, *this, device, state_tracker}, shader_cache{*this, system, emu_window, device}, query_cache{system, *this}, system{system}, - screen_info{info}, buffer_cache{*this, system, device, STREAM_BUFFER_SIZE} { - shader_program_manager = std::make_unique<GLShader::ProgramManager>(); - state.draw.shader_program = 0; - state.Apply(); - - LOG_DEBUG(Render_OpenGL, "Sync fixed function OpenGL state here"); + screen_info{info}, program_manager{program_manager}, state_tracker{state_tracker}, + buffer_cache{*this, system, device, STREAM_BUFFER_SIZE} { CheckExtensions(); } @@ -113,93 +115,72 @@ void RasterizerOpenGL::CheckExtensions() { } } -GLuint RasterizerOpenGL::SetupVertexFormat() { +void RasterizerOpenGL::SetupVertexFormat() { auto& gpu = system.GPU().Maxwell3D(); - const auto& regs = gpu.regs; - - if (!gpu.dirty.vertex_attrib_format) { - return state.draw.vertex_array; + auto& flags = gpu.dirty.flags; + if (!flags[Dirty::VertexFormats]) { + return; } - gpu.dirty.vertex_attrib_format = false; + flags[Dirty::VertexFormats] = false; MICROPROFILE_SCOPE(OpenGL_VAO); - auto [iter, is_cache_miss] = vertex_array_cache.try_emplace(regs.vertex_attrib_format); - auto& vao_entry = iter->second; - - if (is_cache_miss) { - vao_entry.Create(); - const GLuint vao = vao_entry.handle; - - // Eventhough we are using DSA to create this vertex array, there is a bug on Intel's blob - // that fails to properly create the vertex array if it's not bound even after creating it - // with glCreateVertexArrays - state.draw.vertex_array = vao; - state.ApplyVertexArrayState(); - - // Use the 
vertex array as-is, assumes that the data is formatted correctly for OpenGL. - // Enables the first 16 vertex attributes always, as we don't know which ones are actually - // used until shader time. Note, Tegra technically supports 32, but we're capping this to 16 - // for now to avoid OpenGL errors. - // TODO(Subv): Analyze the shader to identify which attributes are actually used and don't - // assume every shader uses them all. - for (u32 index = 0; index < 16; ++index) { - const auto& attrib = regs.vertex_attrib_format[index]; - - // Ignore invalid attributes. - if (!attrib.IsValid()) - continue; - - const auto& buffer = regs.vertex_array[attrib.buffer]; - LOG_TRACE(Render_OpenGL, - "vertex attrib {}, count={}, size={}, type={}, offset={}, normalize={}", - index, attrib.ComponentCount(), attrib.SizeString(), attrib.TypeString(), - attrib.offset.Value(), attrib.IsNormalized()); - - ASSERT(buffer.IsEnabled()); - - glEnableVertexArrayAttrib(vao, index); - if (attrib.type == Tegra::Engines::Maxwell3D::Regs::VertexAttribute::Type::SignedInt || - attrib.type == - Tegra::Engines::Maxwell3D::Regs::VertexAttribute::Type::UnsignedInt) { - glVertexArrayAttribIFormat(vao, index, attrib.ComponentCount(), - MaxwellToGL::VertexType(attrib), attrib.offset); - } else { - glVertexArrayAttribFormat( - vao, index, attrib.ComponentCount(), MaxwellToGL::VertexType(attrib), - attrib.IsNormalized() ? GL_TRUE : GL_FALSE, attrib.offset); - } - glVertexArrayAttribBinding(vao, index, attrib.buffer); + // Use the vertex array as-is, assumes that the data is formatted correctly for OpenGL. Enables + // the first 16 vertex attributes always, as we don't know which ones are actually used until + // shader time. Note, Tegra technically supports 32, but we're capping this to 16 for now to + // avoid OpenGL errors. + // TODO(Subv): Analyze the shader to identify which attributes are actually used and don't + // assume every shader uses them all. 
+ for (std::size_t index = 0; index < NumSupportedVertexAttributes; ++index) { + if (!flags[Dirty::VertexFormat0 + index]) { + continue; } - } + flags[Dirty::VertexFormat0 + index] = false; + + const auto attrib = gpu.regs.vertex_attrib_format[index]; + const auto gl_index = static_cast<GLuint>(index); - // Rebinding the VAO invalidates the vertex buffer bindings. - gpu.dirty.ResetVertexArrays(); + // Ignore invalid attributes. + if (!attrib.IsValid()) { + glDisableVertexAttribArray(gl_index); + continue; + } + glEnableVertexAttribArray(gl_index); - state.draw.vertex_array = vao_entry.handle; - return vao_entry.handle; + if (attrib.type == Maxwell::VertexAttribute::Type::SignedInt || + attrib.type == Maxwell::VertexAttribute::Type::UnsignedInt) { + glVertexAttribIFormat(gl_index, attrib.ComponentCount(), + MaxwellToGL::VertexType(attrib), attrib.offset); + } else { + glVertexAttribFormat(gl_index, attrib.ComponentCount(), MaxwellToGL::VertexType(attrib), + attrib.IsNormalized() ? GL_TRUE : GL_FALSE, attrib.offset); + } + glVertexAttribBinding(gl_index, attrib.buffer); + } } -void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) { +void RasterizerOpenGL::SetupVertexBuffer() { auto& gpu = system.GPU().Maxwell3D(); - if (!gpu.dirty.vertex_array_buffers) + auto& flags = gpu.dirty.flags; + if (!flags[Dirty::VertexBuffers]) { return; - gpu.dirty.vertex_array_buffers = false; - - const auto& regs = gpu.regs; + } + flags[Dirty::VertexBuffers] = false; MICROPROFILE_SCOPE(OpenGL_VB); // Upload all guest vertex arrays sequentially to our buffer - for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) { - if (!gpu.dirty.vertex_array[index]) + const auto& regs = gpu.regs; + for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) { + if (!flags[Dirty::VertexBuffer0 + index]) { continue; - gpu.dirty.vertex_array[index] = false; - gpu.dirty.vertex_instance[index] = false; + } + flags[Dirty::VertexBuffer0 + index] = false; const auto& vertex_array = 
regs.vertex_array[index]; - if (!vertex_array.IsEnabled()) + if (!vertex_array.IsEnabled()) { continue; + } const GPUVAddr start = vertex_array.StartAddress(); const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress(); @@ -209,42 +190,30 @@ void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) { const auto [vertex_buffer, vertex_buffer_offset] = buffer_cache.UploadMemory(start, size); // Bind the vertex array to the buffer at the current offset. - vertex_array_pushbuffer.SetVertexBuffer(index, vertex_buffer, vertex_buffer_offset, - vertex_array.stride); - - if (regs.instanced_arrays.IsInstancingEnabled(index) && vertex_array.divisor != 0) { - // Enable vertex buffer instancing with the specified divisor. - glVertexArrayBindingDivisor(vao, index, vertex_array.divisor); - } else { - // Disable the vertex buffer instancing. - glVertexArrayBindingDivisor(vao, index, 0); - } + vertex_array_pushbuffer.SetVertexBuffer(static_cast<GLuint>(index), vertex_buffer, + vertex_buffer_offset, vertex_array.stride); } } -void RasterizerOpenGL::SetupVertexInstances(GLuint vao) { +void RasterizerOpenGL::SetupVertexInstances() { auto& gpu = system.GPU().Maxwell3D(); - - if (!gpu.dirty.vertex_instances) + auto& flags = gpu.dirty.flags; + if (!flags[Dirty::VertexInstances]) { return; - gpu.dirty.vertex_instances = false; + } + flags[Dirty::VertexInstances] = false; const auto& regs = gpu.regs; - // Upload all guest vertex arrays sequentially to our buffer - for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) { - if (!gpu.dirty.vertex_instance[index]) + for (std::size_t index = 0; index < NumSupportedVertexAttributes; ++index) { + if (!flags[Dirty::VertexInstance0 + index]) { continue; - - gpu.dirty.vertex_instance[index] = false; - - if (regs.instanced_arrays.IsInstancingEnabled(index) && - regs.vertex_array[index].divisor != 0) { - // Enable vertex buffer instancing with the specified divisor. 
- glVertexArrayBindingDivisor(vao, index, regs.vertex_array[index].divisor); - } else { - // Disable the vertex buffer instancing. - glVertexArrayBindingDivisor(vao, index, 0); } + flags[Dirty::VertexInstance0 + index] = false; + + const auto gl_index = static_cast<GLuint>(index); + const bool instancing_enabled = regs.instanced_arrays.IsInstancingEnabled(gl_index); + const GLuint divisor = instancing_enabled ? regs.vertex_array[index].divisor : 0; + glVertexBindingDivisor(gl_index, divisor); } } @@ -260,8 +229,7 @@ GLintptr RasterizerOpenGL::SetupIndexBuffer() { void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { MICROPROFILE_SCOPE(OpenGL_Shader); auto& gpu = system.GPU().Maxwell3D(); - - std::array<bool, Maxwell::NumClipDistances> clip_distances{}; + u32 clip_distances = 0; for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) { const auto& shader_config = gpu.regs.shader_config[index]; @@ -271,10 +239,10 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { if (!gpu.regs.IsShaderConfigEnabled(index)) { switch (program) { case Maxwell::ShaderProgram::Geometry: - shader_program_manager->UseTrivialGeometryShader(); + program_manager.UseGeometryShader(0); break; case Maxwell::ShaderProgram::Fragment: - shader_program_manager->UseTrivialFragmentShader(); + program_manager.UseFragmentShader(0); break; default: break; @@ -299,19 +267,17 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { SetupDrawTextures(stage, shader); SetupDrawImages(stage, shader); - const ProgramVariant variant(primitive_mode); - const auto program_handle = shader->GetHandle(variant); - + const GLuint program_handle = shader->GetHandle(); switch (program) { case Maxwell::ShaderProgram::VertexA: case Maxwell::ShaderProgram::VertexB: - shader_program_manager->UseProgrammableVertexShader(program_handle); + program_manager.UseVertexShader(program_handle); break; case Maxwell::ShaderProgram::Geometry: - 
shader_program_manager->UseProgrammableGeometryShader(program_handle); + program_manager.UseGeometryShader(program_handle); break; case Maxwell::ShaderProgram::Fragment: - shader_program_manager->UseProgrammableFragmentShader(program_handle); + program_manager.UseFragmentShader(program_handle); break; default: UNIMPLEMENTED_MSG("Unimplemented shader index={}, enable={}, offset=0x{:08X}", index, @@ -322,9 +288,7 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { // When a clip distance is enabled but not set in the shader it crops parts of the screen // (sometimes it's half the screen, sometimes three quarters). To avoid this, enable the // clip distances only when it's written by a shader stage. - for (std::size_t i = 0; i < Maxwell::NumClipDistances; ++i) { - clip_distances[i] = clip_distances[i] || shader->GetShaderEntries().clip_distances[i]; - } + clip_distances |= shader->GetEntries().clip_distances; // When VertexA is enabled, we have dual vertex shaders if (program == Maxwell::ShaderProgram::VertexA) { @@ -334,8 +298,7 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { } SyncClipEnabled(clip_distances); - - gpu.dirty.shaders = false; + gpu.dirty.flags[Dirty::Shaders] = false; } std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const { @@ -368,20 +331,23 @@ void RasterizerOpenGL::LoadDiskResources(const std::atomic_bool& stop_loading, shader_cache.LoadDiskCache(stop_loading, callback); } +void RasterizerOpenGL::SetupDirtyFlags() { + state_tracker.Initialize(); +} + void RasterizerOpenGL::ConfigureFramebuffers() { MICROPROFILE_SCOPE(OpenGL_Framebuffer); auto& gpu = system.GPU().Maxwell3D(); - if (!gpu.dirty.render_settings) { + if (!gpu.dirty.flags[VideoCommon::Dirty::RenderTargets]) { return; } - gpu.dirty.render_settings = false; + gpu.dirty.flags[VideoCommon::Dirty::RenderTargets] = false; texture_cache.GuardRenderTargets(true); View depth_surface = texture_cache.GetDepthBufferSurface(true); const auto& regs = gpu.regs; - 
state.framebuffer_srgb.enabled = regs.framebuffer_srgb != 0; UNIMPLEMENTED_IF(regs.rt_separate_frag_data == 0); // Bind the framebuffer surfaces @@ -409,14 +375,11 @@ void RasterizerOpenGL::ConfigureFramebuffers() { texture_cache.GuardRenderTargets(false); - state.draw.draw_framebuffer = framebuffer_cache.GetFramebuffer(key); - SyncViewport(state); + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer_cache.GetFramebuffer(key)); } -void RasterizerOpenGL::ConfigureClearFramebuffer(OpenGLState& current_state, bool using_color_fb, - bool using_depth_fb, bool using_stencil_fb) { - using VideoCore::Surface::SurfaceType; - +void RasterizerOpenGL::ConfigureClearFramebuffer(bool using_color_fb, bool using_depth_fb, + bool using_stencil_fb) { auto& gpu = system.GPU().Maxwell3D(); const auto& regs = gpu.regs; @@ -435,80 +398,44 @@ void RasterizerOpenGL::ConfigureClearFramebuffer(OpenGLState& current_state, boo key.colors[0] = color_surface; key.zeta = depth_surface; - current_state.draw.draw_framebuffer = framebuffer_cache.GetFramebuffer(key); - current_state.ApplyFramebufferState(); + state_tracker.NotifyFramebuffer(); + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer_cache.GetFramebuffer(key)); } void RasterizerOpenGL::Clear() { - const auto& maxwell3d = system.GPU().Maxwell3D(); - - if (!maxwell3d.ShouldExecute()) { + const auto& gpu = system.GPU().Maxwell3D(); + if (!gpu.ShouldExecute()) { return; } - const auto& regs = maxwell3d.regs; + const auto& regs = gpu.regs; bool use_color{}; bool use_depth{}; bool use_stencil{}; - OpenGLState prev_state{OpenGLState::GetCurState()}; - SCOPE_EXIT({ - prev_state.AllDirty(); - prev_state.Apply(); - }); - - OpenGLState clear_state{OpenGLState::GetCurState()}; - clear_state.SetDefaultViewports(); if (regs.clear_buffers.R || regs.clear_buffers.G || regs.clear_buffers.B || regs.clear_buffers.A) { use_color = true; } if (use_color) { - clear_state.color_mask[0].red_enabled = regs.clear_buffers.R ? 
GL_TRUE : GL_FALSE; - clear_state.color_mask[0].green_enabled = regs.clear_buffers.G ? GL_TRUE : GL_FALSE; - clear_state.color_mask[0].blue_enabled = regs.clear_buffers.B ? GL_TRUE : GL_FALSE; - clear_state.color_mask[0].alpha_enabled = regs.clear_buffers.A ? GL_TRUE : GL_FALSE; + state_tracker.NotifyColorMask0(); + glColorMaski(0, regs.clear_buffers.R != 0, regs.clear_buffers.G != 0, + regs.clear_buffers.B != 0, regs.clear_buffers.A != 0); + + // TODO(Rodrigo): Determine if clamping is used on clears + SyncFragmentColorClampState(); + SyncFramebufferSRGB(); } if (regs.clear_buffers.Z) { ASSERT_MSG(regs.zeta_enable != 0, "Tried to clear Z but buffer is not enabled!"); use_depth = true; - // Always enable the depth write when clearing the depth buffer. The depth write mask is - // ignored when clearing the buffer in the Switch, but OpenGL obeys it so we set it to - // true. - clear_state.depth.test_enabled = true; - clear_state.depth.test_func = GL_ALWAYS; - clear_state.depth.write_mask = GL_TRUE; + state_tracker.NotifyDepthMask(); + glDepthMask(GL_TRUE); } if (regs.clear_buffers.S) { - ASSERT_MSG(regs.zeta_enable != 0, "Tried to clear stencil but buffer is not enabled!"); + ASSERT_MSG(regs.zeta_enable, "Tried to clear stencil but buffer is not enabled!"); use_stencil = true; - clear_state.stencil.test_enabled = true; - - if (regs.clear_flags.stencil) { - // Stencil affects the clear so fill it with the used masks - clear_state.stencil.front.test_func = GL_ALWAYS; - clear_state.stencil.front.test_mask = regs.stencil_front_func_mask; - clear_state.stencil.front.action_stencil_fail = GL_KEEP; - clear_state.stencil.front.action_depth_fail = GL_KEEP; - clear_state.stencil.front.action_depth_pass = GL_KEEP; - clear_state.stencil.front.write_mask = regs.stencil_front_mask; - if (regs.stencil_two_side_enable) { - clear_state.stencil.back.test_func = GL_ALWAYS; - clear_state.stencil.back.test_mask = regs.stencil_back_func_mask; - clear_state.stencil.back.action_stencil_fail 
= GL_KEEP; - clear_state.stencil.back.action_depth_fail = GL_KEEP; - clear_state.stencil.back.action_depth_pass = GL_KEEP; - clear_state.stencil.back.write_mask = regs.stencil_back_mask; - } else { - clear_state.stencil.back.test_func = GL_ALWAYS; - clear_state.stencil.back.test_mask = 0xFFFFFFFF; - clear_state.stencil.back.write_mask = 0xFFFFFFFF; - clear_state.stencil.back.action_stencil_fail = GL_KEEP; - clear_state.stencil.back.action_depth_fail = GL_KEEP; - clear_state.stencil.back.action_depth_pass = GL_KEEP; - } - } } if (!use_color && !use_depth && !use_stencil) { @@ -516,20 +443,18 @@ void RasterizerOpenGL::Clear() { return; } - ConfigureClearFramebuffer(clear_state, use_color, use_depth, use_stencil); + SyncRasterizeEnable(); - SyncViewport(clear_state); - SyncRasterizeEnable(clear_state); if (regs.clear_flags.scissor) { - SyncScissorTest(clear_state); + SyncScissorTest(); + } else { + state_tracker.NotifyScissor0(); + glDisablei(GL_SCISSOR_TEST, 0); } - if (regs.clear_flags.viewport) { - clear_state.EmulateViewportWithScissor(); - } + UNIMPLEMENTED_IF(regs.clear_flags.viewport); - clear_state.AllDirty(); - clear_state.Apply(); + ConfigureClearFramebuffer(use_color, use_depth, use_stencil); if (use_color) { glClearBufferfv(GL_COLOR, 0, regs.clear_color); @@ -549,25 +474,27 @@ void RasterizerOpenGL::Clear() { void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { MICROPROFILE_SCOPE(OpenGL_Drawing); auto& gpu = system.GPU().Maxwell3D(); - const auto& regs = gpu.regs; query_cache.UpdateCounters(); - SyncRasterizeEnable(state); + SyncViewport(); + SyncRasterizeEnable(); + SyncPolygonModes(); SyncColorMask(); SyncFragmentColorClampState(); SyncMultiSampleState(); SyncDepthTestState(); + SyncDepthClamp(); SyncStencilTestState(); SyncBlendState(); SyncLogicOpState(); SyncCullMode(); SyncPrimitiveRestart(); - SyncScissorTest(state); - SyncTransformFeedback(); + SyncScissorTest(); SyncPointState(); SyncPolygonOffset(); SyncAlphaTest(); + 
SyncFramebufferSRGB(); buffer_cache.Acquire(); @@ -591,14 +518,13 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { buffer_cache.Map(buffer_size); // Prepare vertex array format. - const GLuint vao = SetupVertexFormat(); - vertex_array_pushbuffer.Setup(vao); + SetupVertexFormat(); + vertex_array_pushbuffer.Setup(); // Upload vertex and index data. - SetupVertexBuffer(vao); - SetupVertexInstances(vao); - - GLintptr index_buffer_offset; + SetupVertexBuffer(); + SetupVertexInstances(); + GLintptr index_buffer_offset = 0; if (is_indexed) { index_buffer_offset = SetupIndexBuffer(); } @@ -624,27 +550,20 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { ConfigureFramebuffers(); // Signal the buffer cache that we are not going to upload more things. - const bool invalidate = buffer_cache.Unmap(); + buffer_cache.Unmap(); // Now that we are no longer uploading data, we can safely bind the buffers to OpenGL. vertex_array_pushbuffer.Bind(); bind_ubo_pushbuffer.Bind(); bind_ssbo_pushbuffer.Bind(); - if (invalidate) { - // As all cached buffers are invalidated, we need to recheck their state. 
- gpu.dirty.ResetVertexArrays(); - } - gpu.dirty.memory_general = false; - - shader_program_manager->ApplyTo(state); - state.Apply(); + program_manager.BindGraphicsPipeline(); if (texture_cache.TextureBarrier()) { glTextureBarrier(); } - ++num_queued_commands; + BeginTransformFeedback(primitive_mode); const GLuint base_instance = static_cast<GLuint>(gpu.regs.vb_base_instance); const GLsizei num_instances = @@ -683,6 +602,10 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { num_instances, base_instance); } } + + EndTransformFeedback(); + + ++num_queued_commands; } void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { @@ -695,13 +618,7 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { auto kernel = shader_cache.GetComputeKernel(code_addr); SetupComputeTextures(kernel); SetupComputeImages(kernel); - - const auto& launch_desc = system.GPU().KeplerCompute().launch_description; - const ProgramVariant variant(launch_desc.block_dim_x, launch_desc.block_dim_y, - launch_desc.block_dim_z, launch_desc.shared_alloc, - launch_desc.local_pos_alloc); - state.draw.shader_program = kernel->GetHandle(variant); - state.draw.program_pipeline = 0; + program_manager.BindComputeShader(kernel->GetHandle()); const std::size_t buffer_size = Tegra::Engines::KeplerCompute::NumConstBuffers * @@ -719,11 +636,7 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { bind_ubo_pushbuffer.Bind(); bind_ssbo_pushbuffer.Bind(); - state.ApplyTextures(); - state.ApplyImages(); - state.ApplyShaderProgram(); - state.ApplyProgramPipeline(); - + const auto& launch_desc = system.GPU().KeplerCompute().launch_description; glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z); ++num_queued_commands; } @@ -828,7 +741,7 @@ void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shad const auto& shader_stage = stages[stage_index]; u32 binding = device.GetBaseBindings(stage_index).uniform_buffer; - for (const 
auto& entry : shader->GetShaderEntries().const_buffers) { + for (const auto& entry : shader->GetEntries().const_buffers) { const auto& buffer = shader_stage.const_buffers[entry.GetIndex()]; SetupConstBuffer(binding++, buffer, entry); } @@ -839,7 +752,7 @@ void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) { const auto& launch_desc = system.GPU().KeplerCompute().launch_description; u32 binding = 0; - for (const auto& entry : kernel->GetShaderEntries().const_buffers) { + for (const auto& entry : kernel->GetEntries().const_buffers) { const auto& config = launch_desc.const_buffer_config[entry.GetIndex()]; const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value(); Tegra::Engines::ConstBufferInfo buffer; @@ -851,7 +764,7 @@ void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) { } void RasterizerOpenGL::SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, - const GLShader::ConstBufferEntry& entry) { + const ConstBufferEntry& entry) { if (!buffer.enabled) { // Set values to zero to unbind buffers bind_ubo_pushbuffer.Push(binding, buffer_cache.GetEmptyBuffer(sizeof(float)), 0, @@ -875,7 +788,7 @@ void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shad const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]}; u32 binding = device.GetBaseBindings(stage_index).shader_storage_buffer; - for (const auto& entry : shader->GetShaderEntries().global_memory_entries) { + for (const auto& entry : shader->GetEntries().global_memory_entries) { const auto addr{cbufs.const_buffers[entry.GetCbufIndex()].address + entry.GetCbufOffset()}; const auto gpu_addr{memory_manager.Read<u64>(addr)}; const auto size{memory_manager.Read<u32>(addr + 8)}; @@ -889,7 +802,7 @@ void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) { const auto cbufs{gpu.KeplerCompute().launch_description.const_buffer_config}; u32 binding = 0; - for (const auto& entry : 
kernel->GetShaderEntries().global_memory_entries) { + for (const auto& entry : kernel->GetEntries().global_memory_entries) { const auto addr{cbufs[entry.GetCbufIndex()].Address() + entry.GetCbufOffset()}; const auto gpu_addr{memory_manager.Read<u64>(addr)}; const auto size{memory_manager.Read<u32>(addr + 8)}; @@ -897,7 +810,7 @@ void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) { } } -void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GLShader::GlobalMemoryEntry& entry, +void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr, std::size_t size) { const auto alignment{device.GetShaderStorageBufferAlignment()}; const auto [ssbo, buffer_offset] = @@ -909,16 +822,11 @@ void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, const Shader& MICROPROFILE_SCOPE(OpenGL_Texture); const auto& maxwell3d = system.GPU().Maxwell3D(); u32 binding = device.GetBaseBindings(stage_index).sampler; - for (const auto& entry : shader->GetShaderEntries().samplers) { - const auto shader_type = static_cast<Tegra::Engines::ShaderType>(stage_index); - if (!entry.IsIndexed()) { - const auto texture = GetTextureInfo(maxwell3d, entry, shader_type); + for (const auto& entry : shader->GetEntries().samplers) { + const auto shader_type = static_cast<ShaderType>(stage_index); + for (std::size_t i = 0; i < entry.Size(); ++i) { + const auto texture = GetTextureInfo(maxwell3d, entry, shader_type, i); SetupTexture(binding++, texture, entry); - } else { - for (std::size_t i = 0; i < entry.Size(); ++i) { - const auto texture = GetTextureInfo(maxwell3d, entry, shader_type, i); - SetupTexture(binding++, texture, entry); - } } } } @@ -927,46 +835,39 @@ void RasterizerOpenGL::SetupComputeTextures(const Shader& kernel) { MICROPROFILE_SCOPE(OpenGL_Texture); const auto& compute = system.GPU().KeplerCompute(); u32 binding = 0; - for (const auto& entry : kernel->GetShaderEntries().samplers) { - if (!entry.IsIndexed()) { - const 
auto texture = - GetTextureInfo(compute, entry, Tegra::Engines::ShaderType::Compute); + for (const auto& entry : kernel->GetEntries().samplers) { + for (std::size_t i = 0; i < entry.Size(); ++i) { + const auto texture = GetTextureInfo(compute, entry, ShaderType::Compute, i); SetupTexture(binding++, texture, entry); - } else { - for (std::size_t i = 0; i < entry.Size(); ++i) { - const auto texture = - GetTextureInfo(compute, entry, Tegra::Engines::ShaderType::Compute, i); - SetupTexture(binding++, texture, entry); - } } } } void RasterizerOpenGL::SetupTexture(u32 binding, const Tegra::Texture::FullTextureInfo& texture, - const GLShader::SamplerEntry& entry) { + const SamplerEntry& entry) { const auto view = texture_cache.GetTextureSurface(texture.tic, entry); if (!view) { // Can occur when texture addr is null or its memory is unmapped/invalid - state.samplers[binding] = 0; - state.textures[binding] = 0; + glBindSampler(binding, 0); + glBindTextureUnit(binding, 0); return; } - state.textures[binding] = view->GetTexture(); + glBindTextureUnit(binding, view->GetTexture()); if (view->GetSurfaceParams().IsBuffer()) { return; } - state.samplers[binding] = sampler_cache.GetSampler(texture.tsc); - // Apply swizzle to textures that are not buffers. 
view->ApplySwizzle(texture.tic.x_source, texture.tic.y_source, texture.tic.z_source, texture.tic.w_source); + + glBindSampler(binding, sampler_cache.GetSampler(texture.tsc)); } void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& shader) { const auto& maxwell3d = system.GPU().Maxwell3D(); u32 binding = device.GetBaseBindings(stage_index).image; - for (const auto& entry : shader->GetShaderEntries().images) { + for (const auto& entry : shader->GetEntries().images) { const auto shader_type = static_cast<Tegra::Engines::ShaderType>(stage_index); const auto tic = GetTextureInfo(maxwell3d, entry, shader_type).tic; SetupImage(binding++, tic, entry); @@ -976,17 +877,17 @@ void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& sh void RasterizerOpenGL::SetupComputeImages(const Shader& shader) { const auto& compute = system.GPU().KeplerCompute(); u32 binding = 0; - for (const auto& entry : shader->GetShaderEntries().images) { + for (const auto& entry : shader->GetEntries().images) { const auto tic = GetTextureInfo(compute, entry, Tegra::Engines::ShaderType::Compute).tic; SetupImage(binding++, tic, entry); } } void RasterizerOpenGL::SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic, - const GLShader::ImageEntry& entry) { + const ImageEntry& entry) { const auto view = texture_cache.GetImageSurface(tic, entry); if (!view) { - state.images[binding] = 0; + glBindImageTexture(binding, 0, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8); return; } if (!tic.IsBuffer()) { @@ -995,55 +896,87 @@ void RasterizerOpenGL::SetupImage(u32 binding, const Tegra::Texture::TICEntry& t if (entry.IsWritten()) { view->MarkAsModified(texture_cache.Tick()); } - state.images[binding] = view->GetTexture(); + glBindImageTexture(binding, view->GetTexture(), 0, GL_TRUE, 0, GL_READ_WRITE, + view->GetFormat()); } -void RasterizerOpenGL::SyncViewport(OpenGLState& current_state) { - const auto& regs = system.GPU().Maxwell3D().regs; - const bool 
geometry_shaders_enabled = - regs.IsShaderConfigEnabled(static_cast<size_t>(Maxwell::ShaderProgram::Geometry)); - const std::size_t viewport_count = - geometry_shaders_enabled ? Tegra::Engines::Maxwell3D::Regs::NumViewports : 1; - for (std::size_t i = 0; i < viewport_count; i++) { - auto& viewport = current_state.viewports[i]; - const auto& src = regs.viewports[i]; - const Common::Rectangle<s32> viewport_rect{regs.viewport_transform[i].GetRect()}; - viewport.x = viewport_rect.left; - viewport.y = viewport_rect.bottom; - viewport.width = viewport_rect.GetWidth(); - viewport.height = viewport_rect.GetHeight(); - viewport.depth_range_far = src.depth_range_far; - viewport.depth_range_near = src.depth_range_near; - } - state.depth_clamp.far_plane = regs.view_volume_clip_control.depth_clamp_far != 0; - state.depth_clamp.near_plane = regs.view_volume_clip_control.depth_clamp_near != 0; - - bool flip_y = false; - if (regs.viewport_transform[0].scale_y < 0.0) { - flip_y = !flip_y; - } - if (regs.screen_y_control.y_negate != 0) { - flip_y = !flip_y; - } - state.clip_control.origin = flip_y ? GL_UPPER_LEFT : GL_LOWER_LEFT; - state.clip_control.depth_mode = - regs.depth_mode == Tegra::Engines::Maxwell3D::Regs::DepthMode::ZeroToOne - ? GL_ZERO_TO_ONE - : GL_NEGATIVE_ONE_TO_ONE; +void RasterizerOpenGL::SyncViewport() { + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; + const auto& regs = gpu.regs; + + const bool dirty_viewport = flags[Dirty::Viewports]; + if (dirty_viewport || flags[Dirty::ClipControl]) { + flags[Dirty::ClipControl] = false; + + bool flip_y = false; + if (regs.viewport_transform[0].scale_y < 0.0) { + flip_y = !flip_y; + } + if (regs.screen_y_control.y_negate != 0) { + flip_y = !flip_y; + } + glClipControl(flip_y ? GL_UPPER_LEFT : GL_LOWER_LEFT, + regs.depth_mode == Maxwell::DepthMode::ZeroToOne ? 
GL_ZERO_TO_ONE + : GL_NEGATIVE_ONE_TO_ONE); + } + + if (dirty_viewport) { + flags[Dirty::Viewports] = false; + + const bool force = flags[Dirty::ViewportTransform]; + flags[Dirty::ViewportTransform] = false; + + for (std::size_t i = 0; i < Maxwell::NumViewports; ++i) { + if (!force && !flags[Dirty::Viewport0 + i]) { + continue; + } + flags[Dirty::Viewport0 + i] = false; + + const auto& src = regs.viewport_transform[i]; + const Common::Rectangle<f32> rect{src.GetRect()}; + glViewportIndexedf(static_cast<GLuint>(i), rect.left, rect.bottom, rect.GetWidth(), + rect.GetHeight()); + + const GLdouble reduce_z = regs.depth_mode == Maxwell::DepthMode::MinusOneToOne; + const GLdouble near_depth = src.translate_z - src.scale_z * reduce_z; + const GLdouble far_depth = src.translate_z + src.scale_z; + glDepthRangeIndexed(static_cast<GLuint>(i), near_depth, far_depth); + } + } } -void RasterizerOpenGL::SyncClipEnabled( - const std::array<bool, Maxwell::Regs::NumClipDistances>& clip_mask) { +void RasterizerOpenGL::SyncDepthClamp() { + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; + if (!flags[Dirty::DepthClampEnabled]) { + return; + } + flags[Dirty::DepthClampEnabled] = false; - const auto& regs = system.GPU().Maxwell3D().regs; - const std::array<bool, Maxwell::Regs::NumClipDistances> reg_state{ - regs.clip_distance_enabled.c0 != 0, regs.clip_distance_enabled.c1 != 0, - regs.clip_distance_enabled.c2 != 0, regs.clip_distance_enabled.c3 != 0, - regs.clip_distance_enabled.c4 != 0, regs.clip_distance_enabled.c5 != 0, - regs.clip_distance_enabled.c6 != 0, regs.clip_distance_enabled.c7 != 0}; + const auto& state = gpu.regs.view_volume_clip_control; + UNIMPLEMENTED_IF_MSG(state.depth_clamp_far != state.depth_clamp_near, + "Unimplemented depth clamp separation!"); + + oglEnable(GL_DEPTH_CLAMP, state.depth_clamp_far || state.depth_clamp_near); +} + +void RasterizerOpenGL::SyncClipEnabled(u32 clip_mask) { + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = 
gpu.dirty.flags; + if (!flags[Dirty::ClipDistances] && !flags[Dirty::Shaders]) { + return; + } + flags[Dirty::ClipDistances] = false; + + clip_mask &= gpu.regs.clip_distance_enabled; + if (clip_mask == last_clip_distance_mask) { + return; + } + last_clip_distance_mask = clip_mask; for (std::size_t i = 0; i < Maxwell::Regs::NumClipDistances; ++i) { - state.clip_distance[i] = reg_state[i] && clip_mask[i]; + oglEnable(static_cast<GLenum>(GL_CLIP_DISTANCE0 + i), (clip_mask >> i) & 1); } } @@ -1052,247 +985,442 @@ void RasterizerOpenGL::SyncClipCoef() { } void RasterizerOpenGL::SyncCullMode() { - const auto& regs = system.GPU().Maxwell3D().regs; + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; + const auto& regs = gpu.regs; - state.cull.enabled = regs.cull.enabled != 0; - if (state.cull.enabled) { - state.cull.mode = MaxwellToGL::CullFace(regs.cull.cull_face); + if (flags[Dirty::CullTest]) { + flags[Dirty::CullTest] = false; + + if (regs.cull_test_enabled) { + glEnable(GL_CULL_FACE); + glCullFace(MaxwellToGL::CullFace(regs.cull_face)); + } else { + glDisable(GL_CULL_FACE); + } } - state.cull.front_face = MaxwellToGL::FrontFace(regs.cull.front_face); + if (flags[Dirty::FrontFace]) { + flags[Dirty::FrontFace] = false; + glFrontFace(MaxwellToGL::FrontFace(regs.front_face)); + } } void RasterizerOpenGL::SyncPrimitiveRestart() { - const auto& regs = system.GPU().Maxwell3D().regs; + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; + if (!flags[Dirty::PrimitiveRestart]) { + return; + } + flags[Dirty::PrimitiveRestart] = false; - state.primitive_restart.enabled = regs.primitive_restart.enabled; - state.primitive_restart.index = regs.primitive_restart.index; + if (gpu.regs.primitive_restart.enabled) { + glEnable(GL_PRIMITIVE_RESTART); + glPrimitiveRestartIndex(gpu.regs.primitive_restart.index); + } else { + glDisable(GL_PRIMITIVE_RESTART); + } } void RasterizerOpenGL::SyncDepthTestState() { - const auto& regs = 
system.GPU().Maxwell3D().regs; - - state.depth.test_enabled = regs.depth_test_enable != 0; - state.depth.write_mask = regs.depth_write_enabled ? GL_TRUE : GL_FALSE; + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; - if (!state.depth.test_enabled) { - return; + const auto& regs = gpu.regs; + if (flags[Dirty::DepthMask]) { + flags[Dirty::DepthMask] = false; + glDepthMask(regs.depth_write_enabled ? GL_TRUE : GL_FALSE); } - state.depth.test_func = MaxwellToGL::ComparisonOp(regs.depth_test_func); + if (flags[Dirty::DepthTest]) { + flags[Dirty::DepthTest] = false; + if (regs.depth_test_enable) { + glEnable(GL_DEPTH_TEST); + glDepthFunc(MaxwellToGL::ComparisonOp(regs.depth_test_func)); + } else { + glDisable(GL_DEPTH_TEST); + } + } } void RasterizerOpenGL::SyncStencilTestState() { - auto& maxwell3d = system.GPU().Maxwell3D(); - if (!maxwell3d.dirty.stencil_test) { + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; + if (!flags[Dirty::StencilTest]) { return; } - maxwell3d.dirty.stencil_test = false; - - const auto& regs = maxwell3d.regs; - state.stencil.test_enabled = regs.stencil_enable != 0; - state.MarkDirtyStencilState(); + flags[Dirty::StencilTest] = false; + const auto& regs = gpu.regs; if (!regs.stencil_enable) { + glDisable(GL_STENCIL_TEST); return; } - state.stencil.front.test_func = MaxwellToGL::ComparisonOp(regs.stencil_front_func_func); - state.stencil.front.test_ref = regs.stencil_front_func_ref; - state.stencil.front.test_mask = regs.stencil_front_func_mask; - state.stencil.front.action_stencil_fail = MaxwellToGL::StencilOp(regs.stencil_front_op_fail); - state.stencil.front.action_depth_fail = MaxwellToGL::StencilOp(regs.stencil_front_op_zfail); - state.stencil.front.action_depth_pass = MaxwellToGL::StencilOp(regs.stencil_front_op_zpass); - state.stencil.front.write_mask = regs.stencil_front_mask; + glEnable(GL_STENCIL_TEST); + glStencilFuncSeparate(GL_FRONT, 
MaxwellToGL::ComparisonOp(regs.stencil_front_func_func), + regs.stencil_front_func_ref, regs.stencil_front_func_mask); + glStencilOpSeparate(GL_FRONT, MaxwellToGL::StencilOp(regs.stencil_front_op_fail), + MaxwellToGL::StencilOp(regs.stencil_front_op_zfail), + MaxwellToGL::StencilOp(regs.stencil_front_op_zpass)); + glStencilMaskSeparate(GL_FRONT, regs.stencil_front_mask); + if (regs.stencil_two_side_enable) { - state.stencil.back.test_func = MaxwellToGL::ComparisonOp(regs.stencil_back_func_func); - state.stencil.back.test_ref = regs.stencil_back_func_ref; - state.stencil.back.test_mask = regs.stencil_back_func_mask; - state.stencil.back.action_stencil_fail = MaxwellToGL::StencilOp(regs.stencil_back_op_fail); - state.stencil.back.action_depth_fail = MaxwellToGL::StencilOp(regs.stencil_back_op_zfail); - state.stencil.back.action_depth_pass = MaxwellToGL::StencilOp(regs.stencil_back_op_zpass); - state.stencil.back.write_mask = regs.stencil_back_mask; + glStencilFuncSeparate(GL_BACK, MaxwellToGL::ComparisonOp(regs.stencil_back_func_func), + regs.stencil_back_func_ref, regs.stencil_back_func_mask); + glStencilOpSeparate(GL_BACK, MaxwellToGL::StencilOp(regs.stencil_back_op_fail), + MaxwellToGL::StencilOp(regs.stencil_back_op_zfail), + MaxwellToGL::StencilOp(regs.stencil_back_op_zpass)); + glStencilMaskSeparate(GL_BACK, regs.stencil_back_mask); } else { - state.stencil.back.test_func = GL_ALWAYS; - state.stencil.back.test_ref = 0; - state.stencil.back.test_mask = 0xFFFFFFFF; - state.stencil.back.write_mask = 0xFFFFFFFF; - state.stencil.back.action_stencil_fail = GL_KEEP; - state.stencil.back.action_depth_fail = GL_KEEP; - state.stencil.back.action_depth_pass = GL_KEEP; + glStencilFuncSeparate(GL_BACK, GL_ALWAYS, 0, 0xFFFFFFFF); + glStencilOpSeparate(GL_BACK, GL_KEEP, GL_KEEP, GL_KEEP); + glStencilMaskSeparate(GL_BACK, 0xFFFFFFFF); } } -void RasterizerOpenGL::SyncRasterizeEnable(OpenGLState& current_state) { - const auto& regs = system.GPU().Maxwell3D().regs; - 
current_state.rasterizer_discard = regs.rasterize_enable == 0; +void RasterizerOpenGL::SyncRasterizeEnable() { + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; + if (!flags[Dirty::RasterizeEnable]) { + return; + } + flags[Dirty::RasterizeEnable] = false; + + oglEnable(GL_RASTERIZER_DISCARD, gpu.regs.rasterize_enable == 0); +} + +void RasterizerOpenGL::SyncPolygonModes() { + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; + if (!flags[Dirty::PolygonModes]) { + return; + } + flags[Dirty::PolygonModes] = false; + + if (gpu.regs.fill_rectangle) { + if (!GLAD_GL_NV_fill_rectangle) { + LOG_ERROR(Render_OpenGL, "GL_NV_fill_rectangle used and not supported"); + glPolygonMode(GL_FRONT_AND_BACK, GL_FILL); + return; + } + + flags[Dirty::PolygonModeFront] = true; + flags[Dirty::PolygonModeBack] = true; + glPolygonMode(GL_FRONT_AND_BACK, GL_FILL_RECTANGLE_NV); + return; + } + + if (gpu.regs.polygon_mode_front == gpu.regs.polygon_mode_back) { + flags[Dirty::PolygonModeFront] = false; + flags[Dirty::PolygonModeBack] = false; + glPolygonMode(GL_FRONT_AND_BACK, MaxwellToGL::PolygonMode(gpu.regs.polygon_mode_front)); + return; + } + + if (flags[Dirty::PolygonModeFront]) { + flags[Dirty::PolygonModeFront] = false; + glPolygonMode(GL_FRONT, MaxwellToGL::PolygonMode(gpu.regs.polygon_mode_front)); + } + + if (flags[Dirty::PolygonModeBack]) { + flags[Dirty::PolygonModeBack] = false; + glPolygonMode(GL_BACK, MaxwellToGL::PolygonMode(gpu.regs.polygon_mode_back)); + } } void RasterizerOpenGL::SyncColorMask() { - auto& maxwell3d = system.GPU().Maxwell3D(); - if (!maxwell3d.dirty.color_mask) { + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; + if (!flags[Dirty::ColorMasks]) { return; } - const auto& regs = maxwell3d.regs; + flags[Dirty::ColorMasks] = false; + + const bool force = flags[Dirty::ColorMaskCommon]; + flags[Dirty::ColorMaskCommon] = false; + + const auto& regs = gpu.regs; + if (regs.color_mask_common) { + if 
(!force && !flags[Dirty::ColorMask0]) { + return; + } + flags[Dirty::ColorMask0] = false; - const std::size_t count = - regs.independent_blend_enable ? Tegra::Engines::Maxwell3D::Regs::NumRenderTargets : 1; - for (std::size_t i = 0; i < count; i++) { - const auto& source = regs.color_mask[regs.color_mask_common ? 0 : i]; - auto& dest = state.color_mask[i]; - dest.red_enabled = (source.R == 0) ? GL_FALSE : GL_TRUE; - dest.green_enabled = (source.G == 0) ? GL_FALSE : GL_TRUE; - dest.blue_enabled = (source.B == 0) ? GL_FALSE : GL_TRUE; - dest.alpha_enabled = (source.A == 0) ? GL_FALSE : GL_TRUE; + auto& mask = regs.color_mask[0]; + glColorMask(mask.R != 0, mask.B != 0, mask.G != 0, mask.A != 0); + return; } - state.MarkDirtyColorMask(); - maxwell3d.dirty.color_mask = false; + // Path without color_mask_common set + for (std::size_t i = 0; i < Maxwell::NumRenderTargets; ++i) { + if (!force && !flags[Dirty::ColorMask0 + i]) { + continue; + } + flags[Dirty::ColorMask0 + i] = false; + + const auto& mask = regs.color_mask[i]; + glColorMaski(static_cast<GLuint>(i), mask.R != 0, mask.G != 0, mask.B != 0, mask.A != 0); + } } void RasterizerOpenGL::SyncMultiSampleState() { + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; + if (!flags[Dirty::MultisampleControl]) { + return; + } + flags[Dirty::MultisampleControl] = false; + const auto& regs = system.GPU().Maxwell3D().regs; - state.multisample_control.alpha_to_coverage = regs.multisample_control.alpha_to_coverage != 0; - state.multisample_control.alpha_to_one = regs.multisample_control.alpha_to_one != 0; + oglEnable(GL_SAMPLE_ALPHA_TO_COVERAGE, regs.multisample_control.alpha_to_coverage); + oglEnable(GL_SAMPLE_ALPHA_TO_ONE, regs.multisample_control.alpha_to_one); } void RasterizerOpenGL::SyncFragmentColorClampState() { - const auto& regs = system.GPU().Maxwell3D().regs; - state.fragment_color_clamp.enabled = regs.frag_color_clamp != 0; + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = 
gpu.dirty.flags; + if (!flags[Dirty::FragmentClampColor]) { + return; + } + flags[Dirty::FragmentClampColor] = false; + + glClampColor(GL_CLAMP_FRAGMENT_COLOR, gpu.regs.frag_color_clamp ? GL_TRUE : GL_FALSE); } void RasterizerOpenGL::SyncBlendState() { - auto& maxwell3d = system.GPU().Maxwell3D(); - if (!maxwell3d.dirty.blend_state) { + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; + const auto& regs = gpu.regs; + + if (flags[Dirty::BlendColor]) { + flags[Dirty::BlendColor] = false; + glBlendColor(regs.blend_color.r, regs.blend_color.g, regs.blend_color.b, + regs.blend_color.a); + } + + // TODO(Rodrigo): Revisit blending, there are several registers we are not reading + + if (!flags[Dirty::BlendStates]) { return; } - const auto& regs = maxwell3d.regs; - - state.blend_color.red = regs.blend_color.r; - state.blend_color.green = regs.blend_color.g; - state.blend_color.blue = regs.blend_color.b; - state.blend_color.alpha = regs.blend_color.a; - - state.independant_blend.enabled = regs.independent_blend_enable; - if (!state.independant_blend.enabled) { - auto& blend = state.blend[0]; - const auto& src = regs.blend; - blend.enabled = src.enable[0] != 0; - if (blend.enabled) { - blend.rgb_equation = MaxwellToGL::BlendEquation(src.equation_rgb); - blend.src_rgb_func = MaxwellToGL::BlendFunc(src.factor_source_rgb); - blend.dst_rgb_func = MaxwellToGL::BlendFunc(src.factor_dest_rgb); - blend.a_equation = MaxwellToGL::BlendEquation(src.equation_a); - blend.src_a_func = MaxwellToGL::BlendFunc(src.factor_source_a); - blend.dst_a_func = MaxwellToGL::BlendFunc(src.factor_dest_a); - } - for (std::size_t i = 1; i < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; i++) { - state.blend[i].enabled = false; + flags[Dirty::BlendStates] = false; + + if (!regs.independent_blend_enable) { + if (!regs.blend.enable[0]) { + glDisable(GL_BLEND); + return; } - maxwell3d.dirty.blend_state = false; - state.MarkDirtyBlendState(); + glEnable(GL_BLEND); + 
glBlendFuncSeparate(MaxwellToGL::BlendFunc(regs.blend.factor_source_rgb), + MaxwellToGL::BlendFunc(regs.blend.factor_dest_rgb), + MaxwellToGL::BlendFunc(regs.blend.factor_source_a), + MaxwellToGL::BlendFunc(regs.blend.factor_dest_a)); + glBlendEquationSeparate(MaxwellToGL::BlendEquation(regs.blend.equation_rgb), + MaxwellToGL::BlendEquation(regs.blend.equation_a)); return; } - for (std::size_t i = 0; i < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; i++) { - auto& blend = state.blend[i]; - const auto& src = regs.independent_blend[i]; - blend.enabled = regs.blend.enable[i] != 0; - if (!blend.enabled) + const bool force = flags[Dirty::BlendIndependentEnabled]; + flags[Dirty::BlendIndependentEnabled] = false; + + for (std::size_t i = 0; i < Maxwell::NumRenderTargets; ++i) { + if (!force && !flags[Dirty::BlendState0 + i]) { continue; - blend.rgb_equation = MaxwellToGL::BlendEquation(src.equation_rgb); - blend.src_rgb_func = MaxwellToGL::BlendFunc(src.factor_source_rgb); - blend.dst_rgb_func = MaxwellToGL::BlendFunc(src.factor_dest_rgb); - blend.a_equation = MaxwellToGL::BlendEquation(src.equation_a); - blend.src_a_func = MaxwellToGL::BlendFunc(src.factor_source_a); - blend.dst_a_func = MaxwellToGL::BlendFunc(src.factor_dest_a); - } + } + flags[Dirty::BlendState0 + i] = false; + + if (!regs.blend.enable[i]) { + glDisablei(GL_BLEND, static_cast<GLuint>(i)); + continue; + } + glEnablei(GL_BLEND, static_cast<GLuint>(i)); - state.MarkDirtyBlendState(); - maxwell3d.dirty.blend_state = false; + const auto& src = regs.independent_blend[i]; + glBlendFuncSeparatei(static_cast<GLuint>(i), MaxwellToGL::BlendFunc(src.factor_source_rgb), + MaxwellToGL::BlendFunc(src.factor_dest_rgb), + MaxwellToGL::BlendFunc(src.factor_source_a), + MaxwellToGL::BlendFunc(src.factor_dest_a)); + glBlendEquationSeparatei(static_cast<GLuint>(i), + MaxwellToGL::BlendEquation(src.equation_rgb), + MaxwellToGL::BlendEquation(src.equation_a)); + } } void RasterizerOpenGL::SyncLogicOpState() { - const 
auto& regs = system.GPU().Maxwell3D().regs; + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; + if (!flags[Dirty::LogicOp]) { + return; + } + flags[Dirty::LogicOp] = false; - state.logic_op.enabled = regs.logic_op.enable != 0; + const auto& regs = gpu.regs; + if (regs.logic_op.enable) { + glEnable(GL_COLOR_LOGIC_OP); + glLogicOp(MaxwellToGL::LogicOp(regs.logic_op.operation)); + } else { + glDisable(GL_COLOR_LOGIC_OP); + } +} - if (!state.logic_op.enabled) +void RasterizerOpenGL::SyncScissorTest() { + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; + if (!flags[Dirty::Scissors]) { return; + } + flags[Dirty::Scissors] = false; - ASSERT_MSG(regs.blend.enable[0] == 0, - "Blending and logic op can't be enabled at the same time."); - - state.logic_op.operation = MaxwellToGL::LogicOp(regs.logic_op.operation); -} + const auto& regs = gpu.regs; + for (std::size_t index = 0; index < Maxwell::NumViewports; ++index) { + if (!flags[Dirty::Scissor0 + index]) { + continue; + } + flags[Dirty::Scissor0 + index] = false; -void RasterizerOpenGL::SyncScissorTest(OpenGLState& current_state) { - const auto& regs = system.GPU().Maxwell3D().regs; - const bool geometry_shaders_enabled = - regs.IsShaderConfigEnabled(static_cast<size_t>(Maxwell::ShaderProgram::Geometry)); - const std::size_t viewport_count = - geometry_shaders_enabled ? 
Tegra::Engines::Maxwell3D::Regs::NumViewports : 1; - for (std::size_t i = 0; i < viewport_count; i++) { - const auto& src = regs.scissor_test[i]; - auto& dst = current_state.viewports[i].scissor; - dst.enabled = (src.enable != 0); - if (dst.enabled == 0) { - return; + const auto& src = regs.scissor_test[index]; + if (src.enable) { + glEnablei(GL_SCISSOR_TEST, static_cast<GLuint>(index)); + glScissorIndexed(static_cast<GLuint>(index), src.min_x, src.min_y, + src.max_x - src.min_x, src.max_y - src.min_y); + } else { + glDisablei(GL_SCISSOR_TEST, static_cast<GLuint>(index)); } - const u32 width = src.max_x - src.min_x; - const u32 height = src.max_y - src.min_y; - dst.x = src.min_x; - dst.y = src.min_y; - dst.width = width; - dst.height = height; } } -void RasterizerOpenGL::SyncTransformFeedback() { - const auto& regs = system.GPU().Maxwell3D().regs; - UNIMPLEMENTED_IF_MSG(regs.tfb_enabled != 0, "Transform feedbacks are not implemented"); -} - void RasterizerOpenGL::SyncPointState() { - const auto& regs = system.GPU().Maxwell3D().regs; + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; + if (!flags[Dirty::PointSize]) { + return; + } + flags[Dirty::PointSize] = false; + + oglEnable(GL_POINT_SPRITE, gpu.regs.point_sprite_enable); + + if (gpu.regs.vp_point_size.enable) { + // By definition of GL_POINT_SIZE, it only matters if GL_PROGRAM_POINT_SIZE is disabled. + glEnable(GL_PROGRAM_POINT_SIZE); + return; + } + // Limit the point size to 1 since nouveau sometimes sets a point size of 0 (and that's invalid // in OpenGL). 
- state.point.program_control = regs.vp_point_size.enable != 0; - state.point.sprite = regs.point_sprite_enable != 0; - state.point.size = std::max(1.0f, regs.point_size); + glPointSize(std::max(1.0f, gpu.regs.point_size)); + glDisable(GL_PROGRAM_POINT_SIZE); } void RasterizerOpenGL::SyncPolygonOffset() { - auto& maxwell3d = system.GPU().Maxwell3D(); - if (!maxwell3d.dirty.polygon_offset) { + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; + if (!flags[Dirty::PolygonOffset]) { return; } - const auto& regs = maxwell3d.regs; - - state.polygon_offset.fill_enable = regs.polygon_offset_fill_enable != 0; - state.polygon_offset.line_enable = regs.polygon_offset_line_enable != 0; - state.polygon_offset.point_enable = regs.polygon_offset_point_enable != 0; + flags[Dirty::PolygonOffset] = false; - // Hardware divides polygon offset units by two - state.polygon_offset.units = regs.polygon_offset_units / 2.0f; - state.polygon_offset.factor = regs.polygon_offset_factor; - state.polygon_offset.clamp = regs.polygon_offset_clamp; + const auto& regs = gpu.regs; + oglEnable(GL_POLYGON_OFFSET_FILL, regs.polygon_offset_fill_enable); + oglEnable(GL_POLYGON_OFFSET_LINE, regs.polygon_offset_line_enable); + oglEnable(GL_POLYGON_OFFSET_POINT, regs.polygon_offset_point_enable); - state.MarkDirtyPolygonOffset(); - maxwell3d.dirty.polygon_offset = false; + if (regs.polygon_offset_fill_enable || regs.polygon_offset_line_enable || + regs.polygon_offset_point_enable) { + // Hardware divides polygon offset units by two + glPolygonOffsetClamp(regs.polygon_offset_factor, regs.polygon_offset_units / 2.0f, + regs.polygon_offset_clamp); + } } void RasterizerOpenGL::SyncAlphaTest() { + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; + if (!flags[Dirty::AlphaTest]) { + return; + } + flags[Dirty::AlphaTest] = false; + + const auto& regs = gpu.regs; + if (regs.alpha_test_enabled && regs.rt_control.count > 1) { + LOG_WARNING(Render_OpenGL, "Alpha testing with 
more than one render target is not tested"); + } + + if (regs.alpha_test_enabled) { + glEnable(GL_ALPHA_TEST); + glAlphaFunc(MaxwellToGL::ComparisonOp(regs.alpha_test_func), regs.alpha_test_ref); + } else { + glDisable(GL_ALPHA_TEST); + } +} + +void RasterizerOpenGL::SyncFramebufferSRGB() { + auto& gpu = system.GPU().Maxwell3D(); + auto& flags = gpu.dirty.flags; + if (!flags[Dirty::FramebufferSRGB]) { + return; + } + flags[Dirty::FramebufferSRGB] = false; + + oglEnable(GL_FRAMEBUFFER_SRGB, gpu.regs.framebuffer_srgb); +} + +void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) { const auto& regs = system.GPU().Maxwell3D().regs; - UNIMPLEMENTED_IF_MSG(regs.alpha_test_enabled != 0 && regs.rt_control.count > 1, - "Alpha Testing is enabled with more than one rendertarget"); + if (regs.tfb_enabled == 0) { + return; + } + + UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) || + regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) || + regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry)); - state.alpha_test.enabled = regs.alpha_test_enabled; - if (!state.alpha_test.enabled) { + for (std::size_t index = 0; index < Maxwell::NumTransformFeedbackBuffers; ++index) { + const auto& binding = regs.tfb_bindings[index]; + if (!binding.buffer_enable) { + if (enabled_transform_feedback_buffers[index]) { + glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, static_cast<GLuint>(index), 0, 0, + 0); + } + enabled_transform_feedback_buffers[index] = false; + continue; + } + enabled_transform_feedback_buffers[index] = true; + + auto& tfb_buffer = transform_feedback_buffers[index]; + tfb_buffer.Create(); + + const GLuint handle = tfb_buffer.handle; + const std::size_t size = binding.buffer_size; + glNamedBufferData(handle, static_cast<GLsizeiptr>(size), nullptr, GL_STREAM_COPY); + glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, static_cast<GLuint>(index), handle, 0, + static_cast<GLsizeiptr>(size)); + } + + 
glBeginTransformFeedback(GL_POINTS); +} + +void RasterizerOpenGL::EndTransformFeedback() { + const auto& regs = system.GPU().Maxwell3D().regs; + if (regs.tfb_enabled == 0) { return; } - state.alpha_test.func = MaxwellToGL::ComparisonOp(regs.alpha_test_func); - state.alpha_test.ref = regs.alpha_test_ref; + + glEndTransformFeedback(); + + for (std::size_t index = 0; index < Maxwell::NumTransformFeedbackBuffers; ++index) { + const auto& binding = regs.tfb_bindings[index]; + if (!binding.buffer_enable) { + continue; + } + UNIMPLEMENTED_IF(binding.buffer_offset != 0); + + const GLuint handle = transform_feedback_buffers[index].handle; + const GPUVAddr gpu_addr = binding.Address(); + const std::size_t size = binding.buffer_size; + const auto [dest_buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true); + glCopyNamedBufferSubData(handle, *dest_buffer, 0, offset, static_cast<GLsizeiptr>(size)); + } } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 68abe9a21..2d3be2437 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -30,7 +30,7 @@ #include "video_core/renderer_opengl/gl_shader_cache.h" #include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/renderer_opengl/gl_shader_manager.h" -#include "video_core/renderer_opengl/gl_state.h" +#include "video_core/renderer_opengl/gl_state_tracker.h" #include "video_core/renderer_opengl/gl_texture_cache.h" #include "video_core/renderer_opengl/utils.h" #include "video_core/textures/texture.h" @@ -55,7 +55,8 @@ struct DrawParameters; class RasterizerOpenGL : public VideoCore::RasterizerAccelerated { public: explicit RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, - ScreenInfo& info); + ScreenInfo& info, GLShader::ProgramManager& program_manager, + StateTracker& state_tracker); ~RasterizerOpenGL() override; void Draw(bool 
is_indexed, bool is_instanced) override; @@ -76,6 +77,7 @@ public: u32 pixel_stride) override; void LoadDiskResources(const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback) override; + void SetupDirtyFlags() override; /// Returns true when there are commands queued to the OpenGL server. bool AnyCommandQueued() const { @@ -86,8 +88,7 @@ private: /// Configures the color and depth framebuffer states. void ConfigureFramebuffers(); - void ConfigureClearFramebuffer(OpenGLState& current_state, bool using_color_fb, - bool using_depth_fb, bool using_stencil_fb); + void ConfigureClearFramebuffer(bool using_color_fb, bool using_depth_fb, bool using_stencil_fb); /// Configures the current constbuffers to use for the draw command. void SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader); @@ -97,7 +98,7 @@ private: /// Configures a constant buffer. void SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, - const GLShader::ConstBufferEntry& entry); + const ConstBufferEntry& entry); /// Configures the current global memory entries to use for the draw command. void SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader); @@ -106,7 +107,7 @@ private: void SetupComputeGlobalMemory(const Shader& kernel); /// Configures a constant buffer. - void SetupGlobalMemory(u32 binding, const GLShader::GlobalMemoryEntry& entry, GPUVAddr gpu_addr, + void SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr, std::size_t size); /// Configures the current textures to use for the draw command. @@ -117,7 +118,7 @@ private: /// Configures a texture. void SetupTexture(u32 binding, const Tegra::Texture::FullTextureInfo& texture, - const GLShader::SamplerEntry& entry); + const SamplerEntry& entry); /// Configures images in a graphics shader. 
void SetupDrawImages(std::size_t stage_index, const Shader& shader); @@ -126,15 +127,16 @@ private: void SetupComputeImages(const Shader& shader); /// Configures an image. - void SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic, - const GLShader::ImageEntry& entry); + void SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic, const ImageEntry& entry); /// Syncs the viewport and depth range to match the guest state - void SyncViewport(OpenGLState& current_state); + void SyncViewport(); + + /// Syncs the depth clamp state + void SyncDepthClamp(); /// Syncs the clip enabled status to match the guest state - void SyncClipEnabled( - const std::array<bool, Tegra::Engines::Maxwell3D::Regs::NumClipDistances>& clip_mask); + void SyncClipEnabled(u32 clip_mask); /// Syncs the clip coefficients to match the guest state void SyncClipCoef(); @@ -164,16 +166,16 @@ private: void SyncMultiSampleState(); /// Syncs the scissor test state to match the guest state - void SyncScissorTest(OpenGLState& current_state); - - /// Syncs the transform feedback state to match the guest state - void SyncTransformFeedback(); + void SyncScissorTest(); /// Syncs the point state to match the guest state void SyncPointState(); /// Syncs the rasterizer enable state to match the guest state - void SyncRasterizeEnable(OpenGLState& current_state); + void SyncRasterizeEnable(); + + /// Syncs polygon modes to match the guest state + void SyncPolygonModes(); /// Syncs Color Mask void SyncColorMask(); @@ -184,6 +186,15 @@ private: /// Syncs the alpha test state to match the guest state void SyncAlphaTest(); + /// Syncs the framebuffer sRGB state to match the guest state + void SyncFramebufferSRGB(); + + /// Begin a transform feedback + void BeginTransformFeedback(GLenum primitive_mode); + + /// End a transform feedback + void EndTransformFeedback(); + /// Check for extension that are not strictly required but are needed for correct emulation void CheckExtensions(); @@ -191,18 +202,17 @@ 
private: std::size_t CalculateIndexBufferSize() const; - /// Updates and returns a vertex array object representing current vertex format - GLuint SetupVertexFormat(); + /// Updates the current vertex format + void SetupVertexFormat(); - void SetupVertexBuffer(GLuint vao); - void SetupVertexInstances(GLuint vao); + void SetupVertexBuffer(); + void SetupVertexInstances(); GLintptr SetupIndexBuffer(); void SetupShaders(GLenum primitive_mode); const Device device; - OpenGLState state; TextureCacheOpenGL texture_cache; ShaderCacheOpenGL shader_cache; @@ -212,22 +222,25 @@ private: Core::System& system; ScreenInfo& screen_info; - - std::unique_ptr<GLShader::ProgramManager> shader_program_manager; - std::map<std::array<Tegra::Engines::Maxwell3D::Regs::VertexAttribute, - Tegra::Engines::Maxwell3D::Regs::NumVertexAttributes>, - OGLVertexArray> - vertex_array_cache; + GLShader::ProgramManager& program_manager; + StateTracker& state_tracker; static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024; OGLBufferCache buffer_cache; - VertexArrayPushBuffer vertex_array_pushbuffer; + VertexArrayPushBuffer vertex_array_pushbuffer{state_tracker}; BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER}; BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER}; + std::array<OGLBuffer, Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers> + transform_feedback_buffers; + std::bitset<Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers> + enabled_transform_feedback_buffers; + /// Number of commands queued to the OpenGL driver. Reseted on flush. 
std::size_t num_queued_commands = 0; + + u32 last_clip_distance_mask = 0; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp index f0ddfb276..97803d480 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.cpp +++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp @@ -8,13 +8,29 @@ #include "common/microprofile.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_shader_util.h" -#include "video_core/renderer_opengl/gl_state.h" MICROPROFILE_DEFINE(OpenGL_ResourceCreation, "OpenGL", "Resource Creation", MP_RGB(128, 128, 192)); MICROPROFILE_DEFINE(OpenGL_ResourceDeletion, "OpenGL", "Resource Deletion", MP_RGB(128, 128, 192)); namespace OpenGL { +void OGLRenderbuffer::Create() { + if (handle != 0) + return; + + MICROPROFILE_SCOPE(OpenGL_ResourceCreation); + glCreateRenderbuffers(1, &handle); +} + +void OGLRenderbuffer::Release() { + if (handle == 0) + return; + + MICROPROFILE_SCOPE(OpenGL_ResourceDeletion); + glDeleteRenderbuffers(1, &handle); + handle = 0; +} + void OGLTexture::Create(GLenum target) { if (handle != 0) return; @@ -29,7 +45,6 @@ void OGLTexture::Release() { MICROPROFILE_SCOPE(OpenGL_ResourceDeletion); glDeleteTextures(1, &handle); - OpenGLState::GetCurState().UnbindTexture(handle).Apply(); handle = 0; } @@ -47,7 +62,6 @@ void OGLTextureView::Release() { MICROPROFILE_SCOPE(OpenGL_ResourceDeletion); glDeleteTextures(1, &handle); - OpenGLState::GetCurState().UnbindTexture(handle).Apply(); handle = 0; } @@ -65,7 +79,6 @@ void OGLSampler::Release() { MICROPROFILE_SCOPE(OpenGL_ResourceDeletion); glDeleteSamplers(1, &handle); - OpenGLState::GetCurState().ResetSampler(handle).Apply(); handle = 0; } @@ -109,7 +122,6 @@ void OGLProgram::Release() { MICROPROFILE_SCOPE(OpenGL_ResourceDeletion); glDeleteProgram(handle); - OpenGLState::GetCurState().ResetProgram(handle).Apply(); handle = 0; } @@ -127,7 
+139,6 @@ void OGLPipeline::Release() { MICROPROFILE_SCOPE(OpenGL_ResourceDeletion); glDeleteProgramPipelines(1, &handle); - OpenGLState::GetCurState().ResetPipeline(handle).Apply(); handle = 0; } @@ -171,24 +182,6 @@ void OGLSync::Release() { handle = 0; } -void OGLVertexArray::Create() { - if (handle != 0) - return; - - MICROPROFILE_SCOPE(OpenGL_ResourceCreation); - glCreateVertexArrays(1, &handle); -} - -void OGLVertexArray::Release() { - if (handle == 0) - return; - - MICROPROFILE_SCOPE(OpenGL_ResourceDeletion); - glDeleteVertexArrays(1, &handle); - OpenGLState::GetCurState().ResetVertexArray(handle).Apply(); - handle = 0; -} - void OGLFramebuffer::Create() { if (handle != 0) return; @@ -203,7 +196,6 @@ void OGLFramebuffer::Release() { MICROPROFILE_SCOPE(OpenGL_ResourceDeletion); glDeleteFramebuffers(1, &handle); - OpenGLState::GetCurState().ResetFramebuffer(handle).Apply(); handle = 0; } diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h index 514d1d165..de93f4212 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.h +++ b/src/video_core/renderer_opengl/gl_resource_manager.h @@ -11,6 +11,31 @@ namespace OpenGL { +class OGLRenderbuffer : private NonCopyable { +public: + OGLRenderbuffer() = default; + + OGLRenderbuffer(OGLRenderbuffer&& o) noexcept : handle(std::exchange(o.handle, 0)) {} + + ~OGLRenderbuffer() { + Release(); + } + + OGLRenderbuffer& operator=(OGLRenderbuffer&& o) noexcept { + Release(); + handle = std::exchange(o.handle, 0); + return *this; + } + + /// Creates a new internal OpenGL resource and stores the handle + void Create(); + + /// Deletes the internal OpenGL resource + void Release(); + + GLuint handle = 0; +}; + class OGLTexture : private NonCopyable { public: OGLTexture() = default; @@ -216,31 +241,6 @@ public: GLsync handle = 0; }; -class OGLVertexArray : private NonCopyable { -public: - OGLVertexArray() = default; - - OGLVertexArray(OGLVertexArray&& o) 
noexcept : handle(std::exchange(o.handle, 0)) {} - - ~OGLVertexArray() { - Release(); - } - - OGLVertexArray& operator=(OGLVertexArray&& o) noexcept { - Release(); - handle = std::exchange(o.handle, 0); - return *this; - } - - /// Creates a new internal OpenGL resource and stores the handle - void Create(); - - /// Deletes the internal OpenGL resource - void Release(); - - GLuint handle = 0; -}; - class OGLFramebuffer : private NonCopyable { public: OGLFramebuffer() = default; diff --git a/src/video_core/renderer_opengl/gl_sampler_cache.cpp b/src/video_core/renderer_opengl/gl_sampler_cache.cpp index 3ded5ecea..5c174879a 100644 --- a/src/video_core/renderer_opengl/gl_sampler_cache.cpp +++ b/src/video_core/renderer_opengl/gl_sampler_cache.cpp @@ -38,7 +38,7 @@ OGLSampler SamplerCacheOpenGL::CreateSampler(const Tegra::Texture::TSCEntry& tsc glSamplerParameterf(sampler_id, GL_TEXTURE_MAX_ANISOTROPY, tsc.GetMaxAnisotropy()); } else if (GLAD_GL_EXT_texture_filter_anisotropic) { glSamplerParameterf(sampler_id, GL_TEXTURE_MAX_ANISOTROPY_EXT, tsc.GetMaxAnisotropy()); - } else if (tsc.GetMaxAnisotropy() != 1) { + } else { LOG_WARNING(Render_OpenGL, "Anisotropy not supported by host GPU driver"); } diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 489eb143c..e3d31c3eb 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -2,12 +2,16 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. 
+#include <atomic> +#include <functional> #include <mutex> #include <optional> #include <string> #include <thread> #include <unordered_set> + #include <boost/functional/hash.hpp> + #include "common/alignment.h" #include "common/assert.h" #include "common/logging/log.h" @@ -22,14 +26,16 @@ #include "video_core/renderer_opengl/gl_shader_cache.h" #include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/renderer_opengl/gl_shader_disk_cache.h" +#include "video_core/renderer_opengl/gl_state_tracker.h" #include "video_core/renderer_opengl/utils.h" +#include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" namespace OpenGL { using Tegra::Engines::ShaderType; -using VideoCommon::Shader::ConstBufferLocker; using VideoCommon::Shader::ProgramCode; +using VideoCommon::Shader::Registry; using VideoCommon::Shader::ShaderIR; namespace { @@ -55,7 +61,7 @@ constexpr bool IsSchedInstruction(std::size_t offset, std::size_t main_offset) { } /// Calculates the size of a program stream -std::size_t CalculateProgramSize(const GLShader::ProgramCode& program) { +std::size_t CalculateProgramSize(const ProgramCode& program) { constexpr std::size_t start_offset = 10; // This is the encoded version of BRA that jumps to itself. All Nvidia // shaders end with one. 
@@ -108,32 +114,9 @@ constexpr GLenum GetGLShaderType(ShaderType shader_type) { } } -/// Describes primitive behavior on geometry shaders -constexpr std::pair<const char*, u32> GetPrimitiveDescription(GLenum primitive_mode) { - switch (primitive_mode) { - case GL_POINTS: - return {"points", 1}; - case GL_LINES: - case GL_LINE_STRIP: - return {"lines", 2}; - case GL_LINES_ADJACENCY: - case GL_LINE_STRIP_ADJACENCY: - return {"lines_adjacency", 4}; - case GL_TRIANGLES: - case GL_TRIANGLE_STRIP: - case GL_TRIANGLE_FAN: - return {"triangles", 3}; - case GL_TRIANGLES_ADJACENCY: - case GL_TRIANGLE_STRIP_ADJACENCY: - return {"triangles_adjacency", 6}; - default: - return {"points", 1}; - } -} - /// Hashes one (or two) program streams u64 GetUniqueIdentifier(ShaderType shader_type, bool is_a, const ProgramCode& code, - const ProgramCode& code_b) { + const ProgramCode& code_b = {}) { u64 unique_identifier = boost::hash_value(code); if (is_a) { // VertexA programs include two programs @@ -142,24 +125,6 @@ u64 GetUniqueIdentifier(ShaderType shader_type, bool is_a, const ProgramCode& co return unique_identifier; } -/// Creates an unspecialized program from code streams -std::string GenerateGLSL(const Device& device, ShaderType shader_type, const ShaderIR& ir, - const std::optional<ShaderIR>& ir_b) { - switch (shader_type) { - case ShaderType::Vertex: - return GLShader::GenerateVertexShader(device, ir, ir_b ? 
&*ir_b : nullptr); - case ShaderType::Geometry: - return GLShader::GenerateGeometryShader(device, ir); - case ShaderType::Fragment: - return GLShader::GenerateFragmentShader(device, ir); - case ShaderType::Compute: - return GLShader::GenerateComputeShader(device, ir); - default: - UNIMPLEMENTED_MSG("Unimplemented shader_type={}", static_cast<u32>(shader_type)); - return {}; - } -} - constexpr const char* GetShaderTypeName(ShaderType shader_type) { switch (shader_type) { case ShaderType::Vertex: @@ -195,102 +160,38 @@ constexpr ShaderType GetShaderType(Maxwell::ShaderProgram program_type) { return {}; } -std::string GetShaderId(u64 unique_identifier, ShaderType shader_type) { +std::string MakeShaderID(u64 unique_identifier, ShaderType shader_type) { return fmt::format("{}{:016X}", GetShaderTypeName(shader_type), unique_identifier); } -Tegra::Engines::ConstBufferEngineInterface& GetConstBufferEngineInterface(Core::System& system, - ShaderType shader_type) { - if (shader_type == ShaderType::Compute) { - return system.GPU().KeplerCompute(); - } else { - return system.GPU().Maxwell3D(); - } -} - -std::unique_ptr<ConstBufferLocker> MakeLocker(Core::System& system, ShaderType shader_type) { - return std::make_unique<ConstBufferLocker>(shader_type, - GetConstBufferEngineInterface(system, shader_type)); -} - -void FillLocker(ConstBufferLocker& locker, const ShaderDiskCacheUsage& usage) { - locker.SetBoundBuffer(usage.bound_buffer); - for (const auto& key : usage.keys) { - const auto [buffer, offset] = key.first; - locker.InsertKey(buffer, offset, key.second); +std::shared_ptr<Registry> MakeRegistry(const ShaderDiskCacheEntry& entry) { + const VideoCore::GuestDriverProfile guest_profile{entry.texture_handler_size}; + const VideoCommon::Shader::SerializedRegistryInfo info{guest_profile, entry.bound_buffer, + entry.graphics_info, entry.compute_info}; + const auto registry = std::make_shared<Registry>(entry.type, info); + for (const auto& [address, value] : entry.keys) { + 
const auto [buffer, offset] = address; + registry->InsertKey(buffer, offset, value); } - for (const auto& [offset, sampler] : usage.bound_samplers) { - locker.InsertBoundSampler(offset, sampler); + for (const auto& [offset, sampler] : entry.bound_samplers) { + registry->InsertBoundSampler(offset, sampler); } - for (const auto& [key, sampler] : usage.bindless_samplers) { + for (const auto& [key, sampler] : entry.bindless_samplers) { const auto [buffer, offset] = key; - locker.InsertBindlessSampler(buffer, offset, sampler); + registry->InsertBindlessSampler(buffer, offset, sampler); } + return registry; } -CachedProgram BuildShader(const Device& device, u64 unique_identifier, ShaderType shader_type, - const ProgramCode& code, const ProgramCode& code_b, - ConstBufferLocker& locker, const ProgramVariant& variant, - bool hint_retrievable = false) { - LOG_INFO(Render_OpenGL, "called. {}", GetShaderId(unique_identifier, shader_type)); - - const bool is_compute = shader_type == ShaderType::Compute; - const u32 main_offset = is_compute ? 
KERNEL_MAIN_OFFSET : STAGE_MAIN_OFFSET; - const ShaderIR ir(code, main_offset, COMPILER_SETTINGS, locker); - std::optional<ShaderIR> ir_b; - if (!code_b.empty()) { - ir_b.emplace(code_b, main_offset, COMPILER_SETTINGS, locker); - } - - std::string source = fmt::format(R"(// {} -#version 430 core -#extension GL_ARB_separate_shader_objects : enable -)", - GetShaderId(unique_identifier, shader_type)); - if (device.HasShaderBallot()) { - source += "#extension GL_ARB_shader_ballot : require\n"; - } - if (device.HasVertexViewportLayer()) { - source += "#extension GL_ARB_shader_viewport_layer_array : require\n"; - } - if (device.HasImageLoadFormatted()) { - source += "#extension GL_EXT_shader_image_load_formatted : require\n"; - } - if (device.HasWarpIntrinsics()) { - source += "#extension GL_NV_gpu_shader5 : require\n" - "#extension GL_NV_shader_thread_group : require\n" - "#extension GL_NV_shader_thread_shuffle : require\n"; - } - // This pragma stops Nvidia's driver from over optimizing math (probably using fp16 operations) - // on places where we don't want to. - // Thanks to Ryujinx for finding this workaround. 
- source += "#pragma optionNV(fastmath off)\n"; - - if (shader_type == ShaderType::Geometry) { - const auto [glsl_topology, max_vertices] = GetPrimitiveDescription(variant.primitive_mode); - source += fmt::format("#define MAX_VERTEX_INPUT {}\n", max_vertices); - source += fmt::format("layout ({}) in;\n", glsl_topology); - } - if (shader_type == ShaderType::Compute) { - if (variant.local_memory_size > 0) { - source += fmt::format("#define LOCAL_MEMORY_SIZE {}\n", - Common::AlignUp(variant.local_memory_size, 4) / 4); - } - source += - fmt::format("layout (local_size_x = {}, local_size_y = {}, local_size_z = {}) in;\n", - variant.block_x, variant.block_y, variant.block_z); - - if (variant.shared_memory_size > 0) { - // shared_memory_size is described in number of words - source += fmt::format("shared uint smem[{}];\n", variant.shared_memory_size); - } - } - - source += '\n'; - source += GenerateGLSL(device, shader_type, ir, ir_b); +std::shared_ptr<OGLProgram> BuildShader(const Device& device, ShaderType shader_type, + u64 unique_identifier, const ShaderIR& ir, + const Registry& registry, bool hint_retrievable = false) { + const std::string shader_id = MakeShaderID(unique_identifier, shader_type); + LOG_INFO(Render_OpenGL, "{}", shader_id); + const std::string glsl = DecompileShader(device, ir, registry, shader_type, shader_id); OGLShader shader; - shader.Create(source.c_str(), GetGLShaderType(shader_type)); + shader.Create(glsl.c_str(), GetGLShaderType(shader_type)); auto program = std::make_shared<OGLProgram>(); program->Create(true, hint_retrievable, shader.handle); @@ -298,7 +199,7 @@ CachedProgram BuildShader(const Device& device, u64 unique_identifier, ShaderTyp } std::unordered_set<GLenum> GetSupportedFormats() { - GLint num_formats{}; + GLint num_formats; glGetIntegerv(GL_NUM_PROGRAM_BINARY_FORMATS, &num_formats); std::vector<GLint> formats(num_formats); @@ -313,115 +214,82 @@ std::unordered_set<GLenum> GetSupportedFormats() { } // Anonymous namespace 
-CachedShader::CachedShader(const ShaderParameters& params, ShaderType shader_type, - GLShader::ShaderEntries entries, ProgramCode code, ProgramCode code_b) - : RasterizerCacheObject{params.host_ptr}, system{params.system}, - disk_cache{params.disk_cache}, device{params.device}, cpu_addr{params.cpu_addr}, - unique_identifier{params.unique_identifier}, shader_type{shader_type}, - entries{std::move(entries)}, code{std::move(code)}, code_b{std::move(code_b)} { - if (!params.precompiled_variants) { - return; - } - for (const auto& pair : *params.precompiled_variants) { - auto locker = MakeLocker(system, shader_type); - const auto& usage = pair->first; - FillLocker(*locker, usage); - - std::unique_ptr<LockerVariant>* locker_variant = nullptr; - const auto it = - std::find_if(locker_variants.begin(), locker_variants.end(), [&](const auto& variant) { - return variant->locker->HasEqualKeys(*locker); - }); - if (it == locker_variants.end()) { - locker_variant = &locker_variants.emplace_back(); - *locker_variant = std::make_unique<LockerVariant>(); - locker_variant->get()->locker = std::move(locker); - } else { - locker_variant = &*it; - } - locker_variant->get()->programs.emplace(usage.variant, pair->second); - } +CachedShader::CachedShader(const u8* host_ptr, VAddr cpu_addr, std::size_t size_in_bytes, + std::shared_ptr<VideoCommon::Shader::Registry> registry, + ShaderEntries entries, std::shared_ptr<OGLProgram> program) + : RasterizerCacheObject{host_ptr}, registry{std::move(registry)}, entries{std::move(entries)}, + cpu_addr{cpu_addr}, size_in_bytes{size_in_bytes}, program{std::move(program)} {} + +CachedShader::~CachedShader() = default; + +GLuint CachedShader::GetHandle() const { + DEBUG_ASSERT(registry->IsConsistent()); + return program->handle; } Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params, Maxwell::ShaderProgram program_type, ProgramCode code, ProgramCode code_b) { const auto shader_type = GetShaderType(program_type); - 
params.disk_cache.SaveRaw( - ShaderDiskCacheRaw(params.unique_identifier, shader_type, code, code_b)); + const std::size_t size_in_bytes = code.size() * sizeof(u64); - ConstBufferLocker locker(shader_type, params.system.GPU().Maxwell3D()); - const ShaderIR ir(code, STAGE_MAIN_OFFSET, COMPILER_SETTINGS, locker); + auto registry = std::make_shared<Registry>(shader_type, params.system.GPU().Maxwell3D()); + const ShaderIR ir(code, STAGE_MAIN_OFFSET, COMPILER_SETTINGS, *registry); // TODO(Rodrigo): Handle VertexA shaders // std::optional<ShaderIR> ir_b; // if (!code_b.empty()) { // ir_b.emplace(code_b, STAGE_MAIN_OFFSET); // } - return std::shared_ptr<CachedShader>(new CachedShader( - params, shader_type, GLShader::GetEntries(ir), std::move(code), std::move(code_b))); + auto program = BuildShader(params.device, shader_type, params.unique_identifier, ir, *registry); + + ShaderDiskCacheEntry entry; + entry.type = shader_type; + entry.code = std::move(code); + entry.code_b = std::move(code_b); + entry.unique_identifier = params.unique_identifier; + entry.bound_buffer = registry->GetBoundBuffer(); + entry.graphics_info = registry->GetGraphicsInfo(); + entry.keys = registry->GetKeys(); + entry.bound_samplers = registry->GetBoundSamplers(); + entry.bindless_samplers = registry->GetBindlessSamplers(); + params.disk_cache.SaveEntry(std::move(entry)); + + return std::shared_ptr<CachedShader>(new CachedShader(params.host_ptr, params.cpu_addr, + size_in_bytes, std::move(registry), + MakeEntries(ir), std::move(program))); } Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code) { - params.disk_cache.SaveRaw( - ShaderDiskCacheRaw(params.unique_identifier, ShaderType::Compute, code)); - - ConstBufferLocker locker(Tegra::Engines::ShaderType::Compute, - params.system.GPU().KeplerCompute()); - const ShaderIR ir(code, KERNEL_MAIN_OFFSET, COMPILER_SETTINGS, locker); - return std::shared_ptr<CachedShader>(new CachedShader( - params, 
ShaderType::Compute, GLShader::GetEntries(ir), std::move(code), {})); + const std::size_t size_in_bytes = code.size() * sizeof(u64); + + auto& engine = params.system.GPU().KeplerCompute(); + auto registry = std::make_shared<Registry>(ShaderType::Compute, engine); + const ShaderIR ir(code, KERNEL_MAIN_OFFSET, COMPILER_SETTINGS, *registry); + const u64 uid = params.unique_identifier; + auto program = BuildShader(params.device, ShaderType::Compute, uid, ir, *registry); + + ShaderDiskCacheEntry entry; + entry.type = ShaderType::Compute; + entry.code = std::move(code); + entry.unique_identifier = uid; + entry.bound_buffer = registry->GetBoundBuffer(); + entry.compute_info = registry->GetComputeInfo(); + entry.keys = registry->GetKeys(); + entry.bound_samplers = registry->GetBoundSamplers(); + entry.bindless_samplers = registry->GetBindlessSamplers(); + params.disk_cache.SaveEntry(std::move(entry)); + + return std::shared_ptr<CachedShader>(new CachedShader(params.host_ptr, params.cpu_addr, + size_in_bytes, std::move(registry), + MakeEntries(ir), std::move(program))); } Shader CachedShader::CreateFromCache(const ShaderParameters& params, - const UnspecializedShader& unspecialized) { - return std::shared_ptr<CachedShader>(new CachedShader(params, unspecialized.type, - unspecialized.entries, unspecialized.code, - unspecialized.code_b)); -} - -GLuint CachedShader::GetHandle(const ProgramVariant& variant) { - EnsureValidLockerVariant(); - - const auto [entry, is_cache_miss] = curr_locker_variant->programs.try_emplace(variant); - auto& program = entry->second; - if (!is_cache_miss) { - return program->handle; - } - - program = BuildShader(device, unique_identifier, shader_type, code, code_b, - *curr_locker_variant->locker, variant); - disk_cache.SaveUsage(GetUsage(variant, *curr_locker_variant->locker)); - - LabelGLObject(GL_PROGRAM, program->handle, cpu_addr); - return program->handle; -} - -bool CachedShader::EnsureValidLockerVariant() { - const auto previous_variant = 
curr_locker_variant; - if (curr_locker_variant && !curr_locker_variant->locker->IsConsistent()) { - curr_locker_variant = nullptr; - } - if (!curr_locker_variant) { - for (auto& variant : locker_variants) { - if (variant->locker->IsConsistent()) { - curr_locker_variant = variant.get(); - } - } - } - if (!curr_locker_variant) { - auto& new_variant = locker_variants.emplace_back(); - new_variant = std::make_unique<LockerVariant>(); - new_variant->locker = MakeLocker(system, shader_type); - curr_locker_variant = new_variant.get(); - } - return previous_variant == curr_locker_variant; -} - -ShaderDiskCacheUsage CachedShader::GetUsage(const ProgramVariant& variant, - const ConstBufferLocker& locker) const { - return ShaderDiskCacheUsage{unique_identifier, variant, - locker.GetBoundBuffer(), locker.GetKeys(), - locker.GetBoundSamplers(), locker.GetBindlessSamplers()}; + const PrecompiledShader& precompiled_shader, + std::size_t size_in_bytes) { + return std::shared_ptr<CachedShader>(new CachedShader( + params.host_ptr, params.cpu_addr, size_in_bytes, precompiled_shader.registry, + precompiled_shader.entries, precompiled_shader.program)); } ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system, @@ -431,16 +299,12 @@ ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback) { - const auto transferable = disk_cache.LoadTransferable(); + const std::optional transferable = disk_cache.LoadTransferable(); if (!transferable) { return; } - const auto [raws, shader_usages] = *transferable; - if (!GenerateUnspecializedShaders(stop_loading, callback, raws) || stop_loading) { - return; - } - const auto dumps = disk_cache.LoadPrecompiled(); + const std::vector gl_cache = disk_cache.LoadPrecompiled(); const auto supported_formats = GetSupportedFormats(); // Track if precompiled cache was altered during 
loading to know if we have to @@ -449,77 +313,82 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, // Inform the frontend about shader build initialization if (callback) { - callback(VideoCore::LoadCallbackStage::Build, 0, shader_usages.size()); + callback(VideoCore::LoadCallbackStage::Build, 0, transferable->size()); } std::mutex mutex; std::size_t built_shaders = 0; // It doesn't have be atomic since it's used behind a mutex - std::atomic_bool compilation_failed = false; + std::atomic_bool gl_cache_failed = false; - const auto Worker = [&](Core::Frontend::GraphicsContext* context, std::size_t begin, - std::size_t end, const std::vector<ShaderDiskCacheUsage>& shader_usages, - const ShaderDumpsMap& dumps) { + const auto find_precompiled = [&gl_cache](u64 id) { + return std::find_if(gl_cache.begin(), gl_cache.end(), + [id](const auto& entry) { return entry.unique_identifier == id; }); + }; + + const auto worker = [&](Core::Frontend::GraphicsContext* context, std::size_t begin, + std::size_t end) { context->MakeCurrent(); SCOPE_EXIT({ return context->DoneCurrent(); }); for (std::size_t i = begin; i < end; ++i) { - if (stop_loading || compilation_failed) { + if (stop_loading) { return; } - const auto& usage{shader_usages[i]}; - const auto& unspecialized{unspecialized_shaders.at(usage.unique_identifier)}; - const auto dump{dumps.find(usage)}; - - CachedProgram shader; - if (dump != dumps.end()) { - // If the shader is dumped, attempt to load it with - shader = GeneratePrecompiledProgram(dump->second, supported_formats); - if (!shader) { - compilation_failed = true; - return; + const auto& entry = (*transferable)[i]; + const u64 uid = entry.unique_identifier; + const auto it = find_precompiled(uid); + const auto precompiled_entry = it != gl_cache.end() ? &*it : nullptr; + + const bool is_compute = entry.type == ShaderType::Compute; + const u32 main_offset = is_compute ? 
KERNEL_MAIN_OFFSET : STAGE_MAIN_OFFSET; + auto registry = MakeRegistry(entry); + const ShaderIR ir(entry.code, main_offset, COMPILER_SETTINGS, *registry); + + std::shared_ptr<OGLProgram> program; + if (precompiled_entry) { + // If the shader is precompiled, attempt to load it with + program = GeneratePrecompiledProgram(entry, *precompiled_entry, supported_formats); + if (!program) { + gl_cache_failed = true; } } - if (!shader) { - auto locker{MakeLocker(system, unspecialized.type)}; - FillLocker(*locker, usage); - - shader = BuildShader(device, usage.unique_identifier, unspecialized.type, - unspecialized.code, unspecialized.code_b, *locker, - usage.variant, true); + if (!program) { + // Otherwise compile it from GLSL + program = BuildShader(device, entry.type, uid, ir, *registry, true); } + PrecompiledShader shader; + shader.program = std::move(program); + shader.registry = std::move(registry); + shader.entries = MakeEntries(ir); + std::scoped_lock lock{mutex}; if (callback) { callback(VideoCore::LoadCallbackStage::Build, ++built_shaders, - shader_usages.size()); + transferable->size()); } - - precompiled_programs.emplace(usage, std::move(shader)); - - // TODO(Rodrigo): Is there a better way to do this? - precompiled_variants[usage.unique_identifier].push_back( - precompiled_programs.find(usage)); + runtime_cache.emplace(entry.unique_identifier, std::move(shader)); } }; const auto num_workers{static_cast<std::size_t>(std::thread::hardware_concurrency() + 1ULL)}; - const std::size_t bucket_size{shader_usages.size() / num_workers}; + const std::size_t bucket_size{transferable->size() / num_workers}; std::vector<std::unique_ptr<Core::Frontend::GraphicsContext>> contexts(num_workers); std::vector<std::thread> threads(num_workers); for (std::size_t i = 0; i < num_workers; ++i) { const bool is_last_worker = i + 1 == num_workers; const std::size_t start{bucket_size * i}; - const std::size_t end{is_last_worker ? 
shader_usages.size() : start + bucket_size}; + const std::size_t end{is_last_worker ? transferable->size() : start + bucket_size}; // On some platforms the shared context has to be created from the GUI thread contexts[i] = emu_window.CreateSharedContext(); - threads[i] = std::thread(Worker, contexts[i].get(), start, end, shader_usages, dumps); + threads[i] = std::thread(worker, contexts[i].get(), start, end); } for (auto& thread : threads) { thread.join(); } - if (compilation_failed) { + if (gl_cache_failed) { // Invalidate the precompiled cache if a shader dumped shader was rejected disk_cache.InvalidatePrecompiled(); precompiled_cache_altered = true; @@ -532,11 +401,12 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, // TODO(Rodrigo): Do state tracking for transferable shaders and do a dummy draw // before precompiling them - for (std::size_t i = 0; i < shader_usages.size(); ++i) { - const auto& usage{shader_usages[i]}; - if (dumps.find(usage) == dumps.end()) { - const auto& program{precompiled_programs.at(usage)}; - disk_cache.SaveDump(usage, program->handle); + for (std::size_t i = 0; i < transferable->size(); ++i) { + const u64 id = (*transferable)[i].unique_identifier; + const auto it = find_precompiled(id); + if (it == gl_cache.end()) { + const GLuint program = runtime_cache.at(id).program->handle; + disk_cache.SavePrecompiled(id, program); precompiled_cache_altered = true; } } @@ -546,84 +416,33 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, } } -const PrecompiledVariants* ShaderCacheOpenGL::GetPrecompiledVariants(u64 unique_identifier) const { - const auto it = precompiled_variants.find(unique_identifier); - return it == precompiled_variants.end() ? 
nullptr : &it->second; -} - -CachedProgram ShaderCacheOpenGL::GeneratePrecompiledProgram( - const ShaderDiskCacheDump& dump, const std::unordered_set<GLenum>& supported_formats) { - if (supported_formats.find(dump.binary_format) == supported_formats.end()) { - LOG_INFO(Render_OpenGL, "Precompiled cache entry with unsupported format - removing"); +std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram( + const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry, + const std::unordered_set<GLenum>& supported_formats) { + if (supported_formats.find(precompiled_entry.binary_format) == supported_formats.end()) { + LOG_INFO(Render_OpenGL, "Precompiled cache entry with unsupported format, removing"); return {}; } - CachedProgram shader = std::make_shared<OGLProgram>(); - shader->handle = glCreateProgram(); - glProgramParameteri(shader->handle, GL_PROGRAM_SEPARABLE, GL_TRUE); - glProgramBinary(shader->handle, dump.binary_format, dump.binary.data(), - static_cast<GLsizei>(dump.binary.size())); - - GLint link_status{}; - glGetProgramiv(shader->handle, GL_LINK_STATUS, &link_status); + auto program = std::make_shared<OGLProgram>(); + program->handle = glCreateProgram(); + glProgramParameteri(program->handle, GL_PROGRAM_SEPARABLE, GL_TRUE); + glProgramBinary(program->handle, precompiled_entry.binary_format, + precompiled_entry.binary.data(), + static_cast<GLsizei>(precompiled_entry.binary.size())); + + GLint link_status; + glGetProgramiv(program->handle, GL_LINK_STATUS, &link_status); if (link_status == GL_FALSE) { - LOG_INFO(Render_OpenGL, "Precompiled cache rejected by the driver - removing"); + LOG_INFO(Render_OpenGL, "Precompiled cache rejected by the driver, removing"); return {}; } - return shader; -} - -bool ShaderCacheOpenGL::GenerateUnspecializedShaders( - const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback, - const std::vector<ShaderDiskCacheRaw>& raws) { - if (callback) { - 
callback(VideoCore::LoadCallbackStage::Decompile, 0, raws.size()); - } - - for (std::size_t i = 0; i < raws.size(); ++i) { - if (stop_loading) { - return false; - } - const auto& raw{raws[i]}; - const u64 unique_identifier{raw.GetUniqueIdentifier()}; - const u64 calculated_hash{ - GetUniqueIdentifier(raw.GetType(), raw.HasProgramA(), raw.GetCode(), raw.GetCodeB())}; - if (unique_identifier != calculated_hash) { - LOG_ERROR(Render_OpenGL, - "Invalid hash in entry={:016x} (obtained hash={:016x}) - " - "removing shader cache", - raw.GetUniqueIdentifier(), calculated_hash); - disk_cache.InvalidateTransferable(); - return false; - } - - const u32 main_offset = - raw.GetType() == ShaderType::Compute ? KERNEL_MAIN_OFFSET : STAGE_MAIN_OFFSET; - ConstBufferLocker locker(raw.GetType()); - const ShaderIR ir(raw.GetCode(), main_offset, COMPILER_SETTINGS, locker); - // TODO(Rodrigo): Handle VertexA shaders - // std::optional<ShaderIR> ir_b; - // if (raw.HasProgramA()) { - // ir_b.emplace(raw.GetProgramCodeB(), main_offset); - // } - - UnspecializedShader unspecialized; - unspecialized.entries = GLShader::GetEntries(ir); - unspecialized.type = raw.GetType(); - unspecialized.code = raw.GetCode(); - unspecialized.code_b = raw.GetCodeB(); - unspecialized_shaders.emplace(raw.GetUniqueIdentifier(), unspecialized); - - if (callback) { - callback(VideoCore::LoadCallbackStage::Decompile, i, raws.size()); - } - } - return true; + return program; } Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { - if (!system.GPU().Maxwell3D().dirty.shaders) { + if (!system.GPU().Maxwell3D().dirty.flags[Dirty::Shaders]) { return last_shaders[static_cast<std::size_t>(program)]; } @@ -647,17 +466,17 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { const auto unique_identifier = GetUniqueIdentifier( GetShaderType(program), program == Maxwell::ShaderProgram::VertexA, code, code_b); - const auto precompiled_variants = 
GetPrecompiledVariants(unique_identifier); const auto cpu_addr{*memory_manager.GpuToCpuAddress(address)}; - const ShaderParameters params{system, disk_cache, precompiled_variants, device, + const ShaderParameters params{system, disk_cache, device, cpu_addr, host_ptr, unique_identifier}; - const auto found = unspecialized_shaders.find(unique_identifier); - if (found == unspecialized_shaders.end()) { + const auto found = runtime_cache.find(unique_identifier); + if (found == runtime_cache.end()) { shader = CachedShader::CreateStageFromMemory(params, program, std::move(code), std::move(code_b)); } else { - shader = CachedShader::CreateFromCache(params, found->second); + const std::size_t size_in_bytes = code.size() * sizeof(u64); + shader = CachedShader::CreateFromCache(params, found->second, size_in_bytes); } Register(shader); @@ -672,19 +491,19 @@ Shader ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) { return kernel; } - // No kernel found - create a new one + // No kernel found, create a new one auto code{GetShaderCode(memory_manager, code_addr, host_ptr)}; - const auto unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code, {})}; - const auto precompiled_variants = GetPrecompiledVariants(unique_identifier); + const auto unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)}; const auto cpu_addr{*memory_manager.GpuToCpuAddress(code_addr)}; - const ShaderParameters params{system, disk_cache, precompiled_variants, device, + const ShaderParameters params{system, disk_cache, device, cpu_addr, host_ptr, unique_identifier}; - const auto found = unspecialized_shaders.find(unique_identifier); - if (found == unspecialized_shaders.end()) { + const auto found = runtime_cache.find(unique_identifier); + if (found == runtime_cache.end()) { kernel = CachedShader::CreateKernelFromMemory(params, std::move(code)); } else { - kernel = CachedShader::CreateFromCache(params, found->second); + const std::size_t size_in_bytes = code.size() * 
sizeof(u64); + kernel = CachedShader::CreateFromCache(params, found->second, size_in_bytes); } Register(kernel); diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index 7b1470db3..4935019fc 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -22,7 +22,7 @@ #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/renderer_opengl/gl_shader_disk_cache.h" -#include "video_core/shader/const_buffer_locker.h" +#include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" namespace Core { @@ -41,22 +41,17 @@ class RasterizerOpenGL; struct UnspecializedShader; using Shader = std::shared_ptr<CachedShader>; -using CachedProgram = std::shared_ptr<OGLProgram>; using Maxwell = Tegra::Engines::Maxwell3D::Regs; -using PrecompiledPrograms = std::unordered_map<ShaderDiskCacheUsage, CachedProgram>; -using PrecompiledVariants = std::vector<PrecompiledPrograms::iterator>; - -struct UnspecializedShader { - GLShader::ShaderEntries entries; - Tegra::Engines::ShaderType type; - ProgramCode code; - ProgramCode code_b; + +struct PrecompiledShader { + std::shared_ptr<OGLProgram> program; + std::shared_ptr<VideoCommon::Shader::Registry> registry; + ShaderEntries entries; }; struct ShaderParameters { Core::System& system; ShaderDiskCacheOpenGL& disk_cache; - const PrecompiledVariants* precompiled_variants; const Device& device; VAddr cpu_addr; u8* host_ptr; @@ -65,61 +60,45 @@ struct ShaderParameters { class CachedShader final : public RasterizerCacheObject { public: - static Shader CreateStageFromMemory(const ShaderParameters& params, - Maxwell::ShaderProgram program_type, - ProgramCode program_code, ProgramCode program_code_b); - static Shader CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code); + ~CachedShader(); - static Shader 
CreateFromCache(const ShaderParameters& params, - const UnspecializedShader& unspecialized); + /// Gets the GL program handle for the shader + GLuint GetHandle() const; + /// Returns the guest CPU address of the shader VAddr GetCpuAddr() const override { return cpu_addr; } + /// Returns the size in bytes of the shader std::size_t GetSizeInBytes() const override { - return code.size() * sizeof(u64); + return size_in_bytes; } /// Gets the shader entries for the shader - const GLShader::ShaderEntries& GetShaderEntries() const { + const ShaderEntries& GetEntries() const { return entries; } - /// Gets the GL program handle for the shader - GLuint GetHandle(const ProgramVariant& variant); - -private: - struct LockerVariant { - std::unique_ptr<VideoCommon::Shader::ConstBufferLocker> locker; - std::unordered_map<ProgramVariant, CachedProgram> programs; - }; - - explicit CachedShader(const ShaderParameters& params, Tegra::Engines::ShaderType shader_type, - GLShader::ShaderEntries entries, ProgramCode program_code, - ProgramCode program_code_b); - - bool EnsureValidLockerVariant(); - - ShaderDiskCacheUsage GetUsage(const ProgramVariant& variant, - const VideoCommon::Shader::ConstBufferLocker& locker) const; - - Core::System& system; - ShaderDiskCacheOpenGL& disk_cache; - const Device& device; - - VAddr cpu_addr{}; - - u64 unique_identifier{}; - Tegra::Engines::ShaderType shader_type{}; - - GLShader::ShaderEntries entries; + static Shader CreateStageFromMemory(const ShaderParameters& params, + Maxwell::ShaderProgram program_type, + ProgramCode program_code, ProgramCode program_code_b); + static Shader CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code); - ProgramCode code; - ProgramCode code_b; + static Shader CreateFromCache(const ShaderParameters& params, + const PrecompiledShader& precompiled_shader, + std::size_t size_in_bytes); - LockerVariant* curr_locker_variant = nullptr; - std::vector<std::unique_ptr<LockerVariant>> locker_variants; +private: + 
explicit CachedShader(const u8* host_ptr, VAddr cpu_addr, std::size_t size_in_bytes, + std::shared_ptr<VideoCommon::Shader::Registry> registry, + ShaderEntries entries, std::shared_ptr<OGLProgram> program); + + std::shared_ptr<VideoCommon::Shader::Registry> registry; + ShaderEntries entries; + VAddr cpu_addr = 0; + std::size_t size_in_bytes = 0; + std::shared_ptr<OGLProgram> program; }; class ShaderCacheOpenGL final : public RasterizerCache<Shader> { @@ -142,25 +121,15 @@ protected: void FlushObjectInner(const Shader& object) override {} private: - bool GenerateUnspecializedShaders(const std::atomic_bool& stop_loading, - const VideoCore::DiskResourceLoadCallback& callback, - const std::vector<ShaderDiskCacheRaw>& raws); - - CachedProgram GeneratePrecompiledProgram(const ShaderDiskCacheDump& dump, - const std::unordered_set<GLenum>& supported_formats); - - const PrecompiledVariants* GetPrecompiledVariants(u64 unique_identifier) const; + std::shared_ptr<OGLProgram> GeneratePrecompiledProgram( + const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry, + const std::unordered_set<GLenum>& supported_formats); Core::System& system; Core::Frontend::EmuWindow& emu_window; const Device& device; - ShaderDiskCacheOpenGL disk_cache; - - PrecompiledPrograms precompiled_programs; - std::unordered_map<u64, PrecompiledVariants> precompiled_variants; - - std::unordered_map<u64, UnspecializedShader> unspecialized_shaders; + std::unordered_map<u64, PrecompiledShader> runtime_cache; std::array<Shader, Maxwell::MaxShaderProgram> last_shaders; }; diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index 4735000b5..8aa4a7ac9 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -23,8 +23,9 @@ #include "video_core/shader/ast.h" #include "video_core/shader/node.h" #include "video_core/shader/shader_ir.h" 
+#include "video_core/shader/transform_feedback.h" -namespace OpenGL::GLShader { +namespace OpenGL { namespace { @@ -36,6 +37,8 @@ using Tegra::Shader::IpaInterpMode; using Tegra::Shader::IpaMode; using Tegra::Shader::IpaSampleMode; using Tegra::Shader::Register; +using VideoCommon::Shader::BuildTransformFeedback; +using VideoCommon::Shader::Registry; using namespace std::string_literals; using namespace VideoCommon::Shader; @@ -48,6 +51,11 @@ class ExprDecompiler; enum class Type { Void, Bool, Bool2, Float, Int, Uint, HalfFloat }; +constexpr std::array FLOAT_TYPES{"float", "vec2", "vec3", "vec4"}; + +constexpr std::string_view INPUT_ATTRIBUTE_NAME = "in_attr"; +constexpr std::string_view OUTPUT_ATTRIBUTE_NAME = "out_attr"; + struct TextureOffset {}; struct TextureDerivates {}; using TextureArgument = std::pair<Type, Node>; @@ -56,6 +64,25 @@ using TextureIR = std::variant<TextureOffset, TextureDerivates, TextureArgument> constexpr u32 MAX_CONSTBUFFER_ELEMENTS = static_cast<u32>(Maxwell::MaxConstBufferSize) / (4 * sizeof(float)); +constexpr std::string_view CommonDeclarations = R"(#define ftoi floatBitsToInt +#define ftou floatBitsToUint +#define itof intBitsToFloat +#define utof uintBitsToFloat + +bvec2 HalfFloatNanComparison(bvec2 comparison, vec2 pair1, vec2 pair2) {{ + bvec2 is_nan1 = isnan(pair1); + bvec2 is_nan2 = isnan(pair2); + return bvec2(comparison.x || is_nan1.x || is_nan2.x, comparison.y || is_nan1.y || is_nan2.y); +}} + +const float fswzadd_modifiers_a[] = float[4](-1.0f, 1.0f, -1.0f, 0.0f ); +const float fswzadd_modifiers_b[] = float[4](-1.0f, -1.0f, 1.0f, -1.0f ); + +layout (std140, binding = {}) uniform vs_config {{ + float y_direction; +}}; +)"; + class ShaderWriter final { public: void AddExpression(std::string_view text) { @@ -269,12 +296,41 @@ const char* GetImageTypeDeclaration(Tegra::Shader::ImageType image_type) { } } +/// Describes primitive behavior on geometry shaders +std::pair<const char*, u32> 
GetPrimitiveDescription(Maxwell::PrimitiveTopology topology) { + switch (topology) { + case Maxwell::PrimitiveTopology::Points: + return {"points", 1}; + case Maxwell::PrimitiveTopology::Lines: + case Maxwell::PrimitiveTopology::LineStrip: + return {"lines", 2}; + case Maxwell::PrimitiveTopology::LinesAdjacency: + case Maxwell::PrimitiveTopology::LineStripAdjacency: + return {"lines_adjacency", 4}; + case Maxwell::PrimitiveTopology::Triangles: + case Maxwell::PrimitiveTopology::TriangleStrip: + case Maxwell::PrimitiveTopology::TriangleFan: + return {"triangles", 3}; + case Maxwell::PrimitiveTopology::TrianglesAdjacency: + case Maxwell::PrimitiveTopology::TriangleStripAdjacency: + return {"triangles_adjacency", 6}; + default: + UNIMPLEMENTED_MSG("topology={}", static_cast<int>(topology)); + return {"points", 1}; + } +} + /// Generates code to use for a swizzle operation. -constexpr const char* GetSwizzle(u32 element) { +constexpr const char* GetSwizzle(std::size_t element) { constexpr std::array swizzle = {".x", ".y", ".z", ".w"}; return swizzle.at(element); } +constexpr const char* GetColorSwizzle(std::size_t element) { + constexpr std::array swizzle = {".r", ".g", ".b", ".a"}; + return swizzle.at(element); +} + /// Translate topology std::string GetTopologyName(Tegra::Shader::OutputTopology topology) { switch (topology) { @@ -310,10 +366,19 @@ constexpr bool IsGenericAttribute(Attribute::Index index) { return index >= Attribute::Index::Attribute_0 && index <= Attribute::Index::Attribute_31; } +constexpr bool IsLegacyTexCoord(Attribute::Index index) { + return static_cast<int>(index) >= static_cast<int>(Attribute::Index::TexCoord_0) && + static_cast<int>(index) <= static_cast<int>(Attribute::Index::TexCoord_7); +} + constexpr Attribute::Index ToGenericAttribute(u64 value) { return static_cast<Attribute::Index>(value + static_cast<u64>(Attribute::Index::Attribute_0)); } +constexpr int GetLegacyTexCoordIndex(Attribute::Index index) { + return static_cast<int>(index) 
- static_cast<int>(Attribute::Index::TexCoord_0); +} + u32 GetGenericAttributeIndex(Attribute::Index index) { ASSERT(IsGenericAttribute(index)); return static_cast<u32>(index) - static_cast<u32>(Attribute::Index::Attribute_0); @@ -337,15 +402,66 @@ std::string FlowStackTopName(MetaStackClass stack) { return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack)); } -[[deprecated]] constexpr bool IsVertexShader(ShaderType stage) { - return stage == ShaderType::Vertex; -} +struct GenericVaryingDescription { + std::string name; + u8 first_element = 0; + bool is_scalar = false; +}; class GLSLDecompiler final { public: - explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ShaderType stage, - std::string suffix) - : device{device}, ir{ir}, stage{stage}, suffix{suffix}, header{ir.GetHeader()} {} + explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry, + ShaderType stage, std::string_view identifier, std::string_view suffix) + : device{device}, ir{ir}, registry{registry}, stage{stage}, + identifier{identifier}, suffix{suffix}, header{ir.GetHeader()} { + if (stage != ShaderType::Compute) { + transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo()); + } + } + + void Decompile() { + DeclareHeader(); + DeclareVertex(); + DeclareGeometry(); + DeclareFragment(); + DeclareCompute(); + DeclareInputAttributes(); + DeclareOutputAttributes(); + DeclareImages(); + DeclareSamplers(); + DeclareGlobalMemory(); + DeclareConstantBuffers(); + DeclareLocalMemory(); + DeclareRegisters(); + DeclarePredicates(); + DeclareInternalFlags(); + DeclareCustomVariables(); + DeclarePhysicalAttributeReader(); + + code.AddLine("void main() {{"); + ++code.scope; + + if (stage == ShaderType::Vertex) { + code.AddLine("gl_Position = vec4(0.0f, 0.0f, 0.0f, 1.0f);"); + } + + if (ir.IsDecompiled()) { + DecompileAST(); + } else { + DecompileBranchMode(); + } + + --code.scope; + code.AddLine("}}"); + } + + std::string GetResult() { + return 
code.GetResult(); + } + +private: + friend class ASTDecompiler; + friend class ExprDecompiler; void DecompileBranchMode() { // VM's program counter @@ -387,46 +503,40 @@ public: void DecompileAST(); - void Decompile() { - DeclareVertex(); - DeclareGeometry(); - DeclareRegisters(); - DeclareCustomVariables(); - DeclarePredicates(); - DeclareLocalMemory(); - DeclareInternalFlags(); - DeclareInputAttributes(); - DeclareOutputAttributes(); - DeclareConstantBuffers(); - DeclareGlobalMemory(); - DeclareSamplers(); - DeclareImages(); - DeclarePhysicalAttributeReader(); - - code.AddLine("void execute_{}() {{", suffix); - ++code.scope; - - if (ir.IsDecompiled()) { - DecompileAST(); - } else { - DecompileBranchMode(); + void DeclareHeader() { + if (!identifier.empty()) { + code.AddLine("// {}", identifier); + } + code.AddLine("#version 440 {}", ir.UsesLegacyVaryings() ? "compatibility" : "core"); + code.AddLine("#extension GL_ARB_separate_shader_objects : enable"); + if (device.HasShaderBallot()) { + code.AddLine("#extension GL_ARB_shader_ballot : require"); + } + if (device.HasVertexViewportLayer()) { + code.AddLine("#extension GL_ARB_shader_viewport_layer_array : require"); + } + if (device.HasImageLoadFormatted()) { + code.AddLine("#extension GL_EXT_shader_image_load_formatted : require"); } + if (device.HasWarpIntrinsics()) { + code.AddLine("#extension GL_NV_gpu_shader5 : require"); + code.AddLine("#extension GL_NV_shader_thread_group : require"); + code.AddLine("#extension GL_NV_shader_thread_shuffle : require"); + } + // This pragma stops Nvidia's driver from over optimizing math (probably using fp16 + // operations) on places where we don't want to. + // Thanks to Ryujinx for finding this workaround. 
+ code.AddLine("#pragma optionNV(fastmath off)"); - --code.scope; - code.AddLine("}}"); - } + code.AddNewLine(); - std::string GetResult() { - return code.GetResult(); + code.AddLine(CommonDeclarations, EmulationUniformBlockBinding); } -private: - friend class ASTDecompiler; - friend class ExprDecompiler; - void DeclareVertex() { - if (!IsVertexShader(stage)) + if (stage != ShaderType::Vertex) { return; + } DeclareVertexRedeclarations(); } @@ -436,9 +546,15 @@ private: return; } + const auto& info = registry.GetGraphicsInfo(); + const auto input_topology = info.primitive_topology; + const auto [glsl_topology, max_vertices] = GetPrimitiveDescription(input_topology); + max_input_vertices = max_vertices; + code.AddLine("layout ({}) in;", glsl_topology); + const auto topology = GetTopologyName(header.common3.output_topology); - const auto max_vertices = header.common4.max_output_vertices.Value(); - code.AddLine("layout ({}, max_vertices = {}) out;", topology, max_vertices); + const auto max_output_vertices = header.common4.max_output_vertices.Value(); + code.AddLine("layout ({}, max_vertices = {}) out;", topology, max_output_vertices); code.AddNewLine(); code.AddLine("in gl_PerVertex {{"); @@ -450,11 +566,50 @@ private: DeclareVertexRedeclarations(); } + void DeclareFragment() { + if (stage != ShaderType::Fragment) { + return; + } + if (ir.UsesLegacyVaryings()) { + code.AddLine("in gl_PerFragment {{"); + ++code.scope; + code.AddLine("vec4 gl_TexCoord[8];"); + code.AddLine("vec4 gl_Color;"); + code.AddLine("vec4 gl_SecondaryColor;"); + --code.scope; + code.AddLine("}};"); + } + + for (u32 rt = 0; rt < Maxwell::NumRenderTargets; ++rt) { + code.AddLine("layout (location = {}) out vec4 frag_color{};", rt, rt); + } + } + + void DeclareCompute() { + if (stage != ShaderType::Compute) { + return; + } + const auto& info = registry.GetComputeInfo(); + if (const u32 size = info.shared_memory_size_in_words; size > 0) { + code.AddLine("shared uint smem[{}];", size); + 
code.AddNewLine(); + } + code.AddLine("layout (local_size_x = {}, local_size_y = {}, local_size_z = {}) in;", + info.workgroup_size[0], info.workgroup_size[1], info.workgroup_size[2]); + code.AddNewLine(); + } + void DeclareVertexRedeclarations() { code.AddLine("out gl_PerVertex {{"); ++code.scope; - code.AddLine("vec4 gl_Position;"); + auto pos_xfb = GetTransformFeedbackDecoration(Attribute::Index::Position); + if (!pos_xfb.empty()) { + pos_xfb = fmt::format("layout ({}) ", pos_xfb); + } + const char* pos_type = + FLOAT_TYPES.at(GetNumComponents(Attribute::Index::Position).value_or(4) - 1); + code.AddLine("{}{} gl_Position;", pos_xfb, pos_type); for (const auto attribute : ir.GetOutputAttributes()) { if (attribute == Attribute::Index::ClipDistances0123 || @@ -463,14 +618,14 @@ private: break; } } - if (!IsVertexShader(stage) || device.HasVertexViewportLayer()) { + if (stage != ShaderType::Vertex || device.HasVertexViewportLayer()) { if (ir.UsesLayer()) { code.AddLine("int gl_Layer;"); } if (ir.UsesViewportIndex()) { code.AddLine("int gl_ViewportIndex;"); } - } else if ((ir.UsesLayer() || ir.UsesViewportIndex()) && IsVertexShader(stage) && + } else if ((ir.UsesLayer() || ir.UsesViewportIndex()) && stage == ShaderType::Vertex && !device.HasVertexViewportLayer()) { LOG_ERROR( Render_OpenGL, @@ -481,12 +636,12 @@ private: code.AddLine("float gl_PointSize;"); } - if (ir.UsesInstanceId()) { - code.AddLine("int gl_InstanceID;"); - } - - if (ir.UsesVertexId()) { - code.AddLine("int gl_VertexID;"); + if (ir.UsesLegacyVaryings()) { + code.AddLine("vec4 gl_TexCoord[8];"); + code.AddLine("vec4 gl_FrontColor;"); + code.AddLine("vec4 gl_FrontSecondaryColor;"); + code.AddLine("vec4 gl_BackColor;"); + code.AddLine("vec4 gl_BackSecondaryColor;"); } --code.scope; @@ -525,18 +680,16 @@ private: } void DeclareLocalMemory() { + u64 local_memory_size = 0; if (stage == ShaderType::Compute) { - code.AddLine("#ifdef LOCAL_MEMORY_SIZE"); - code.AddLine("uint {}[LOCAL_MEMORY_SIZE];", 
GetLocalMemory()); - code.AddLine("#endif"); - return; + local_memory_size = registry.GetComputeInfo().local_memory_size_in_words * 4ULL; + } else { + local_memory_size = header.GetLocalMemorySize(); } - - const u64 local_memory_size = header.GetLocalMemorySize(); if (local_memory_size == 0) { return; } - const auto element_count = Common::AlignUp(local_memory_size, 4) / 4; + const u64 element_count = Common::AlignUp(local_memory_size, 4) / 4; code.AddLine("uint {}[{}];", GetLocalMemory(), element_count); code.AddNewLine(); } @@ -589,7 +742,7 @@ private: void DeclareInputAttribute(Attribute::Index index, bool skip_unused) { const u32 location{GetGenericAttributeIndex(index)}; - std::string name{GetInputAttribute(index)}; + std::string name{GetGenericInputAttribute(index)}; if (stage == ShaderType::Geometry) { name = "gs_" + name + "[]"; } @@ -626,9 +779,59 @@ private: } } + std::optional<std::size_t> GetNumComponents(Attribute::Index index, u8 element = 0) const { + const u8 location = static_cast<u8>(static_cast<u32>(index) * 4 + element); + const auto it = transform_feedback.find(location); + if (it == transform_feedback.end()) { + return {}; + } + return it->second.components; + } + + std::string GetTransformFeedbackDecoration(Attribute::Index index, u8 element = 0) const { + const u8 location = static_cast<u8>(static_cast<u32>(index) * 4 + element); + const auto it = transform_feedback.find(location); + if (it == transform_feedback.end()) { + return {}; + } + + const VaryingTFB& tfb = it->second; + return fmt::format("xfb_buffer = {}, xfb_offset = {}, xfb_stride = {}", tfb.buffer, + tfb.offset, tfb.stride); + } + void DeclareOutputAttribute(Attribute::Index index) { - const u32 location{GetGenericAttributeIndex(index)}; - code.AddLine("layout (location = {}) out vec4 {};", location, GetOutputAttribute(index)); + static constexpr std::string_view swizzle = "xyzw"; + u8 element = 0; + while (element < 4) { + auto xfb = GetTransformFeedbackDecoration(index, 
element); + if (!xfb.empty()) { + xfb = fmt::format(", {}", xfb); + } + const std::size_t remainder = 4 - element; + const std::size_t num_components = GetNumComponents(index, element).value_or(remainder); + const char* const type = FLOAT_TYPES.at(num_components - 1); + + const u32 location = GetGenericAttributeIndex(index); + + GenericVaryingDescription description; + description.first_element = static_cast<u8>(element); + description.is_scalar = num_components == 1; + description.name = AppendSuffix(location, OUTPUT_ATTRIBUTE_NAME); + if (element != 0 || num_components != 4) { + const std::string_view name_swizzle = swizzle.substr(element, num_components); + description.name = fmt::format("{}_{}", description.name, name_swizzle); + } + for (std::size_t i = 0; i < num_components; ++i) { + const u8 offset = static_cast<u8>(location * 4 + element + i); + varying_description.insert({offset, description}); + } + + code.AddLine("layout (location = {}, component = {}{}) out {} {};", location, element, + xfb, type, description.name); + + element = static_cast<u8>(static_cast<std::size_t>(element) + num_components); + } } void DeclareConstantBuffers() { @@ -925,7 +1128,8 @@ private: // TODO(Rodrigo): Guard geometry inputs against out of bound reads. Some games // set an 0x80000000 index for those and the shader fails to build. Find out why // this happens and what's its intent. 
- return fmt::format("gs_{}[{} % MAX_VERTEX_INPUT]", name, Visit(buffer).AsUint()); + return fmt::format("gs_{}[{} % {}]", name, Visit(buffer).AsUint(), + max_input_vertices.value()); } return std::string(name); }; @@ -943,6 +1147,10 @@ private: default: UNREACHABLE(); } + case Attribute::Index::FrontColor: + return {"gl_Color"s + GetSwizzle(element), Type::Float}; + case Attribute::Index::FrontSecondaryColor: + return {"gl_SecondaryColor"s + GetSwizzle(element), Type::Float}; case Attribute::Index::PointCoord: switch (element) { case 0: @@ -959,7 +1167,7 @@ private: // TODO(Subv): Find out what the values are for the first two elements when inside a // vertex shader, and what's the value of the fourth element when inside a Tess Eval // shader. - ASSERT(IsVertexShader(stage)); + ASSERT(stage == ShaderType::Vertex); switch (element) { case 2: // Config pack's first value is instance_id. @@ -980,7 +1188,13 @@ private: return {"0", Type::Int}; default: if (IsGenericAttribute(attribute)) { - return {GeometryPass(GetInputAttribute(attribute)) + GetSwizzle(element), + return {GeometryPass(GetGenericInputAttribute(attribute)) + GetSwizzle(element), + Type::Float}; + } + if (IsLegacyTexCoord(attribute)) { + UNIMPLEMENTED_IF(stage == ShaderType::Geometry); + return {fmt::format("gl_TexCoord[{}]{}", GetLegacyTexCoordIndex(attribute), + GetSwizzle(element)), Type::Float}; } break; @@ -1021,21 +1235,22 @@ private: } std::optional<Expression> GetOutputAttribute(const AbufNode* abuf) { + const u32 element = abuf->GetElement(); switch (const auto attribute = abuf->GetIndex()) { case Attribute::Index::Position: - return {{"gl_Position"s + GetSwizzle(abuf->GetElement()), Type::Float}}; + return {{"gl_Position"s + GetSwizzle(element), Type::Float}}; case Attribute::Index::LayerViewportPointSize: - switch (abuf->GetElement()) { + switch (element) { case 0: UNIMPLEMENTED(); return {}; case 1: - if (IsVertexShader(stage) && !device.HasVertexViewportLayer()) { + if (stage == 
ShaderType::Vertex && !device.HasVertexViewportLayer()) { return {}; } return {{"gl_Layer", Type::Int}}; case 2: - if (IsVertexShader(stage) && !device.HasVertexViewportLayer()) { + if (stage == ShaderType::Vertex && !device.HasVertexViewportLayer()) { return {}; } return {{"gl_ViewportIndex", Type::Int}}; @@ -1043,14 +1258,26 @@ private: return {{"gl_PointSize", Type::Float}}; } return {}; + case Attribute::Index::FrontColor: + return {{"gl_FrontColor"s + GetSwizzle(element), Type::Float}}; + case Attribute::Index::FrontSecondaryColor: + return {{"gl_FrontSecondaryColor"s + GetSwizzle(element), Type::Float}}; + case Attribute::Index::BackColor: + return {{"gl_BackColor"s + GetSwizzle(element), Type::Float}}; + case Attribute::Index::BackSecondaryColor: + return {{"gl_BackSecondaryColor"s + GetSwizzle(element), Type::Float}}; case Attribute::Index::ClipDistances0123: - return {{fmt::format("gl_ClipDistance[{}]", abuf->GetElement()), Type::Float}}; + return {{fmt::format("gl_ClipDistance[{}]", element), Type::Float}}; case Attribute::Index::ClipDistances4567: - return {{fmt::format("gl_ClipDistance[{}]", abuf->GetElement() + 4), Type::Float}}; + return {{fmt::format("gl_ClipDistance[{}]", element + 4), Type::Float}}; default: if (IsGenericAttribute(attribute)) { - return { - {GetOutputAttribute(attribute) + GetSwizzle(abuf->GetElement()), Type::Float}}; + return {{GetGenericOutputAttribute(attribute, element), Type::Float}}; + } + if (IsLegacyTexCoord(attribute)) { + return {{fmt::format("gl_TexCoord[{}]{}", GetLegacyTexCoordIndex(attribute), + GetSwizzle(element)), + Type::Float}}; } UNIMPLEMENTED_MSG("Unhandled output attribute: {}", static_cast<u32>(attribute)); return {}; @@ -1822,16 +2049,19 @@ private: expr += GetSampler(meta->sampler); expr += ", "; - expr += constructors.at(operation.GetOperandsCount() - 1); + expr += constructors.at(operation.GetOperandsCount() + (meta->array ? 
1 : 0) - 1); expr += '('; for (std::size_t i = 0; i < count; ++i) { - expr += VisitOperand(operation, i).AsInt(); - const std::size_t next = i + 1; - if (next == count) - expr += ')'; - else if (next < count) + if (i > 0) { expr += ", "; + } + expr += VisitOperand(operation, i).AsInt(); + } + if (meta->array) { + expr += ", "; + expr += Visit(meta->array).AsInt(); } + expr += ')'; if (meta->lod && !meta->sampler.IsBuffer()) { expr += ", "; @@ -1945,7 +2175,7 @@ private: // TODO(Subv): Figure out how dual-source blending is configured in the Switch. for (u32 component = 0; component < 4; ++component) { if (header.ps.IsColorComponentOutputEnabled(render_target, component)) { - code.AddLine("FragColor{}[{}] = {};", render_target, component, + code.AddLine("frag_color{}{} = {};", render_target, GetColorSwizzle(component), SafeGetRegister(current_reg).AsFloat()); ++current_reg; } @@ -2261,27 +2491,34 @@ private: static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount)); std::string GetRegister(u32 index) const { - return GetDeclarationWithSuffix(index, "gpr"); + return AppendSuffix(index, "gpr"); } std::string GetCustomVariable(u32 index) const { - return GetDeclarationWithSuffix(index, "custom_var"); + return AppendSuffix(index, "custom_var"); } std::string GetPredicate(Tegra::Shader::Pred pred) const { - return GetDeclarationWithSuffix(static_cast<u32>(pred), "pred"); + return AppendSuffix(static_cast<u32>(pred), "pred"); } - std::string GetInputAttribute(Attribute::Index attribute) const { - return GetDeclarationWithSuffix(GetGenericAttributeIndex(attribute), "input_attr"); + std::string GetGenericInputAttribute(Attribute::Index attribute) const { + return AppendSuffix(GetGenericAttributeIndex(attribute), INPUT_ATTRIBUTE_NAME); } - std::string GetOutputAttribute(Attribute::Index attribute) const { - return GetDeclarationWithSuffix(GetGenericAttributeIndex(attribute), "output_attr"); + std::unordered_map<u8, 
GenericVaryingDescription> varying_description; + + std::string GetGenericOutputAttribute(Attribute::Index attribute, std::size_t element) const { + const u8 offset = static_cast<u8>(GetGenericAttributeIndex(attribute) * 4 + element); + const auto& description = varying_description.at(offset); + if (description.is_scalar) { + return description.name; + } + return fmt::format("{}[{}]", description.name, element - description.first_element); } std::string GetConstBuffer(u32 index) const { - return GetDeclarationWithSuffix(index, "cbuf"); + return AppendSuffix(index, "cbuf"); } std::string GetGlobalMemory(const GlobalMemoryBase& descriptor) const { @@ -2294,11 +2531,15 @@ private: } std::string GetConstBufferBlock(u32 index) const { - return GetDeclarationWithSuffix(index, "cbuf_block"); + return AppendSuffix(index, "cbuf_block"); } std::string GetLocalMemory() const { - return "lmem_" + suffix; + if (suffix.empty()) { + return "lmem"; + } else { + return "lmem_" + std::string{suffix}; + } } std::string GetInternalFlag(InternalFlag flag) const { @@ -2307,23 +2548,31 @@ private: const auto index = static_cast<u32>(flag); ASSERT(index < static_cast<u32>(InternalFlag::Amount)); - return fmt::format("{}_{}", InternalFlagNames[index], suffix); + if (suffix.empty()) { + return InternalFlagNames[index]; + } else { + return fmt::format("{}_{}", InternalFlagNames[index], suffix); + } } std::string GetSampler(const Sampler& sampler) const { - return GetDeclarationWithSuffix(static_cast<u32>(sampler.GetIndex()), "sampler"); + return AppendSuffix(static_cast<u32>(sampler.GetIndex()), "sampler"); } std::string GetImage(const Image& image) const { - return GetDeclarationWithSuffix(static_cast<u32>(image.GetIndex()), "image"); + return AppendSuffix(static_cast<u32>(image.GetIndex()), "image"); } - std::string GetDeclarationWithSuffix(u32 index, std::string_view name) const { - return fmt::format("{}_{}_{}", name, index, suffix); + std::string AppendSuffix(u32 index, std::string_view 
name) const { + if (suffix.empty()) { + return fmt::format("{}{}", name, index); + } else { + return fmt::format("{}{}_{}", name, index, suffix); + } } u32 GetNumPhysicalInputAttributes() const { - return IsVertexShader(stage) ? GetNumPhysicalAttributes() : GetNumPhysicalVaryings(); + return stage == ShaderType::Vertex ? GetNumPhysicalAttributes() : GetNumPhysicalVaryings(); } u32 GetNumPhysicalAttributes() const { @@ -2334,17 +2583,31 @@ private: return std::min<u32>(device.GetMaxVaryings(), Maxwell::NumVaryings); } + bool IsRenderTargetEnabled(u32 render_target) const { + for (u32 component = 0; component < 4; ++component) { + if (header.ps.IsColorComponentOutputEnabled(render_target, component)) { + return true; + } + } + return false; + } + const Device& device; const ShaderIR& ir; + const Registry& registry; const ShaderType stage; - const std::string suffix; + const std::string_view identifier; + const std::string_view suffix; const Header header; + std::unordered_map<u8, VaryingTFB> transform_feedback; ShaderWriter code; + + std::optional<u32> max_input_vertices; }; -std::string GetFlowVariable(u32 i) { - return fmt::format("flow_var_{}", i); +std::string GetFlowVariable(u32 index) { + return fmt::format("flow_var{}", index); } class ExprDecompiler { @@ -2531,7 +2794,7 @@ void GLSLDecompiler::DecompileAST() { } // Anonymous namespace -ShaderEntries GetEntries(const VideoCommon::Shader::ShaderIR& ir) { +ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) { ShaderEntries entries; for (const auto& cbuf : ir.GetConstantBuffers()) { entries.const_buffers.emplace_back(cbuf.second.GetMaxOffset(), cbuf.second.IsIndirect(), @@ -2547,33 +2810,20 @@ ShaderEntries GetEntries(const VideoCommon::Shader::ShaderIR& ir) { for (const auto& image : ir.GetImages()) { entries.images.emplace_back(image); } - entries.clip_distances = ir.GetClipDistances(); + const auto clip_distances = ir.GetClipDistances(); + for (std::size_t i = 0; i < std::size(clip_distances); 
++i) { + entries.clip_distances = (clip_distances[i] ? 1U : 0U) << i; + } entries.shader_length = ir.GetLength(); return entries; } -std::string GetCommonDeclarations() { - return R"(#define ftoi floatBitsToInt -#define ftou floatBitsToUint -#define itof intBitsToFloat -#define utof uintBitsToFloat - -bvec2 HalfFloatNanComparison(bvec2 comparison, vec2 pair1, vec2 pair2) { - bvec2 is_nan1 = isnan(pair1); - bvec2 is_nan2 = isnan(pair2); - return bvec2(comparison.x || is_nan1.x || is_nan2.x, comparison.y || is_nan1.y || is_nan2.y); -} - -const float fswzadd_modifiers_a[] = float[4](-1.0f, 1.0f, -1.0f, 0.0f ); -const float fswzadd_modifiers_b[] = float[4](-1.0f, -1.0f, 1.0f, -1.0f ); -)"; -} - -std::string Decompile(const Device& device, const ShaderIR& ir, ShaderType stage, - const std::string& suffix) { - GLSLDecompiler decompiler(device, ir, stage, suffix); +std::string DecompileShader(const Device& device, const ShaderIR& ir, const Registry& registry, + ShaderType stage, std::string_view identifier, + std::string_view suffix) { + GLSLDecompiler decompiler(device, ir, registry, stage, identifier, suffix); decompiler.Decompile(); return decompiler.GetResult(); } -} // namespace OpenGL::GLShader +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h index 7876f48d6..e7dbd810c 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.h +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h @@ -6,22 +6,18 @@ #include <array> #include <string> +#include <string_view> #include <utility> #include <vector> #include "common/common_types.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/engines/shader_type.h" +#include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" -namespace VideoCommon::Shader { -class ShaderIR; -} - namespace OpenGL { -class Device; -} -namespace OpenGL::GLShader { +class Device; using Maxwell = 
Tegra::Engines::Maxwell3D::Regs; using SamplerEntry = VideoCommon::Shader::Sampler; @@ -74,15 +70,15 @@ struct ShaderEntries { std::vector<GlobalMemoryEntry> global_memory_entries; std::vector<SamplerEntry> samplers; std::vector<ImageEntry> images; - std::array<bool, Maxwell::NumClipDistances> clip_distances{}; + u32 clip_distances{}; std::size_t shader_length{}; }; -ShaderEntries GetEntries(const VideoCommon::Shader::ShaderIR& ir); - -std::string GetCommonDeclarations(); +ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir); -std::string Decompile(const Device& device, const VideoCommon::Shader::ShaderIR& ir, - Tegra::Engines::ShaderType stage, const std::string& suffix); +std::string DecompileShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir, + const VideoCommon::Shader::Registry& registry, + Tegra::Engines::ShaderType stage, std::string_view identifier, + std::string_view suffix = {}); -} // namespace OpenGL::GLShader +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp index 1fc204f6f..9e95a122b 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp @@ -31,32 +31,24 @@ namespace { using ShaderCacheVersionHash = std::array<u8, 64>; -enum class TransferableEntryKind : u32 { - Raw, - Usage, -}; - struct ConstBufferKey { - u32 cbuf{}; - u32 offset{}; - u32 value{}; + u32 cbuf = 0; + u32 offset = 0; + u32 value = 0; }; struct BoundSamplerKey { - u32 offset{}; - Tegra::Engines::SamplerDescriptor sampler{}; + u32 offset = 0; + Tegra::Engines::SamplerDescriptor sampler; }; struct BindlessSamplerKey { - u32 cbuf{}; - u32 offset{}; - Tegra::Engines::SamplerDescriptor sampler{}; + u32 cbuf = 0; + u32 offset = 0; + Tegra::Engines::SamplerDescriptor sampler; }; -constexpr u32 NativeVersion = 12; - -// Making sure sizes doesn't change by accident 
-static_assert(sizeof(ProgramVariant) == 20); +constexpr u32 NativeVersion = 20; ShaderCacheVersionHash GetShaderCacheVersionHash() { ShaderCacheVersionHash hash{}; @@ -67,61 +59,124 @@ ShaderCacheVersionHash GetShaderCacheVersionHash() { } // Anonymous namespace -ShaderDiskCacheRaw::ShaderDiskCacheRaw(u64 unique_identifier, ShaderType type, ProgramCode code, - ProgramCode code_b) - : unique_identifier{unique_identifier}, type{type}, code{std::move(code)}, code_b{std::move( - code_b)} {} +ShaderDiskCacheEntry::ShaderDiskCacheEntry() = default; -ShaderDiskCacheRaw::ShaderDiskCacheRaw() = default; +ShaderDiskCacheEntry::~ShaderDiskCacheEntry() = default; -ShaderDiskCacheRaw::~ShaderDiskCacheRaw() = default; - -bool ShaderDiskCacheRaw::Load(FileUtil::IOFile& file) { - if (file.ReadBytes(&unique_identifier, sizeof(u64)) != sizeof(u64) || - file.ReadBytes(&type, sizeof(u32)) != sizeof(u32)) { +bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) { + if (file.ReadBytes(&type, sizeof(u32)) != sizeof(u32)) { return false; } - u32 code_size{}; - u32 code_size_b{}; + u32 code_size; + u32 code_size_b; if (file.ReadBytes(&code_size, sizeof(u32)) != sizeof(u32) || file.ReadBytes(&code_size_b, sizeof(u32)) != sizeof(u32)) { return false; } - code.resize(code_size); code_b.resize(code_size_b); - if (file.ReadArray(code.data(), code_size) != code_size) + if (file.ReadArray(code.data(), code_size) != code_size) { return false; - + } if (HasProgramA() && file.ReadArray(code_b.data(), code_size_b) != code_size_b) { return false; } + + u8 is_texture_handler_size_known; + u32 texture_handler_size_value; + u32 num_keys; + u32 num_bound_samplers; + u32 num_bindless_samplers; + if (file.ReadArray(&unique_identifier, 1) != 1 || file.ReadArray(&bound_buffer, 1) != 1 || + file.ReadArray(&is_texture_handler_size_known, 1) != 1 || + file.ReadArray(&texture_handler_size_value, 1) != 1 || + file.ReadArray(&graphics_info, 1) != 1 || file.ReadArray(&compute_info, 1) != 1 || + 
file.ReadArray(&num_keys, 1) != 1 || file.ReadArray(&num_bound_samplers, 1) != 1 || + file.ReadArray(&num_bindless_samplers, 1) != 1) { + return false; + } + if (is_texture_handler_size_known) { + texture_handler_size = texture_handler_size_value; + } + + std::vector<ConstBufferKey> flat_keys(num_keys); + std::vector<BoundSamplerKey> flat_bound_samplers(num_bound_samplers); + std::vector<BindlessSamplerKey> flat_bindless_samplers(num_bindless_samplers); + if (file.ReadArray(flat_keys.data(), flat_keys.size()) != flat_keys.size() || + file.ReadArray(flat_bound_samplers.data(), flat_bound_samplers.size()) != + flat_bound_samplers.size() || + file.ReadArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) != + flat_bindless_samplers.size()) { + return false; + } + for (const auto& key : flat_keys) { + keys.insert({{key.cbuf, key.offset}, key.value}); + } + for (const auto& key : flat_bound_samplers) { + bound_samplers.emplace(key.offset, key.sampler); + } + for (const auto& key : flat_bindless_samplers) { + bindless_samplers.insert({{key.cbuf, key.offset}, key.sampler}); + } + return true; } -bool ShaderDiskCacheRaw::Save(FileUtil::IOFile& file) const { - if (file.WriteObject(unique_identifier) != 1 || file.WriteObject(static_cast<u32>(type)) != 1 || +bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const { + if (file.WriteObject(static_cast<u32>(type)) != 1 || file.WriteObject(static_cast<u32>(code.size())) != 1 || file.WriteObject(static_cast<u32>(code_b.size())) != 1) { return false; } - - if (file.WriteArray(code.data(), code.size()) != code.size()) + if (file.WriteArray(code.data(), code.size()) != code.size()) { return false; - + } if (HasProgramA() && file.WriteArray(code_b.data(), code_b.size()) != code_b.size()) { return false; } - return true; + + if (file.WriteObject(unique_identifier) != 1 || file.WriteObject(bound_buffer) != 1 || + file.WriteObject(static_cast<u8>(texture_handler_size.has_value())) != 1 || + 
file.WriteObject(texture_handler_size.value_or(0)) != 1 || + file.WriteObject(graphics_info) != 1 || file.WriteObject(compute_info) != 1 || + file.WriteObject(static_cast<u32>(keys.size())) != 1 || + file.WriteObject(static_cast<u32>(bound_samplers.size())) != 1 || + file.WriteObject(static_cast<u32>(bindless_samplers.size())) != 1) { + return false; + } + + std::vector<ConstBufferKey> flat_keys; + flat_keys.reserve(keys.size()); + for (const auto& [address, value] : keys) { + flat_keys.push_back(ConstBufferKey{address.first, address.second, value}); + } + + std::vector<BoundSamplerKey> flat_bound_samplers; + flat_bound_samplers.reserve(bound_samplers.size()); + for (const auto& [address, sampler] : bound_samplers) { + flat_bound_samplers.push_back(BoundSamplerKey{address, sampler}); + } + + std::vector<BindlessSamplerKey> flat_bindless_samplers; + flat_bindless_samplers.reserve(bindless_samplers.size()); + for (const auto& [address, sampler] : bindless_samplers) { + flat_bindless_samplers.push_back( + BindlessSamplerKey{address.first, address.second, sampler}); + } + + return file.WriteArray(flat_keys.data(), flat_keys.size()) == flat_keys.size() && + file.WriteArray(flat_bound_samplers.data(), flat_bound_samplers.size()) == + flat_bound_samplers.size() && + file.WriteArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) == + flat_bindless_samplers.size(); } ShaderDiskCacheOpenGL::ShaderDiskCacheOpenGL(Core::System& system) : system{system} {} ShaderDiskCacheOpenGL::~ShaderDiskCacheOpenGL() = default; -std::optional<std::pair<std::vector<ShaderDiskCacheRaw>, std::vector<ShaderDiskCacheUsage>>> -ShaderDiskCacheOpenGL::LoadTransferable() { +std::optional<std::vector<ShaderDiskCacheEntry>> ShaderDiskCacheOpenGL::LoadTransferable() { // Skip games without title id const bool has_title_id = system.CurrentProcess()->GetTitleID() != 0; if (!Settings::values.use_disk_shader_cache || !has_title_id) { @@ -130,17 +185,14 @@ 
ShaderDiskCacheOpenGL::LoadTransferable() { FileUtil::IOFile file(GetTransferablePath(), "rb"); if (!file.IsOpen()) { - LOG_INFO(Render_OpenGL, "No transferable shader cache found for game with title id={}", - GetTitleID()); + LOG_INFO(Render_OpenGL, "No transferable shader cache found"); is_usable = true; return {}; } u32 version{}; if (file.ReadBytes(&version, sizeof(version)) != sizeof(version)) { - LOG_ERROR(Render_OpenGL, - "Failed to get transferable cache version for title id={}, skipping", - GetTitleID()); + LOG_ERROR(Render_OpenGL, "Failed to get transferable cache version, skipping it"); return {}; } @@ -158,105 +210,42 @@ ShaderDiskCacheOpenGL::LoadTransferable() { } // Version is valid, load the shaders - constexpr const char error_loading[] = "Failed to load transferable raw entry, skipping"; - std::vector<ShaderDiskCacheRaw> raws; - std::vector<ShaderDiskCacheUsage> usages; + std::vector<ShaderDiskCacheEntry> entries; while (file.Tell() < file.GetSize()) { - TransferableEntryKind kind{}; - if (file.ReadBytes(&kind, sizeof(u32)) != sizeof(u32)) { - LOG_ERROR(Render_OpenGL, "Failed to read transferable file, skipping"); - return {}; - } - - switch (kind) { - case TransferableEntryKind::Raw: { - ShaderDiskCacheRaw entry; - if (!entry.Load(file)) { - LOG_ERROR(Render_OpenGL, error_loading); - return {}; - } - transferable.insert({entry.GetUniqueIdentifier(), {}}); - raws.push_back(std::move(entry)); - break; - } - case TransferableEntryKind::Usage: { - ShaderDiskCacheUsage usage; - - u32 num_keys{}; - u32 num_bound_samplers{}; - u32 num_bindless_samplers{}; - if (file.ReadArray(&usage.unique_identifier, 1) != 1 || - file.ReadArray(&usage.variant, 1) != 1 || - file.ReadArray(&usage.bound_buffer, 1) != 1 || file.ReadArray(&num_keys, 1) != 1 || - file.ReadArray(&num_bound_samplers, 1) != 1 || - file.ReadArray(&num_bindless_samplers, 1) != 1) { - LOG_ERROR(Render_OpenGL, error_loading); - return {}; - } - - std::vector<ConstBufferKey> keys(num_keys); - 
std::vector<BoundSamplerKey> bound_samplers(num_bound_samplers); - std::vector<BindlessSamplerKey> bindless_samplers(num_bindless_samplers); - if (file.ReadArray(keys.data(), keys.size()) != keys.size() || - file.ReadArray(bound_samplers.data(), bound_samplers.size()) != - bound_samplers.size() || - file.ReadArray(bindless_samplers.data(), bindless_samplers.size()) != - bindless_samplers.size()) { - LOG_ERROR(Render_OpenGL, error_loading); - return {}; - } - for (const auto& key : keys) { - usage.keys.insert({{key.cbuf, key.offset}, key.value}); - } - for (const auto& key : bound_samplers) { - usage.bound_samplers.emplace(key.offset, key.sampler); - } - for (const auto& key : bindless_samplers) { - usage.bindless_samplers.insert({{key.cbuf, key.offset}, key.sampler}); - } - - usages.push_back(std::move(usage)); - break; - } - default: - LOG_ERROR(Render_OpenGL, "Unknown transferable shader cache entry kind={}, skipping", - static_cast<u32>(kind)); + ShaderDiskCacheEntry& entry = entries.emplace_back(); + if (!entry.Load(file)) { + LOG_ERROR(Render_OpenGL, "Failed to load transferable raw entry, skipping"); return {}; } } is_usable = true; - return {{std::move(raws), std::move(usages)}}; + return {std::move(entries)}; } -std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump> -ShaderDiskCacheOpenGL::LoadPrecompiled() { +std::vector<ShaderDiskCachePrecompiled> ShaderDiskCacheOpenGL::LoadPrecompiled() { if (!is_usable) { return {}; } - std::string path = GetPrecompiledPath(); - FileUtil::IOFile file(path, "rb"); + FileUtil::IOFile file(GetPrecompiledPath(), "rb"); if (!file.IsOpen()) { - LOG_INFO(Render_OpenGL, "No precompiled shader cache found for game with title id={}", - GetTitleID()); + LOG_INFO(Render_OpenGL, "No precompiled shader cache found"); return {}; } - const auto result = LoadPrecompiledFile(file); - if (!result) { - LOG_INFO(Render_OpenGL, - "Failed to load precompiled cache for game with title id={}, removing", - GetTitleID()); - file.Close(); - 
InvalidatePrecompiled(); - return {}; + if (const auto result = LoadPrecompiledFile(file)) { + return *result; } - return *result; + + LOG_INFO(Render_OpenGL, "Failed to load precompiled cache"); + file.Close(); + InvalidatePrecompiled(); + return {}; } -std::optional<std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>> -ShaderDiskCacheOpenGL::LoadPrecompiledFile(FileUtil::IOFile& file) { +std::optional<std::vector<ShaderDiskCachePrecompiled>> ShaderDiskCacheOpenGL::LoadPrecompiledFile( + FileUtil::IOFile& file) { // Read compressed file from disk and decompress to virtual precompiled cache file std::vector<u8> compressed(file.GetSize()); file.ReadBytes(compressed.data(), compressed.size()); @@ -275,58 +264,22 @@ ShaderDiskCacheOpenGL::LoadPrecompiledFile(FileUtil::IOFile& file) { return {}; } - ShaderDumpsMap dumps; + std::vector<ShaderDiskCachePrecompiled> entries; while (precompiled_cache_virtual_file_offset < precompiled_cache_virtual_file.GetSize()) { - u32 num_keys{}; - u32 num_bound_samplers{}; - u32 num_bindless_samplers{}; - ShaderDiskCacheUsage usage; - if (!LoadObjectFromPrecompiled(usage.unique_identifier) || - !LoadObjectFromPrecompiled(usage.variant) || - !LoadObjectFromPrecompiled(usage.bound_buffer) || - !LoadObjectFromPrecompiled(num_keys) || - !LoadObjectFromPrecompiled(num_bound_samplers) || - !LoadObjectFromPrecompiled(num_bindless_samplers)) { - return {}; - } - std::vector<ConstBufferKey> keys(num_keys); - std::vector<BoundSamplerKey> bound_samplers(num_bound_samplers); - std::vector<BindlessSamplerKey> bindless_samplers(num_bindless_samplers); - if (!LoadArrayFromPrecompiled(keys.data(), keys.size()) || - !LoadArrayFromPrecompiled(bound_samplers.data(), bound_samplers.size()) != - bound_samplers.size() || - !LoadArrayFromPrecompiled(bindless_samplers.data(), bindless_samplers.size()) != - bindless_samplers.size()) { - return {}; - } - for (const auto& key : keys) { - usage.keys.insert({{key.cbuf, key.offset}, key.value}); - } - for 
(const auto& key : bound_samplers) { - usage.bound_samplers.emplace(key.offset, key.sampler); - } - for (const auto& key : bindless_samplers) { - usage.bindless_samplers.insert({{key.cbuf, key.offset}, key.sampler}); - } - - ShaderDiskCacheDump dump; - if (!LoadObjectFromPrecompiled(dump.binary_format)) { - return {}; - } - - u32 binary_length{}; - if (!LoadObjectFromPrecompiled(binary_length)) { + u32 binary_size; + auto& entry = entries.emplace_back(); + if (!LoadObjectFromPrecompiled(entry.unique_identifier) || + !LoadObjectFromPrecompiled(entry.binary_format) || + !LoadObjectFromPrecompiled(binary_size)) { return {}; } - dump.binary.resize(binary_length); - if (!LoadArrayFromPrecompiled(dump.binary.data(), dump.binary.size())) { + entry.binary.resize(binary_size); + if (!LoadArrayFromPrecompiled(entry.binary.data(), entry.binary.size())) { return {}; } - - dumps.emplace(std::move(usage), dump); } - return dumps; + return entries; } void ShaderDiskCacheOpenGL::InvalidateTransferable() { @@ -346,13 +299,13 @@ void ShaderDiskCacheOpenGL::InvalidatePrecompiled() { } } -void ShaderDiskCacheOpenGL::SaveRaw(const ShaderDiskCacheRaw& entry) { +void ShaderDiskCacheOpenGL::SaveEntry(const ShaderDiskCacheEntry& entry) { if (!is_usable) { return; } - const u64 id = entry.GetUniqueIdentifier(); - if (transferable.find(id) != transferable.end()) { + const u64 id = entry.unique_identifier; + if (stored_transferable.find(id) != stored_transferable.end()) { // The shader already exists return; } @@ -361,71 +314,17 @@ void ShaderDiskCacheOpenGL::SaveRaw(const ShaderDiskCacheRaw& entry) { if (!file.IsOpen()) { return; } - if (file.WriteObject(TransferableEntryKind::Raw) != 1 || !entry.Save(file)) { + if (!entry.Save(file)) { LOG_ERROR(Render_OpenGL, "Failed to save raw transferable cache entry, removing"); file.Close(); InvalidateTransferable(); return; } - transferable.insert({id, {}}); -} -void ShaderDiskCacheOpenGL::SaveUsage(const ShaderDiskCacheUsage& usage) { - if 
(!is_usable) { - return; - } - - const auto it = transferable.find(usage.unique_identifier); - ASSERT_MSG(it != transferable.end(), "Saving shader usage without storing raw previously"); - - auto& usages{it->second}; - if (usages.find(usage) != usages.end()) { - // Skip this variant since the shader is already stored. - return; - } - usages.insert(usage); - - FileUtil::IOFile file = AppendTransferableFile(); - if (!file.IsOpen()) - return; - const auto Close = [&] { - LOG_ERROR(Render_OpenGL, "Failed to save usage transferable cache entry, removing"); - file.Close(); - InvalidateTransferable(); - }; - - if (file.WriteObject(TransferableEntryKind::Usage) != 1 || - file.WriteObject(usage.unique_identifier) != 1 || file.WriteObject(usage.variant) != 1 || - file.WriteObject(usage.bound_buffer) != 1 || - file.WriteObject(static_cast<u32>(usage.keys.size())) != 1 || - file.WriteObject(static_cast<u32>(usage.bound_samplers.size())) != 1 || - file.WriteObject(static_cast<u32>(usage.bindless_samplers.size())) != 1) { - Close(); - return; - } - for (const auto& [pair, value] : usage.keys) { - const auto [cbuf, offset] = pair; - if (file.WriteObject(ConstBufferKey{cbuf, offset, value}) != 1) { - Close(); - return; - } - } - for (const auto& [offset, sampler] : usage.bound_samplers) { - if (file.WriteObject(BoundSamplerKey{offset, sampler}) != 1) { - Close(); - return; - } - } - for (const auto& [pair, sampler] : usage.bindless_samplers) { - const auto [cbuf, offset] = pair; - if (file.WriteObject(BindlessSamplerKey{cbuf, offset, sampler}) != 1) { - Close(); - return; - } - } + stored_transferable.insert(id); } -void ShaderDiskCacheOpenGL::SaveDump(const ShaderDiskCacheUsage& usage, GLuint program) { +void ShaderDiskCacheOpenGL::SavePrecompiled(u64 unique_identifier, GLuint program) { if (!is_usable) { return; } @@ -437,51 +336,19 @@ void ShaderDiskCacheOpenGL::SaveDump(const ShaderDiskCacheUsage& usage, GLuint p SavePrecompiledHeaderToVirtualPrecompiledCache(); } - GLint 
binary_length{}; + GLint binary_length; glGetProgramiv(program, GL_PROGRAM_BINARY_LENGTH, &binary_length); - GLenum binary_format{}; + GLenum binary_format; std::vector<u8> binary(binary_length); glGetProgramBinary(program, binary_length, nullptr, &binary_format, binary.data()); - const auto Close = [&] { + if (!SaveObjectToPrecompiled(unique_identifier) || !SaveObjectToPrecompiled(binary_format) || + !SaveObjectToPrecompiled(static_cast<u32>(binary.size())) || + !SaveArrayToPrecompiled(binary.data(), binary.size())) { LOG_ERROR(Render_OpenGL, "Failed to save binary program file in shader={:016X}, removing", - usage.unique_identifier); + unique_identifier); InvalidatePrecompiled(); - }; - - if (!SaveObjectToPrecompiled(usage.unique_identifier) || - !SaveObjectToPrecompiled(usage.variant) || !SaveObjectToPrecompiled(usage.bound_buffer) || - !SaveObjectToPrecompiled(static_cast<u32>(usage.keys.size())) || - !SaveObjectToPrecompiled(static_cast<u32>(usage.bound_samplers.size())) || - !SaveObjectToPrecompiled(static_cast<u32>(usage.bindless_samplers.size()))) { - Close(); - return; - } - for (const auto& [pair, value] : usage.keys) { - const auto [cbuf, offset] = pair; - if (SaveObjectToPrecompiled(ConstBufferKey{cbuf, offset, value}) != 1) { - Close(); - return; - } - } - for (const auto& [offset, sampler] : usage.bound_samplers) { - if (SaveObjectToPrecompiled(BoundSamplerKey{offset, sampler}) != 1) { - Close(); - return; - } - } - for (const auto& [pair, sampler] : usage.bindless_samplers) { - const auto [cbuf, offset] = pair; - if (SaveObjectToPrecompiled(BindlessSamplerKey{cbuf, offset, sampler}) != 1) { - Close(); - return; - } - } - if (!SaveObjectToPrecompiled(static_cast<u32>(binary_format)) || - !SaveObjectToPrecompiled(static_cast<u32>(binary_length)) || - !SaveArrayToPrecompiled(binary.data(), binary.size())) { - Close(); } } @@ -534,7 +401,6 @@ void ShaderDiskCacheOpenGL::SaveVirtualPrecompiledFile() { if (file.WriteBytes(compressed.data(), 
compressed.size()) != compressed.size()) { LOG_ERROR(Render_OpenGL, "Failed to write precompiled cache version in path={}", precompiled_path); - return; } } diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h index ef2371f6d..d5be52e40 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h @@ -19,8 +19,7 @@ #include "common/common_types.h" #include "core/file_sys/vfs_vector.h" #include "video_core/engines/shader_type.h" -#include "video_core/renderer_opengl/gl_shader_gen.h" -#include "video_core/shader/const_buffer_locker.h" +#include "video_core/shader/registry.h" namespace Core { class System; @@ -32,139 +31,39 @@ class IOFile; namespace OpenGL { -struct ShaderDiskCacheUsage; -struct ShaderDiskCacheDump; - using ProgramCode = std::vector<u64>; -using ShaderDumpsMap = std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>; - -/// Describes the different variants a program can be compiled with. -struct ProgramVariant final { - ProgramVariant() = default; - - /// Graphics constructor. - explicit constexpr ProgramVariant(GLenum primitive_mode) noexcept - : primitive_mode{primitive_mode} {} - - /// Compute constructor. - explicit constexpr ProgramVariant(u32 block_x, u32 block_y, u32 block_z, u32 shared_memory_size, - u32 local_memory_size) noexcept - : block_x{block_x}, block_y{static_cast<u16>(block_y)}, block_z{static_cast<u16>(block_z)}, - shared_memory_size{shared_memory_size}, local_memory_size{local_memory_size} {} - - // Graphics specific parameters. - GLenum primitive_mode{}; - - // Compute specific parameters. 
- u32 block_x{}; - u16 block_y{}; - u16 block_z{}; - u32 shared_memory_size{}; - u32 local_memory_size{}; - - bool operator==(const ProgramVariant& rhs) const noexcept { - return std::tie(primitive_mode, block_x, block_y, block_z, shared_memory_size, - local_memory_size) == std::tie(rhs.primitive_mode, rhs.block_x, rhs.block_y, - rhs.block_z, rhs.shared_memory_size, - rhs.local_memory_size); - } - - bool operator!=(const ProgramVariant& rhs) const noexcept { - return !operator==(rhs); - } -}; -static_assert(std::is_trivially_copyable_v<ProgramVariant>); - -/// Describes how a shader is used. -struct ShaderDiskCacheUsage { - u64 unique_identifier{}; - ProgramVariant variant; - u32 bound_buffer{}; - VideoCommon::Shader::KeyMap keys; - VideoCommon::Shader::BoundSamplerMap bound_samplers; - VideoCommon::Shader::BindlessSamplerMap bindless_samplers; - - bool operator==(const ShaderDiskCacheUsage& rhs) const { - return std::tie(unique_identifier, variant, keys, bound_samplers, bindless_samplers) == - std::tie(rhs.unique_identifier, rhs.variant, rhs.keys, rhs.bound_samplers, - rhs.bindless_samplers); - } - - bool operator!=(const ShaderDiskCacheUsage& rhs) const { - return !operator==(rhs); - } -}; - -} // namespace OpenGL - -namespace std { - -template <> -struct hash<OpenGL::ProgramVariant> { - std::size_t operator()(const OpenGL::ProgramVariant& variant) const noexcept { - return (static_cast<std::size_t>(variant.primitive_mode) << 6) ^ - static_cast<std::size_t>(variant.block_x) ^ - (static_cast<std::size_t>(variant.block_y) << 32) ^ - (static_cast<std::size_t>(variant.block_z) << 48) ^ - (static_cast<std::size_t>(variant.shared_memory_size) << 16) ^ - (static_cast<std::size_t>(variant.local_memory_size) << 36); - } -}; - -template <> -struct hash<OpenGL::ShaderDiskCacheUsage> { - std::size_t operator()(const OpenGL::ShaderDiskCacheUsage& usage) const noexcept { - return static_cast<std::size_t>(usage.unique_identifier) ^ - 
std::hash<OpenGL::ProgramVariant>{}(usage.variant); - } -}; - -} // namespace std - -namespace OpenGL { -/// Describes a shader how it's used by the guest GPU -class ShaderDiskCacheRaw { -public: - explicit ShaderDiskCacheRaw(u64 unique_identifier, Tegra::Engines::ShaderType type, - ProgramCode code, ProgramCode code_b = {}); - ShaderDiskCacheRaw(); - ~ShaderDiskCacheRaw(); +/// Describes a shader and how it's used by the guest GPU +struct ShaderDiskCacheEntry { + ShaderDiskCacheEntry(); + ~ShaderDiskCacheEntry(); bool Load(FileUtil::IOFile& file); bool Save(FileUtil::IOFile& file) const; - u64 GetUniqueIdentifier() const { - return unique_identifier; - } - bool HasProgramA() const { return !code.empty() && !code_b.empty(); } - Tegra::Engines::ShaderType GetType() const { - return type; - } - - const ProgramCode& GetCode() const { - return code; - } - - const ProgramCode& GetCodeB() const { - return code_b; - } - -private: - u64 unique_identifier{}; Tegra::Engines::ShaderType type{}; ProgramCode code; ProgramCode code_b; + + u64 unique_identifier = 0; + std::optional<u32> texture_handler_size; + u32 bound_buffer = 0; + VideoCommon::Shader::GraphicsInfo graphics_info; + VideoCommon::Shader::ComputeInfo compute_info; + VideoCommon::Shader::KeyMap keys; + VideoCommon::Shader::BoundSamplerMap bound_samplers; + VideoCommon::Shader::BindlessSamplerMap bindless_samplers; }; /// Contains an OpenGL dumped binary program -struct ShaderDiskCacheDump { - GLenum binary_format{}; +struct ShaderDiskCachePrecompiled { + u64 unique_identifier = 0; + GLenum binary_format = 0; std::vector<u8> binary; }; @@ -174,11 +73,10 @@ public: ~ShaderDiskCacheOpenGL(); /// Loads transferable cache. If file has a old version or on failure, it deletes the file. - std::optional<std::pair<std::vector<ShaderDiskCacheRaw>, std::vector<ShaderDiskCacheUsage>>> - LoadTransferable(); + std::optional<std::vector<ShaderDiskCacheEntry>> LoadTransferable(); /// Loads current game's precompiled cache. 
Invalidates on failure. - std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump> LoadPrecompiled(); + std::vector<ShaderDiskCachePrecompiled> LoadPrecompiled(); /// Removes the transferable (and precompiled) cache file. void InvalidateTransferable(); @@ -187,21 +85,18 @@ public: void InvalidatePrecompiled(); /// Saves a raw dump to the transferable file. Checks for collisions. - void SaveRaw(const ShaderDiskCacheRaw& entry); - - /// Saves shader usage to the transferable file. Does not check for collisions. - void SaveUsage(const ShaderDiskCacheUsage& usage); + void SaveEntry(const ShaderDiskCacheEntry& entry); /// Saves a dump entry to the precompiled file. Does not check for collisions. - void SaveDump(const ShaderDiskCacheUsage& usage, GLuint program); + void SavePrecompiled(u64 unique_identifier, GLuint program); /// Serializes virtual precompiled shader cache file to real file void SaveVirtualPrecompiledFile(); private: /// Loads the transferable cache. Returns empty on failure. 
- std::optional<std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>> - LoadPrecompiledFile(FileUtil::IOFile& file); + std::optional<std::vector<ShaderDiskCachePrecompiled>> LoadPrecompiledFile( + FileUtil::IOFile& file); /// Opens current game's transferable file and write it's header if it doesn't exist FileUtil::IOFile AppendTransferableFile() const; @@ -270,7 +165,7 @@ private: std::size_t precompiled_cache_virtual_file_offset = 0; // Stored transferable shaders - std::unordered_map<u64, std::unordered_set<ShaderDiskCacheUsage>> transferable; + std::unordered_set<u64> stored_transferable; // The cache has been loaded at boot bool is_usable{}; diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp deleted file mode 100644 index 34946fb47..000000000 --- a/src/video_core/renderer_opengl/gl_shader_gen.cpp +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. 
- -#include <string> - -#include <fmt/format.h> - -#include "video_core/engines/maxwell_3d.h" -#include "video_core/engines/shader_type.h" -#include "video_core/renderer_opengl/gl_device.h" -#include "video_core/renderer_opengl/gl_shader_decompiler.h" -#include "video_core/renderer_opengl/gl_shader_gen.h" -#include "video_core/shader/shader_ir.h" - -namespace OpenGL::GLShader { - -using Tegra::Engines::Maxwell3D; -using Tegra::Engines::ShaderType; -using VideoCommon::Shader::CompileDepth; -using VideoCommon::Shader::CompilerSettings; -using VideoCommon::Shader::ProgramCode; -using VideoCommon::Shader::ShaderIR; - -std::string GenerateVertexShader(const Device& device, const ShaderIR& ir, const ShaderIR* ir_b) { - std::string out = GetCommonDeclarations(); - out += fmt::format(R"( -layout (std140, binding = {}) uniform vs_config {{ - float y_direction; -}}; - -)", - EmulationUniformBlockBinding); - out += Decompile(device, ir, ShaderType::Vertex, "vertex"); - if (ir_b) { - out += Decompile(device, *ir_b, ShaderType::Vertex, "vertex_b"); - } - - out += R"( -void main() { - gl_Position = vec4(0.0f, 0.0f, 0.0f, 1.0f); - execute_vertex(); -)"; - if (ir_b) { - out += " execute_vertex_b();"; - } - out += "}\n"; - return out; -} - -std::string GenerateGeometryShader(const Device& device, const ShaderIR& ir) { - std::string out = GetCommonDeclarations(); - out += fmt::format(R"( -layout (std140, binding = {}) uniform gs_config {{ - float y_direction; -}}; - -)", - EmulationUniformBlockBinding); - out += Decompile(device, ir, ShaderType::Geometry, "geometry"); - - out += R"( -void main() { - execute_geometry(); -} -)"; - return out; -} - -std::string GenerateFragmentShader(const Device& device, const ShaderIR& ir) { - std::string out = GetCommonDeclarations(); - out += fmt::format(R"( -layout (location = 0) out vec4 FragColor0; -layout (location = 1) out vec4 FragColor1; -layout (location = 2) out vec4 FragColor2; -layout (location = 3) out vec4 FragColor3; -layout (location 
= 4) out vec4 FragColor4; -layout (location = 5) out vec4 FragColor5; -layout (location = 6) out vec4 FragColor6; -layout (location = 7) out vec4 FragColor7; - -layout (std140, binding = {}) uniform fs_config {{ - float y_direction; -}}; - -)", - EmulationUniformBlockBinding); - out += Decompile(device, ir, ShaderType::Fragment, "fragment"); - - out += R"( -void main() { - execute_fragment(); -} -)"; - return out; -} - -std::string GenerateComputeShader(const Device& device, const ShaderIR& ir) { - std::string out = GetCommonDeclarations(); - out += Decompile(device, ir, ShaderType::Compute, "compute"); - out += R"( -void main() { - execute_compute(); -} -)"; - return out; -} - -} // namespace OpenGL::GLShader diff --git a/src/video_core/renderer_opengl/gl_shader_gen.h b/src/video_core/renderer_opengl/gl_shader_gen.h deleted file mode 100644 index cba2be9f9..000000000 --- a/src/video_core/renderer_opengl/gl_shader_gen.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. 
- -#pragma once - -#include <vector> - -#include "common/common_types.h" -#include "video_core/renderer_opengl/gl_shader_decompiler.h" -#include "video_core/shader/shader_ir.h" - -namespace OpenGL { -class Device; -} - -namespace OpenGL::GLShader { - -using VideoCommon::Shader::ProgramCode; -using VideoCommon::Shader::ShaderIR; - -/// Generates the GLSL vertex shader program source code for the given VS program -std::string GenerateVertexShader(const Device& device, const ShaderIR& ir, const ShaderIR* ir_b); - -/// Generates the GLSL geometry shader program source code for the given GS program -std::string GenerateGeometryShader(const Device& device, const ShaderIR& ir); - -/// Generates the GLSL fragment shader program source code for the given FS program -std::string GenerateFragmentShader(const Device& device, const ShaderIR& ir); - -/// Generates the GLSL compute shader program source code for the given CS program -std::string GenerateComputeShader(const Device& device, const ShaderIR& ir); - -} // namespace OpenGL::GLShader diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp index 75d3fac04..9c7b0adbd 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.cpp +++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp @@ -2,45 +2,52 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. 
+#include <glad/glad.h> + #include "common/common_types.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_opengl/gl_shader_manager.h" namespace OpenGL::GLShader { -using Tegra::Engines::Maxwell3D; - -ProgramManager::ProgramManager() { - pipeline.Create(); -} +ProgramManager::ProgramManager() = default; ProgramManager::~ProgramManager() = default; -void ProgramManager::ApplyTo(OpenGLState& state) { - UpdatePipeline(); - state.draw.shader_program = 0; - state.draw.program_pipeline = pipeline.handle; +void ProgramManager::Create() { + graphics_pipeline.Create(); + glBindProgramPipeline(graphics_pipeline.handle); } -void ProgramManager::UpdatePipeline() { +void ProgramManager::BindGraphicsPipeline() { + if (!is_graphics_bound) { + is_graphics_bound = true; + glUseProgram(0); + } + // Avoid updating the pipeline when values have no changed if (old_state == current_state) { return; } // Workaround for AMD bug - constexpr GLenum all_used_stages{GL_VERTEX_SHADER_BIT | GL_GEOMETRY_SHADER_BIT | - GL_FRAGMENT_SHADER_BIT}; - glUseProgramStages(pipeline.handle, all_used_stages, 0); - - glUseProgramStages(pipeline.handle, GL_VERTEX_SHADER_BIT, current_state.vertex_shader); - glUseProgramStages(pipeline.handle, GL_GEOMETRY_SHADER_BIT, current_state.geometry_shader); - glUseProgramStages(pipeline.handle, GL_FRAGMENT_SHADER_BIT, current_state.fragment_shader); + static constexpr GLenum all_used_stages{GL_VERTEX_SHADER_BIT | GL_GEOMETRY_SHADER_BIT | + GL_FRAGMENT_SHADER_BIT}; + const GLuint handle = graphics_pipeline.handle; + glUseProgramStages(handle, all_used_stages, 0); + glUseProgramStages(handle, GL_VERTEX_SHADER_BIT, current_state.vertex_shader); + glUseProgramStages(handle, GL_GEOMETRY_SHADER_BIT, current_state.geometry_shader); + glUseProgramStages(handle, GL_FRAGMENT_SHADER_BIT, current_state.fragment_shader); old_state = current_state; } -void MaxwellUniformData::SetFromRegs(const Maxwell3D& maxwell) { +void 
ProgramManager::BindComputeShader(GLuint program) { + is_graphics_bound = false; + glUseProgram(program); +} + +void MaxwellUniformData::SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell) { const auto& regs = maxwell.regs; // Y_NEGATE controls what value S2R returns for the Y_DIRECTION system value. diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h index 478c165ce..d2e47f2a9 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.h +++ b/src/video_core/renderer_opengl/gl_shader_manager.h @@ -9,7 +9,6 @@ #include <glad/glad.h> #include "video_core/renderer_opengl/gl_resource_manager.h" -#include "video_core/renderer_opengl/gl_state.h" #include "video_core/renderer_opengl/maxwell_to_gl.h" namespace OpenGL::GLShader { @@ -32,49 +31,47 @@ public: explicit ProgramManager(); ~ProgramManager(); - void ApplyTo(OpenGLState& state); + void Create(); - void UseProgrammableVertexShader(GLuint program) { + /// Updates the graphics pipeline and binds it. + void BindGraphicsPipeline(); + + /// Binds a compute shader. 
+ void BindComputeShader(GLuint program); + + void UseVertexShader(GLuint program) { current_state.vertex_shader = program; } - void UseProgrammableGeometryShader(GLuint program) { + void UseGeometryShader(GLuint program) { current_state.geometry_shader = program; } - void UseProgrammableFragmentShader(GLuint program) { + void UseFragmentShader(GLuint program) { current_state.fragment_shader = program; } - void UseTrivialGeometryShader() { - current_state.geometry_shader = 0; - } - - void UseTrivialFragmentShader() { - current_state.fragment_shader = 0; - } - private: struct PipelineState { - bool operator==(const PipelineState& rhs) const { + bool operator==(const PipelineState& rhs) const noexcept { return vertex_shader == rhs.vertex_shader && fragment_shader == rhs.fragment_shader && geometry_shader == rhs.geometry_shader; } - bool operator!=(const PipelineState& rhs) const { + bool operator!=(const PipelineState& rhs) const noexcept { return !operator==(rhs); } - GLuint vertex_shader{}; - GLuint fragment_shader{}; - GLuint geometry_shader{}; + GLuint vertex_shader = 0; + GLuint fragment_shader = 0; + GLuint geometry_shader = 0; }; - void UpdatePipeline(); - - OGLPipeline pipeline; + OGLPipeline graphics_pipeline; + OGLPipeline compute_pipeline; PipelineState current_state; PipelineState old_state; + bool is_graphics_bound = true; }; } // namespace OpenGL::GLShader diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp deleted file mode 100644 index ab1f7983c..000000000 --- a/src/video_core/renderer_opengl/gl_state.cpp +++ /dev/null @@ -1,554 +0,0 @@ -// Copyright 2015 Citra Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. 
- -#include <algorithm> -#include <iterator> -#include <glad/glad.h> -#include "common/assert.h" -#include "common/logging/log.h" -#include "common/microprofile.h" -#include "video_core/renderer_opengl/gl_state.h" - -MICROPROFILE_DEFINE(OpenGL_State, "OpenGL", "State Change", MP_RGB(192, 128, 128)); - -namespace OpenGL { - -using Maxwell = Tegra::Engines::Maxwell3D::Regs; - -OpenGLState OpenGLState::cur_state; - -namespace { - -template <typename T> -bool UpdateValue(T& current_value, const T new_value) { - const bool changed = current_value != new_value; - current_value = new_value; - return changed; -} - -template <typename T1, typename T2> -bool UpdateTie(T1 current_value, const T2 new_value) { - const bool changed = current_value != new_value; - current_value = new_value; - return changed; -} - -template <typename T> -std::optional<std::pair<GLuint, GLsizei>> UpdateArray(T& current_values, const T& new_values) { - std::optional<std::size_t> first; - std::size_t last; - for (std::size_t i = 0; i < std::size(current_values); ++i) { - if (!UpdateValue(current_values[i], new_values[i])) { - continue; - } - if (!first) { - first = i; - } - last = i; - } - if (!first) { - return std::nullopt; - } - return std::make_pair(static_cast<GLuint>(*first), static_cast<GLsizei>(last - *first + 1)); -} - -void Enable(GLenum cap, bool enable) { - if (enable) { - glEnable(cap); - } else { - glDisable(cap); - } -} - -void Enable(GLenum cap, GLuint index, bool enable) { - if (enable) { - glEnablei(cap, index); - } else { - glDisablei(cap, index); - } -} - -void Enable(GLenum cap, bool& current_value, bool new_value) { - if (UpdateValue(current_value, new_value)) { - Enable(cap, new_value); - } -} - -void Enable(GLenum cap, GLuint index, bool& current_value, bool new_value) { - if (UpdateValue(current_value, new_value)) { - Enable(cap, index, new_value); - } -} - -} // Anonymous namespace - -OpenGLState::OpenGLState() = default; - -void OpenGLState::SetDefaultViewports() { - 
viewports.fill(Viewport{}); - - depth_clamp.far_plane = false; - depth_clamp.near_plane = false; -} - -void OpenGLState::ApplyFramebufferState() { - if (UpdateValue(cur_state.draw.read_framebuffer, draw.read_framebuffer)) { - glBindFramebuffer(GL_READ_FRAMEBUFFER, draw.read_framebuffer); - } - if (UpdateValue(cur_state.draw.draw_framebuffer, draw.draw_framebuffer)) { - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, draw.draw_framebuffer); - } -} - -void OpenGLState::ApplyVertexArrayState() { - if (UpdateValue(cur_state.draw.vertex_array, draw.vertex_array)) { - glBindVertexArray(draw.vertex_array); - } -} - -void OpenGLState::ApplyShaderProgram() { - if (UpdateValue(cur_state.draw.shader_program, draw.shader_program)) { - glUseProgram(draw.shader_program); - } -} - -void OpenGLState::ApplyProgramPipeline() { - if (UpdateValue(cur_state.draw.program_pipeline, draw.program_pipeline)) { - glBindProgramPipeline(draw.program_pipeline); - } -} - -void OpenGLState::ApplyClipDistances() { - for (std::size_t i = 0; i < clip_distance.size(); ++i) { - Enable(GL_CLIP_DISTANCE0 + static_cast<GLenum>(i), cur_state.clip_distance[i], - clip_distance[i]); - } -} - -void OpenGLState::ApplyPointSize() { - Enable(GL_PROGRAM_POINT_SIZE, cur_state.point.program_control, point.program_control); - Enable(GL_POINT_SPRITE, cur_state.point.sprite, point.sprite); - if (UpdateValue(cur_state.point.size, point.size)) { - glPointSize(point.size); - } -} - -void OpenGLState::ApplyFragmentColorClamp() { - if (UpdateValue(cur_state.fragment_color_clamp.enabled, fragment_color_clamp.enabled)) { - glClampColor(GL_CLAMP_FRAGMENT_COLOR_ARB, - fragment_color_clamp.enabled ? 
GL_TRUE : GL_FALSE); - } -} - -void OpenGLState::ApplyMultisample() { - Enable(GL_SAMPLE_ALPHA_TO_COVERAGE, cur_state.multisample_control.alpha_to_coverage, - multisample_control.alpha_to_coverage); - Enable(GL_SAMPLE_ALPHA_TO_ONE, cur_state.multisample_control.alpha_to_one, - multisample_control.alpha_to_one); -} - -void OpenGLState::ApplyDepthClamp() { - if (depth_clamp.far_plane == cur_state.depth_clamp.far_plane && - depth_clamp.near_plane == cur_state.depth_clamp.near_plane) { - return; - } - cur_state.depth_clamp = depth_clamp; - - UNIMPLEMENTED_IF_MSG(depth_clamp.far_plane != depth_clamp.near_plane, - "Unimplemented Depth Clamp Separation!"); - - Enable(GL_DEPTH_CLAMP, depth_clamp.far_plane || depth_clamp.near_plane); -} - -void OpenGLState::ApplySRgb() { - if (cur_state.framebuffer_srgb.enabled == framebuffer_srgb.enabled) - return; - cur_state.framebuffer_srgb.enabled = framebuffer_srgb.enabled; - if (framebuffer_srgb.enabled) { - glEnable(GL_FRAMEBUFFER_SRGB); - } else { - glDisable(GL_FRAMEBUFFER_SRGB); - } -} - -void OpenGLState::ApplyCulling() { - Enable(GL_CULL_FACE, cur_state.cull.enabled, cull.enabled); - - if (UpdateValue(cur_state.cull.mode, cull.mode)) { - glCullFace(cull.mode); - } - - if (UpdateValue(cur_state.cull.front_face, cull.front_face)) { - glFrontFace(cull.front_face); - } -} - -void OpenGLState::ApplyRasterizerDiscard() { - Enable(GL_RASTERIZER_DISCARD, cur_state.rasterizer_discard, rasterizer_discard); -} - -void OpenGLState::ApplyColorMask() { - if (!dirty.color_mask) { - return; - } - dirty.color_mask = false; - - for (std::size_t i = 0; i < Maxwell::NumRenderTargets; ++i) { - const auto& updated = color_mask[i]; - auto& current = cur_state.color_mask[i]; - if (updated.red_enabled != current.red_enabled || - updated.green_enabled != current.green_enabled || - updated.blue_enabled != current.blue_enabled || - updated.alpha_enabled != current.alpha_enabled) { - current = updated; - glColorMaski(static_cast<GLuint>(i), 
updated.red_enabled, updated.green_enabled, - updated.blue_enabled, updated.alpha_enabled); - } - } -} - -void OpenGLState::ApplyDepth() { - Enable(GL_DEPTH_TEST, cur_state.depth.test_enabled, depth.test_enabled); - - if (cur_state.depth.test_func != depth.test_func) { - cur_state.depth.test_func = depth.test_func; - glDepthFunc(depth.test_func); - } - - if (cur_state.depth.write_mask != depth.write_mask) { - cur_state.depth.write_mask = depth.write_mask; - glDepthMask(depth.write_mask); - } -} - -void OpenGLState::ApplyPrimitiveRestart() { - Enable(GL_PRIMITIVE_RESTART, cur_state.primitive_restart.enabled, primitive_restart.enabled); - - if (cur_state.primitive_restart.index != primitive_restart.index) { - cur_state.primitive_restart.index = primitive_restart.index; - glPrimitiveRestartIndex(primitive_restart.index); - } -} - -void OpenGLState::ApplyStencilTest() { - if (!dirty.stencil_state) { - return; - } - dirty.stencil_state = false; - - Enable(GL_STENCIL_TEST, cur_state.stencil.test_enabled, stencil.test_enabled); - - const auto ConfigStencil = [](GLenum face, const auto& config, auto& current) { - if (current.test_func != config.test_func || current.test_ref != config.test_ref || - current.test_mask != config.test_mask) { - current.test_func = config.test_func; - current.test_ref = config.test_ref; - current.test_mask = config.test_mask; - glStencilFuncSeparate(face, config.test_func, config.test_ref, config.test_mask); - } - if (current.action_depth_fail != config.action_depth_fail || - current.action_depth_pass != config.action_depth_pass || - current.action_stencil_fail != config.action_stencil_fail) { - current.action_depth_fail = config.action_depth_fail; - current.action_depth_pass = config.action_depth_pass; - current.action_stencil_fail = config.action_stencil_fail; - glStencilOpSeparate(face, config.action_stencil_fail, config.action_depth_fail, - config.action_depth_pass); - } - if (current.write_mask != config.write_mask) { - current.write_mask = 
config.write_mask; - glStencilMaskSeparate(face, config.write_mask); - } - }; - ConfigStencil(GL_FRONT, stencil.front, cur_state.stencil.front); - ConfigStencil(GL_BACK, stencil.back, cur_state.stencil.back); -} - -void OpenGLState::ApplyViewport() { - for (GLuint i = 0; i < static_cast<GLuint>(Maxwell::NumViewports); ++i) { - const auto& updated = viewports[i]; - auto& current = cur_state.viewports[i]; - - if (current.x != updated.x || current.y != updated.y || current.width != updated.width || - current.height != updated.height) { - current.x = updated.x; - current.y = updated.y; - current.width = updated.width; - current.height = updated.height; - glViewportIndexedf(i, static_cast<GLfloat>(updated.x), static_cast<GLfloat>(updated.y), - static_cast<GLfloat>(updated.width), - static_cast<GLfloat>(updated.height)); - } - if (current.depth_range_near != updated.depth_range_near || - current.depth_range_far != updated.depth_range_far) { - current.depth_range_near = updated.depth_range_near; - current.depth_range_far = updated.depth_range_far; - glDepthRangeIndexed(i, updated.depth_range_near, updated.depth_range_far); - } - - Enable(GL_SCISSOR_TEST, i, current.scissor.enabled, updated.scissor.enabled); - - if (current.scissor.x != updated.scissor.x || current.scissor.y != updated.scissor.y || - current.scissor.width != updated.scissor.width || - current.scissor.height != updated.scissor.height) { - current.scissor.x = updated.scissor.x; - current.scissor.y = updated.scissor.y; - current.scissor.width = updated.scissor.width; - current.scissor.height = updated.scissor.height; - glScissorIndexed(i, updated.scissor.x, updated.scissor.y, updated.scissor.width, - updated.scissor.height); - } - } -} - -void OpenGLState::ApplyGlobalBlending() { - const Blend& updated = blend[0]; - Blend& current = cur_state.blend[0]; - - Enable(GL_BLEND, current.enabled, updated.enabled); - - if (current.src_rgb_func != updated.src_rgb_func || - current.dst_rgb_func != updated.dst_rgb_func 
|| current.src_a_func != updated.src_a_func || - current.dst_a_func != updated.dst_a_func) { - current.src_rgb_func = updated.src_rgb_func; - current.dst_rgb_func = updated.dst_rgb_func; - current.src_a_func = updated.src_a_func; - current.dst_a_func = updated.dst_a_func; - glBlendFuncSeparate(updated.src_rgb_func, updated.dst_rgb_func, updated.src_a_func, - updated.dst_a_func); - } - - if (current.rgb_equation != updated.rgb_equation || current.a_equation != updated.a_equation) { - current.rgb_equation = updated.rgb_equation; - current.a_equation = updated.a_equation; - glBlendEquationSeparate(updated.rgb_equation, updated.a_equation); - } -} - -void OpenGLState::ApplyTargetBlending(std::size_t target, bool force) { - const Blend& updated = blend[target]; - Blend& current = cur_state.blend[target]; - - if (current.enabled != updated.enabled || force) { - current.enabled = updated.enabled; - Enable(GL_BLEND, static_cast<GLuint>(target), updated.enabled); - } - - if (UpdateTie(std::tie(current.src_rgb_func, current.dst_rgb_func, current.src_a_func, - current.dst_a_func), - std::tie(updated.src_rgb_func, updated.dst_rgb_func, updated.src_a_func, - updated.dst_a_func))) { - glBlendFuncSeparatei(static_cast<GLuint>(target), updated.src_rgb_func, - updated.dst_rgb_func, updated.src_a_func, updated.dst_a_func); - } - - if (UpdateTie(std::tie(current.rgb_equation, current.a_equation), - std::tie(updated.rgb_equation, updated.a_equation))) { - glBlendEquationSeparatei(static_cast<GLuint>(target), updated.rgb_equation, - updated.a_equation); - } -} - -void OpenGLState::ApplyBlending() { - if (!dirty.blend_state) { - return; - } - dirty.blend_state = false; - - if (independant_blend.enabled) { - const bool force = independant_blend.enabled != cur_state.independant_blend.enabled; - for (std::size_t target = 0; target < Maxwell::NumRenderTargets; ++target) { - ApplyTargetBlending(target, force); - } - } else { - ApplyGlobalBlending(); - } - cur_state.independant_blend.enabled 
= independant_blend.enabled; - - if (UpdateTie( - std::tie(cur_state.blend_color.red, cur_state.blend_color.green, - cur_state.blend_color.blue, cur_state.blend_color.alpha), - std::tie(blend_color.red, blend_color.green, blend_color.blue, blend_color.alpha))) { - glBlendColor(blend_color.red, blend_color.green, blend_color.blue, blend_color.alpha); - } -} - -void OpenGLState::ApplyLogicOp() { - Enable(GL_COLOR_LOGIC_OP, cur_state.logic_op.enabled, logic_op.enabled); - - if (UpdateValue(cur_state.logic_op.operation, logic_op.operation)) { - glLogicOp(logic_op.operation); - } -} - -void OpenGLState::ApplyPolygonOffset() { - if (!dirty.polygon_offset) { - return; - } - dirty.polygon_offset = false; - - Enable(GL_POLYGON_OFFSET_FILL, cur_state.polygon_offset.fill_enable, - polygon_offset.fill_enable); - Enable(GL_POLYGON_OFFSET_LINE, cur_state.polygon_offset.line_enable, - polygon_offset.line_enable); - Enable(GL_POLYGON_OFFSET_POINT, cur_state.polygon_offset.point_enable, - polygon_offset.point_enable); - - if (UpdateTie(std::tie(cur_state.polygon_offset.factor, cur_state.polygon_offset.units, - cur_state.polygon_offset.clamp), - std::tie(polygon_offset.factor, polygon_offset.units, polygon_offset.clamp))) { - if (GLAD_GL_EXT_polygon_offset_clamp && polygon_offset.clamp != 0) { - glPolygonOffsetClamp(polygon_offset.factor, polygon_offset.units, polygon_offset.clamp); - } else { - UNIMPLEMENTED_IF_MSG(polygon_offset.clamp != 0, - "Unimplemented Depth polygon offset clamp."); - glPolygonOffset(polygon_offset.factor, polygon_offset.units); - } - } -} - -void OpenGLState::ApplyAlphaTest() { - Enable(GL_ALPHA_TEST, cur_state.alpha_test.enabled, alpha_test.enabled); - if (UpdateTie(std::tie(cur_state.alpha_test.func, cur_state.alpha_test.ref), - std::tie(alpha_test.func, alpha_test.ref))) { - glAlphaFunc(alpha_test.func, alpha_test.ref); - } -} - -void OpenGLState::ApplyClipControl() { - if (UpdateTie(std::tie(cur_state.clip_control.origin, 
cur_state.clip_control.depth_mode), - std::tie(clip_control.origin, clip_control.depth_mode))) { - glClipControl(clip_control.origin, clip_control.depth_mode); - } -} - -void OpenGLState::ApplyTextures() { - const std::size_t size = std::size(textures); - for (std::size_t i = 0; i < size; ++i) { - if (UpdateValue(cur_state.textures[i], textures[i])) { - // BindTextureUnit doesn't support binding null textures, skip those binds. - // TODO(Rodrigo): Stop using null textures - if (textures[i] != 0) { - glBindTextureUnit(static_cast<GLuint>(i), textures[i]); - } - } - } -} - -void OpenGLState::ApplySamplers() { - const std::size_t size = std::size(samplers); - for (std::size_t i = 0; i < size; ++i) { - if (UpdateValue(cur_state.samplers[i], samplers[i])) { - glBindSampler(static_cast<GLuint>(i), samplers[i]); - } - } -} - -void OpenGLState::ApplyImages() { - if (const auto update = UpdateArray(cur_state.images, images)) { - glBindImageTextures(update->first, update->second, images.data() + update->first); - } -} - -void OpenGLState::Apply() { - MICROPROFILE_SCOPE(OpenGL_State); - ApplyFramebufferState(); - ApplyVertexArrayState(); - ApplyShaderProgram(); - ApplyProgramPipeline(); - ApplyClipDistances(); - ApplyPointSize(); - ApplyFragmentColorClamp(); - ApplyMultisample(); - ApplyRasterizerDiscard(); - ApplyColorMask(); - ApplyDepthClamp(); - ApplyViewport(); - ApplyStencilTest(); - ApplySRgb(); - ApplyCulling(); - ApplyDepth(); - ApplyPrimitiveRestart(); - ApplyBlending(); - ApplyLogicOp(); - ApplyTextures(); - ApplySamplers(); - ApplyImages(); - ApplyPolygonOffset(); - ApplyAlphaTest(); - ApplyClipControl(); -} - -void OpenGLState::EmulateViewportWithScissor() { - auto& current = viewports[0]; - if (current.scissor.enabled) { - const GLint left = std::max(current.x, current.scissor.x); - const GLint right = - std::max(current.x + current.width, current.scissor.x + current.scissor.width); - const GLint bottom = std::max(current.y, current.scissor.y); - const GLint top 
= - std::max(current.y + current.height, current.scissor.y + current.scissor.height); - current.scissor.x = std::max(left, 0); - current.scissor.y = std::max(bottom, 0); - current.scissor.width = std::max(right - left, 0); - current.scissor.height = std::max(top - bottom, 0); - } else { - current.scissor.enabled = true; - current.scissor.x = current.x; - current.scissor.y = current.y; - current.scissor.width = current.width; - current.scissor.height = current.height; - } -} - -OpenGLState& OpenGLState::UnbindTexture(GLuint handle) { - for (auto& texture : textures) { - if (texture == handle) { - texture = 0; - } - } - return *this; -} - -OpenGLState& OpenGLState::ResetSampler(GLuint handle) { - for (auto& sampler : samplers) { - if (sampler == handle) { - sampler = 0; - } - } - return *this; -} - -OpenGLState& OpenGLState::ResetProgram(GLuint handle) { - if (draw.shader_program == handle) { - draw.shader_program = 0; - } - return *this; -} - -OpenGLState& OpenGLState::ResetPipeline(GLuint handle) { - if (draw.program_pipeline == handle) { - draw.program_pipeline = 0; - } - return *this; -} - -OpenGLState& OpenGLState::ResetVertexArray(GLuint handle) { - if (draw.vertex_array == handle) { - draw.vertex_array = 0; - } - return *this; -} - -OpenGLState& OpenGLState::ResetFramebuffer(GLuint handle) { - if (draw.read_framebuffer == handle) { - draw.read_framebuffer = 0; - } - if (draw.draw_framebuffer == handle) { - draw.draw_framebuffer = 0; - } - return *this; -} - -} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h deleted file mode 100644 index 4953eeda2..000000000 --- a/src/video_core/renderer_opengl/gl_state.h +++ /dev/null @@ -1,247 +0,0 @@ -// Copyright 2015 Citra Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. 
- -#pragma once - -#include <array> -#include <type_traits> -#include <glad/glad.h> -#include "video_core/engines/maxwell_3d.h" - -namespace OpenGL { - -class OpenGLState { -public: - struct { - bool enabled = false; // GL_FRAMEBUFFER_SRGB - } framebuffer_srgb; - - struct { - bool alpha_to_coverage = false; // GL_ALPHA_TO_COVERAGE - bool alpha_to_one = false; // GL_ALPHA_TO_ONE - } multisample_control; - - struct { - bool enabled = false; // GL_CLAMP_FRAGMENT_COLOR_ARB - } fragment_color_clamp; - - struct { - bool far_plane = false; - bool near_plane = false; - } depth_clamp; // GL_DEPTH_CLAMP - - struct { - bool enabled = false; // GL_CULL_FACE - GLenum mode = GL_BACK; // GL_CULL_FACE_MODE - GLenum front_face = GL_CCW; // GL_FRONT_FACE - } cull; - - struct { - bool test_enabled = false; // GL_DEPTH_TEST - GLboolean write_mask = GL_TRUE; // GL_DEPTH_WRITEMASK - GLenum test_func = GL_LESS; // GL_DEPTH_FUNC - } depth; - - struct { - bool enabled = false; - GLuint index = 0; - } primitive_restart; // GL_PRIMITIVE_RESTART - - bool rasterizer_discard = false; // GL_RASTERIZER_DISCARD - - struct ColorMask { - GLboolean red_enabled = GL_TRUE; - GLboolean green_enabled = GL_TRUE; - GLboolean blue_enabled = GL_TRUE; - GLboolean alpha_enabled = GL_TRUE; - }; - std::array<ColorMask, Tegra::Engines::Maxwell3D::Regs::NumRenderTargets> - color_mask; // GL_COLOR_WRITEMASK - - struct { - bool test_enabled = false; // GL_STENCIL_TEST - struct { - GLenum test_func = GL_ALWAYS; // GL_STENCIL_FUNC - GLint test_ref = 0; // GL_STENCIL_REF - GLuint test_mask = 0xFFFFFFFF; // GL_STENCIL_VALUE_MASK - GLuint write_mask = 0xFFFFFFFF; // GL_STENCIL_WRITEMASK - GLenum action_stencil_fail = GL_KEEP; // GL_STENCIL_FAIL - GLenum action_depth_fail = GL_KEEP; // GL_STENCIL_PASS_DEPTH_FAIL - GLenum action_depth_pass = GL_KEEP; // GL_STENCIL_PASS_DEPTH_PASS - } front, back; - } stencil; - - struct Blend { - bool enabled = false; // GL_BLEND - GLenum rgb_equation = GL_FUNC_ADD; // 
GL_BLEND_EQUATION_RGB - GLenum a_equation = GL_FUNC_ADD; // GL_BLEND_EQUATION_ALPHA - GLenum src_rgb_func = GL_ONE; // GL_BLEND_SRC_RGB - GLenum dst_rgb_func = GL_ZERO; // GL_BLEND_DST_RGB - GLenum src_a_func = GL_ONE; // GL_BLEND_SRC_ALPHA - GLenum dst_a_func = GL_ZERO; // GL_BLEND_DST_ALPHA - }; - std::array<Blend, Tegra::Engines::Maxwell3D::Regs::NumRenderTargets> blend; - - struct { - bool enabled = false; - } independant_blend; - - struct { - GLclampf red = 0.0f; - GLclampf green = 0.0f; - GLclampf blue = 0.0f; - GLclampf alpha = 0.0f; - } blend_color; // GL_BLEND_COLOR - - struct { - bool enabled = false; // GL_LOGIC_OP_MODE - GLenum operation = GL_COPY; - } logic_op; - - static constexpr std::size_t NumSamplers = 32 * 5; - static constexpr std::size_t NumImages = 8 * 5; - std::array<GLuint, NumSamplers> textures = {}; - std::array<GLuint, NumSamplers> samplers = {}; - std::array<GLuint, NumImages> images = {}; - - struct { - GLuint read_framebuffer = 0; // GL_READ_FRAMEBUFFER_BINDING - GLuint draw_framebuffer = 0; // GL_DRAW_FRAMEBUFFER_BINDING - GLuint vertex_array = 0; // GL_VERTEX_ARRAY_BINDING - GLuint shader_program = 0; // GL_CURRENT_PROGRAM - GLuint program_pipeline = 0; // GL_PROGRAM_PIPELINE_BINDING - } draw; - - struct Viewport { - GLint x = 0; - GLint y = 0; - GLint width = 0; - GLint height = 0; - GLfloat depth_range_near = 0.0f; // GL_DEPTH_RANGE - GLfloat depth_range_far = 1.0f; // GL_DEPTH_RANGE - struct { - bool enabled = false; // GL_SCISSOR_TEST - GLint x = 0; - GLint y = 0; - GLsizei width = 0; - GLsizei height = 0; - } scissor; - }; - std::array<Viewport, Tegra::Engines::Maxwell3D::Regs::NumViewports> viewports; - - struct { - bool program_control = false; // GL_PROGRAM_POINT_SIZE - bool sprite = false; // GL_POINT_SPRITE - GLfloat size = 1.0f; // GL_POINT_SIZE - } point; - - struct { - bool point_enable = false; - bool line_enable = false; - bool fill_enable = false; - GLfloat units = 0.0f; - GLfloat factor = 0.0f; - GLfloat clamp = 
0.0f; - } polygon_offset; - - struct { - bool enabled = false; // GL_ALPHA_TEST - GLenum func = GL_ALWAYS; // GL_ALPHA_TEST_FUNC - GLfloat ref = 0.0f; // GL_ALPHA_TEST_REF - } alpha_test; - - std::array<bool, 8> clip_distance = {}; // GL_CLIP_DISTANCE - - struct { - GLenum origin = GL_LOWER_LEFT; - GLenum depth_mode = GL_NEGATIVE_ONE_TO_ONE; - } clip_control; - - OpenGLState(); - - /// Get the currently active OpenGL state - static OpenGLState GetCurState() { - return cur_state; - } - - void SetDefaultViewports(); - /// Apply this state as the current OpenGL state - void Apply(); - - void ApplyFramebufferState(); - void ApplyVertexArrayState(); - void ApplyShaderProgram(); - void ApplyProgramPipeline(); - void ApplyClipDistances(); - void ApplyPointSize(); - void ApplyFragmentColorClamp(); - void ApplyMultisample(); - void ApplySRgb(); - void ApplyCulling(); - void ApplyRasterizerDiscard(); - void ApplyColorMask(); - void ApplyDepth(); - void ApplyPrimitiveRestart(); - void ApplyStencilTest(); - void ApplyViewport(); - void ApplyTargetBlending(std::size_t target, bool force); - void ApplyGlobalBlending(); - void ApplyBlending(); - void ApplyLogicOp(); - void ApplyTextures(); - void ApplySamplers(); - void ApplyImages(); - void ApplyDepthClamp(); - void ApplyPolygonOffset(); - void ApplyAlphaTest(); - void ApplyClipControl(); - - /// Resets any references to the given resource - OpenGLState& UnbindTexture(GLuint handle); - OpenGLState& ResetSampler(GLuint handle); - OpenGLState& ResetProgram(GLuint handle); - OpenGLState& ResetPipeline(GLuint handle); - OpenGLState& ResetVertexArray(GLuint handle); - OpenGLState& ResetFramebuffer(GLuint handle); - - /// Viewport does not affects glClearBuffer so emulate viewport using scissor test - void EmulateViewportWithScissor(); - - void MarkDirtyBlendState() { - dirty.blend_state = true; - } - - void MarkDirtyStencilState() { - dirty.stencil_state = true; - } - - void MarkDirtyPolygonOffset() { - dirty.polygon_offset = true; - 
} - - void MarkDirtyColorMask() { - dirty.color_mask = true; - } - - void AllDirty() { - dirty.blend_state = true; - dirty.stencil_state = true; - dirty.polygon_offset = true; - dirty.color_mask = true; - } - -private: - static OpenGLState cur_state; - - struct { - bool blend_state; - bool stencil_state; - bool viewport_state; - bool polygon_offset; - bool color_mask; - } dirty{}; -}; -static_assert(std::is_trivially_copyable_v<OpenGLState>); - -} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_state_tracker.cpp b/src/video_core/renderer_opengl/gl_state_tracker.cpp new file mode 100644 index 000000000..255ac3147 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_state_tracker.cpp @@ -0,0 +1,247 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <algorithm> +#include <array> +#include <cstddef> + +#include "common/common_types.h" +#include "core/core.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/gpu.h" +#include "video_core/renderer_opengl/gl_state_tracker.h" + +#define OFF(field_name) MAXWELL3D_REG_INDEX(field_name) +#define NUM(field_name) (sizeof(Maxwell3D::Regs::field_name) / sizeof(u32)) + +namespace OpenGL { + +namespace { + +using namespace Dirty; +using namespace VideoCommon::Dirty; +using Tegra::Engines::Maxwell3D; +using Regs = Maxwell3D::Regs; +using Tables = Maxwell3D::DirtyState::Tables; +using Table = Maxwell3D::DirtyState::Table; + +void SetupDirtyColorMasks(Tables& tables) { + tables[0][OFF(color_mask_common)] = ColorMaskCommon; + for (std::size_t rt = 0; rt < Regs::NumRenderTargets; ++rt) { + const std::size_t offset = OFF(color_mask) + rt * NUM(color_mask[0]); + FillBlock(tables[0], offset, NUM(color_mask[0]), ColorMask0 + rt); + } + + FillBlock(tables[1], OFF(color_mask), NUM(color_mask), ColorMasks); +} + +void SetupDirtyVertexArrays(Tables& tables) { + static constexpr std::size_t num_array = 3; + static 
constexpr std::size_t instance_base_offset = 3; + for (std::size_t i = 0; i < Regs::NumVertexArrays; ++i) { + const std::size_t array_offset = OFF(vertex_array) + i * NUM(vertex_array[0]); + const std::size_t limit_offset = OFF(vertex_array_limit) + i * NUM(vertex_array_limit[0]); + + FillBlock(tables, array_offset, num_array, VertexBuffer0 + i, VertexBuffers); + FillBlock(tables, limit_offset, NUM(vertex_array_limit), VertexBuffer0 + i, VertexBuffers); + + const std::size_t instance_array_offset = array_offset + instance_base_offset; + tables[0][instance_array_offset] = static_cast<u8>(VertexInstance0 + i); + tables[1][instance_array_offset] = VertexInstances; + + const std::size_t instance_offset = OFF(instanced_arrays) + i; + tables[0][instance_offset] = static_cast<u8>(VertexInstance0 + i); + tables[1][instance_offset] = VertexInstances; + } +} + +void SetupDirtyVertexFormat(Tables& tables) { + for (std::size_t i = 0; i < Regs::NumVertexAttributes; ++i) { + const std::size_t offset = OFF(vertex_attrib_format) + i * NUM(vertex_attrib_format[0]); + FillBlock(tables[0], offset, NUM(vertex_attrib_format[0]), VertexFormat0 + i); + } + + FillBlock(tables[1], OFF(vertex_attrib_format), Regs::NumVertexAttributes, VertexFormats); +} + +void SetupDirtyViewports(Tables& tables) { + for (std::size_t i = 0; i < Regs::NumViewports; ++i) { + const std::size_t transf_offset = OFF(viewport_transform) + i * NUM(viewport_transform[0]); + const std::size_t viewport_offset = OFF(viewports) + i * NUM(viewports[0]); + + FillBlock(tables[0], transf_offset, NUM(viewport_transform[0]), Viewport0 + i); + FillBlock(tables[0], viewport_offset, NUM(viewports[0]), Viewport0 + i); + } + + FillBlock(tables[1], OFF(viewport_transform), NUM(viewport_transform), Viewports); + FillBlock(tables[1], OFF(viewports), NUM(viewports), Viewports); + + tables[0][OFF(viewport_transform_enabled)] = ViewportTransform; + tables[1][OFF(viewport_transform_enabled)] = Viewports; +} + +void 
SetupDirtyScissors(Tables& tables) { + for (std::size_t i = 0; i < Regs::NumViewports; ++i) { + const std::size_t offset = OFF(scissor_test) + i * NUM(scissor_test[0]); + FillBlock(tables[0], offset, NUM(scissor_test[0]), Scissor0 + i); + } + FillBlock(tables[1], OFF(scissor_test), NUM(scissor_test), Scissors); +} + +void SetupDirtyShaders(Tables& tables) { + FillBlock(tables[0], OFF(shader_config[0]), NUM(shader_config[0]) * Regs::MaxShaderProgram, + Shaders); +} + +void SetupDirtyPolygonModes(Tables& tables) { + tables[0][OFF(polygon_mode_front)] = PolygonModeFront; + tables[0][OFF(polygon_mode_back)] = PolygonModeBack; + + tables[1][OFF(polygon_mode_front)] = PolygonModes; + tables[1][OFF(polygon_mode_back)] = PolygonModes; + tables[0][OFF(fill_rectangle)] = PolygonModes; +} + +void SetupDirtyDepthTest(Tables& tables) { + auto& table = tables[0]; + table[OFF(depth_test_enable)] = DepthTest; + table[OFF(depth_write_enabled)] = DepthMask; + table[OFF(depth_test_func)] = DepthTest; +} + +void SetupDirtyStencilTest(Tables& tables) { + static constexpr std::array offsets = { + OFF(stencil_enable), OFF(stencil_front_func_func), OFF(stencil_front_func_ref), + OFF(stencil_front_func_mask), OFF(stencil_front_op_fail), OFF(stencil_front_op_zfail), + OFF(stencil_front_op_zpass), OFF(stencil_front_mask), OFF(stencil_two_side_enable), + OFF(stencil_back_func_func), OFF(stencil_back_func_ref), OFF(stencil_back_func_mask), + OFF(stencil_back_op_fail), OFF(stencil_back_op_zfail), OFF(stencil_back_op_zpass), + OFF(stencil_back_mask)}; + for (const auto offset : offsets) { + tables[0][offset] = StencilTest; + } +} + +void SetupDirtyAlphaTest(Tables& tables) { + auto& table = tables[0]; + table[OFF(alpha_test_ref)] = AlphaTest; + table[OFF(alpha_test_func)] = AlphaTest; + table[OFF(alpha_test_enabled)] = AlphaTest; +} + +void SetupDirtyBlend(Tables& tables) { + FillBlock(tables[0], OFF(blend_color), NUM(blend_color), BlendColor); + + tables[0][OFF(independent_blend_enable)] = 
BlendIndependentEnabled; + + for (std::size_t i = 0; i < Regs::NumRenderTargets; ++i) { + const std::size_t offset = OFF(independent_blend) + i * NUM(independent_blend[0]); + FillBlock(tables[0], offset, NUM(independent_blend[0]), BlendState0 + i); + + tables[0][OFF(blend.enable) + i] = static_cast<u8>(BlendState0 + i); + } + FillBlock(tables[1], OFF(independent_blend), NUM(independent_blend), BlendStates); + FillBlock(tables[1], OFF(blend), NUM(blend), BlendStates); +} + +void SetupDirtyPrimitiveRestart(Tables& tables) { + FillBlock(tables[0], OFF(primitive_restart), NUM(primitive_restart), PrimitiveRestart); +} + +void SetupDirtyPolygonOffset(Tables& tables) { + auto& table = tables[0]; + table[OFF(polygon_offset_fill_enable)] = PolygonOffset; + table[OFF(polygon_offset_line_enable)] = PolygonOffset; + table[OFF(polygon_offset_point_enable)] = PolygonOffset; + table[OFF(polygon_offset_factor)] = PolygonOffset; + table[OFF(polygon_offset_units)] = PolygonOffset; + table[OFF(polygon_offset_clamp)] = PolygonOffset; +} + +void SetupDirtyMultisampleControl(Tables& tables) { + FillBlock(tables[0], OFF(multisample_control), NUM(multisample_control), MultisampleControl); +} + +void SetupDirtyRasterizeEnable(Tables& tables) { + tables[0][OFF(rasterize_enable)] = RasterizeEnable; +} + +void SetupDirtyFramebufferSRGB(Tables& tables) { + tables[0][OFF(framebuffer_srgb)] = FramebufferSRGB; +} + +void SetupDirtyLogicOp(Tables& tables) { + FillBlock(tables[0], OFF(logic_op), NUM(logic_op), LogicOp); +} + +void SetupDirtyFragmentClampColor(Tables& tables) { + tables[0][OFF(frag_color_clamp)] = FragmentClampColor; +} + +void SetupDirtyPointSize(Tables& tables) { + tables[0][OFF(vp_point_size)] = PointSize; + tables[0][OFF(point_size)] = PointSize; + tables[0][OFF(point_sprite_enable)] = PointSize; +} + +void SetupDirtyClipControl(Tables& tables) { + auto& table = tables[0]; + table[OFF(screen_y_control)] = ClipControl; + table[OFF(depth_mode)] = ClipControl; +} + +void 
SetupDirtyDepthClampEnabled(Tables& tables) { + tables[0][OFF(view_volume_clip_control)] = DepthClampEnabled; +} + +void SetupDirtyMisc(Tables& tables) { + auto& table = tables[0]; + + table[OFF(clip_distance_enabled)] = ClipDistances; + + table[OFF(front_face)] = FrontFace; + + table[OFF(cull_test_enabled)] = CullTest; + table[OFF(cull_face)] = CullTest; +} + +} // Anonymous namespace + +StateTracker::StateTracker(Core::System& system) : system{system} {} + +void StateTracker::Initialize() { + auto& dirty = system.GPU().Maxwell3D().dirty; + auto& tables = dirty.tables; + SetupDirtyRenderTargets(tables); + SetupDirtyColorMasks(tables); + SetupDirtyViewports(tables); + SetupDirtyScissors(tables); + SetupDirtyVertexArrays(tables); + SetupDirtyVertexFormat(tables); + SetupDirtyShaders(tables); + SetupDirtyPolygonModes(tables); + SetupDirtyDepthTest(tables); + SetupDirtyStencilTest(tables); + SetupDirtyAlphaTest(tables); + SetupDirtyBlend(tables); + SetupDirtyPrimitiveRestart(tables); + SetupDirtyPolygonOffset(tables); + SetupDirtyMultisampleControl(tables); + SetupDirtyRasterizeEnable(tables); + SetupDirtyFramebufferSRGB(tables); + SetupDirtyLogicOp(tables); + SetupDirtyFragmentClampColor(tables); + SetupDirtyPointSize(tables); + SetupDirtyClipControl(tables); + SetupDirtyDepthClampEnabled(tables); + SetupDirtyMisc(tables); + + auto& store = dirty.on_write_stores; + store[VertexBuffers] = true; + for (std::size_t i = 0; i < Regs::NumVertexArrays; ++i) { + store[VertexBuffer0 + i] = true; + } +} + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_state_tracker.h b/src/video_core/renderer_opengl/gl_state_tracker.h new file mode 100644 index 000000000..b882d75c3 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_state_tracker.h @@ -0,0 +1,215 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include <limits> + +#include <glad/glad.h> + +#include "common/common_types.h" +#include "core/core.h" +#include "video_core/dirty_flags.h" +#include "video_core/engines/maxwell_3d.h" + +namespace Core { +class System; +} + +namespace OpenGL { + +namespace Dirty { + +enum : u8 { + First = VideoCommon::Dirty::LastCommonEntry, + + VertexFormats, + VertexFormat0, + VertexFormat31 = VertexFormat0 + 31, + + VertexBuffers, + VertexBuffer0, + VertexBuffer31 = VertexBuffer0 + 31, + + VertexInstances, + VertexInstance0, + VertexInstance31 = VertexInstance0 + 31, + + ViewportTransform, + Viewports, + Viewport0, + Viewport15 = Viewport0 + 15, + + Scissors, + Scissor0, + Scissor15 = Scissor0 + 15, + + ColorMaskCommon, + ColorMasks, + ColorMask0, + ColorMask7 = ColorMask0 + 7, + + BlendColor, + BlendIndependentEnabled, + BlendStates, + BlendState0, + BlendState7 = BlendState0 + 7, + + Shaders, + ClipDistances, + + PolygonModes, + PolygonModeFront, + PolygonModeBack, + + ColorMask, + FrontFace, + CullTest, + DepthMask, + DepthTest, + StencilTest, + AlphaTest, + PrimitiveRestart, + PolygonOffset, + MultisampleControl, + RasterizeEnable, + FramebufferSRGB, + LogicOp, + FragmentClampColor, + PointSize, + ClipControl, + DepthClampEnabled, + + Last +}; +static_assert(Last <= std::numeric_limits<u8>::max()); + +} // namespace Dirty + +class StateTracker { +public: + explicit StateTracker(Core::System& system); + + void Initialize(); + + void BindIndexBuffer(GLuint new_index_buffer) { + if (index_buffer == new_index_buffer) { + return; + } + index_buffer = new_index_buffer; + glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, new_index_buffer); + } + + void NotifyScreenDrawVertexArray() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[OpenGL::Dirty::VertexFormats] = true; + flags[OpenGL::Dirty::VertexFormat0 + 0] = true; + flags[OpenGL::Dirty::VertexFormat0 + 1] = true; + + flags[OpenGL::Dirty::VertexBuffers] = true; + flags[OpenGL::Dirty::VertexBuffer0] = true; + 
+ flags[OpenGL::Dirty::VertexInstances] = true; + flags[OpenGL::Dirty::VertexInstance0 + 0] = true; + flags[OpenGL::Dirty::VertexInstance0 + 1] = true; + } + + void NotifyPolygonModes() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[OpenGL::Dirty::PolygonModes] = true; + flags[OpenGL::Dirty::PolygonModeFront] = true; + flags[OpenGL::Dirty::PolygonModeBack] = true; + } + + void NotifyViewport0() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[OpenGL::Dirty::Viewports] = true; + flags[OpenGL::Dirty::Viewport0] = true; + } + + void NotifyScissor0() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[OpenGL::Dirty::Scissors] = true; + flags[OpenGL::Dirty::Scissor0] = true; + } + + void NotifyColorMask0() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[OpenGL::Dirty::ColorMasks] = true; + flags[OpenGL::Dirty::ColorMask0] = true; + } + + void NotifyBlend0() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[OpenGL::Dirty::BlendStates] = true; + flags[OpenGL::Dirty::BlendState0] = true; + } + + void NotifyFramebuffer() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[VideoCommon::Dirty::RenderTargets] = true; + } + + void NotifyFrontFace() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[OpenGL::Dirty::FrontFace] = true; + } + + void NotifyCullTest() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[OpenGL::Dirty::CullTest] = true; + } + + void NotifyDepthMask() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[OpenGL::Dirty::DepthMask] = true; + } + + void NotifyDepthTest() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[OpenGL::Dirty::DepthTest] = true; + } + + void NotifyStencilTest() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[OpenGL::Dirty::StencilTest] = true; + } + + void NotifyPolygonOffset() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[OpenGL::Dirty::PolygonOffset] = true; + } + 
+ void NotifyRasterizeEnable() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[OpenGL::Dirty::RasterizeEnable] = true; + } + + void NotifyFramebufferSRGB() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[OpenGL::Dirty::FramebufferSRGB] = true; + } + + void NotifyLogicOp() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[OpenGL::Dirty::LogicOp] = true; + } + + void NotifyClipControl() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[OpenGL::Dirty::ClipControl] = true; + } + + void NotifyAlphaTest() { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + flags[OpenGL::Dirty::AlphaTest] = true; + } + +private: + Core::System& system; + + GLuint index_buffer = 0; +}; + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp index 35ba334e4..6ec328c53 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp +++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp @@ -7,7 +7,6 @@ #include "common/alignment.h" #include "common/assert.h" #include "common/microprofile.h" -#include "video_core/renderer_opengl/gl_state.h" #include "video_core/renderer_opengl/gl_stream_buffer.h" MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning", diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 5c1ae1418..f424e3000 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -10,7 +10,7 @@ #include "core/core.h" #include "video_core/morton.h" #include "video_core/renderer_opengl/gl_resource_manager.h" -#include "video_core/renderer_opengl/gl_state.h" +#include "video_core/renderer_opengl/gl_state_tracker.h" #include "video_core/renderer_opengl/gl_texture_cache.h" #include "video_core/renderer_opengl/utils.h" #include "video_core/texture_cache/surface_base.h" @@ -53,6 +53,7 
@@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format {GL_R8UI, GL_RED_INTEGER, GL_UNSIGNED_BYTE, false}, // R8UI {GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT, false}, // RGBA16F {GL_RGBA16, GL_RGBA, GL_UNSIGNED_SHORT, false}, // RGBA16U + {GL_RGBA16_SNORM, GL_RGBA, GL_SHORT, false}, // RGBA16S {GL_RGBA16UI, GL_RGBA_INTEGER, GL_UNSIGNED_SHORT, false}, // RGBA16UI {GL_R11F_G11F_B10F, GL_RGB, GL_UNSIGNED_INT_10F_11F_11F_REV, false}, // R11FG11FB10F {GL_RGBA32UI, GL_RGBA_INTEGER, GL_UNSIGNED_INT, false}, // RGBA32UI @@ -87,6 +88,7 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format {GL_RG32UI, GL_RG_INTEGER, GL_UNSIGNED_INT, false}, // RG32UI {GL_RGB16F, GL_RGBA, GL_HALF_FLOAT, false}, // RGBX16F {GL_R32UI, GL_RED_INTEGER, GL_UNSIGNED_INT, false}, // R32UI + {GL_R32I, GL_RED_INTEGER, GL_INT, false}, // R32I {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, false}, // ASTC_2D_8X8 {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, false}, // ASTC_2D_8X5 {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, false}, // ASTC_2D_5X4 @@ -396,6 +398,7 @@ CachedSurfaceView::CachedSurfaceView(CachedSurface& surface, const ViewParams& p const bool is_proxy) : VideoCommon::ViewBase(params), surface{surface}, is_proxy{is_proxy} { target = GetTextureTarget(params.target); + format = GetFormatTuple(surface.GetSurfaceParams().pixel_format).internal_format; if (!is_proxy) { texture_view = CreateTextureView(); } @@ -405,24 +408,36 @@ CachedSurfaceView::CachedSurfaceView(CachedSurface& surface, const ViewParams& p CachedSurfaceView::~CachedSurfaceView() = default; void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const { - ASSERT(params.num_layers == 1 && params.num_levels == 1); + ASSERT(params.num_levels == 1); - const auto& owner_params = surface.GetSurfaceParams(); + const GLuint texture = surface.GetTexture(); + if (params.num_layers > 1) { + // Layered framebuffer attachments + UNIMPLEMENTED_IF(params.base_layer != 0); - switch (owner_params.target) { + 
switch (params.target) { + case SurfaceTarget::Texture2DArray: + glFramebufferTexture(target, attachment, texture, params.base_level); + break; + default: + UNIMPLEMENTED(); + } + return; + } + + const GLenum view_target = surface.GetTarget(); + switch (surface.GetSurfaceParams().target) { case SurfaceTarget::Texture1D: - glFramebufferTexture1D(target, attachment, surface.GetTarget(), surface.GetTexture(), - params.base_level); + glFramebufferTexture1D(target, attachment, view_target, texture, params.base_level); break; case SurfaceTarget::Texture2D: - glFramebufferTexture2D(target, attachment, surface.GetTarget(), surface.GetTexture(), - params.base_level); + glFramebufferTexture2D(target, attachment, view_target, texture, params.base_level); break; case SurfaceTarget::Texture1DArray: case SurfaceTarget::Texture2DArray: case SurfaceTarget::TextureCubemap: case SurfaceTarget::TextureCubeArray: - glFramebufferTextureLayer(target, attachment, surface.GetTexture(), params.base_level, + glFramebufferTextureLayer(target, attachment, texture, params.base_level, params.base_layer); break; default: @@ -454,25 +469,20 @@ void CachedSurfaceView::ApplySwizzle(SwizzleSource x_source, SwizzleSource y_sou } OGLTextureView CachedSurfaceView::CreateTextureView() const { - const auto& owner_params = surface.GetSurfaceParams(); OGLTextureView texture_view; texture_view.Create(); - const GLuint handle{texture_view.handle}; - const FormatTuple& tuple{GetFormatTuple(owner_params.pixel_format)}; - - glTextureView(handle, target, surface.texture.handle, tuple.internal_format, params.base_level, + glTextureView(texture_view.handle, target, surface.texture.handle, format, params.base_level, params.num_levels, params.base_layer, params.num_layers); - - ApplyTextureDefaults(owner_params, handle); + ApplyTextureDefaults(surface.GetSurfaceParams(), texture_view.handle); return texture_view; } TextureCacheOpenGL::TextureCacheOpenGL(Core::System& system, VideoCore::RasterizerInterface& 
rasterizer, - const Device& device) - : TextureCacheBase{system, rasterizer} { + const Device& device, StateTracker& state_tracker) + : TextureCacheBase{system, rasterizer}, state_tracker{state_tracker} { src_framebuffer.Create(); dst_framebuffer.Create(); } @@ -506,25 +516,26 @@ void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view, const Tegra::Engines::Fermi2D::Config& copy_config) { const auto& src_params{src_view->GetSurfaceParams()}; const auto& dst_params{dst_view->GetSurfaceParams()}; + UNIMPLEMENTED_IF(src_params.target == SurfaceTarget::Texture3D); + UNIMPLEMENTED_IF(dst_params.target == SurfaceTarget::Texture3D); - OpenGLState prev_state{OpenGLState::GetCurState()}; - SCOPE_EXIT({ - prev_state.AllDirty(); - prev_state.Apply(); - }); - - OpenGLState state; - state.draw.read_framebuffer = src_framebuffer.handle; - state.draw.draw_framebuffer = dst_framebuffer.handle; - state.framebuffer_srgb.enabled = dst_params.srgb_conversion; - state.AllDirty(); - state.Apply(); + state_tracker.NotifyScissor0(); + state_tracker.NotifyFramebuffer(); + state_tracker.NotifyRasterizeEnable(); + state_tracker.NotifyFramebufferSRGB(); - u32 buffers{}; + if (dst_params.srgb_conversion) { + glEnable(GL_FRAMEBUFFER_SRGB); + } else { + glDisable(GL_FRAMEBUFFER_SRGB); + } + glDisable(GL_RASTERIZER_DISCARD); + glDisablei(GL_SCISSOR_TEST, 0); - UNIMPLEMENTED_IF(src_params.target == SurfaceTarget::Texture3D); - UNIMPLEMENTED_IF(dst_params.target == SurfaceTarget::Texture3D); + glBindFramebuffer(GL_READ_FRAMEBUFFER, src_framebuffer.handle); + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, dst_framebuffer.handle); + GLenum buffers = 0; if (src_params.type == SurfaceType::ColorTexture) { src_view->Attach(GL_COLOR_ATTACHMENT0, GL_READ_FRAMEBUFFER); glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index 8e13ab38b..6658c6ffd 100644 
--- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -27,6 +27,7 @@ using VideoCommon::ViewParams; class CachedSurfaceView; class CachedSurface; class TextureCacheOpenGL; +class StateTracker; using Surface = std::shared_ptr<CachedSurface>; using View = std::shared_ptr<CachedSurfaceView>; @@ -96,6 +97,10 @@ public: return texture_view.handle; } + GLenum GetFormat() const { + return format; + } + const SurfaceParams& GetSurfaceParams() const { return surface.GetSurfaceParams(); } @@ -113,6 +118,7 @@ private: CachedSurface& surface; GLenum target{}; + GLenum format{}; OGLTextureView texture_view; u32 swizzle{}; @@ -122,7 +128,7 @@ private: class TextureCacheOpenGL final : public TextureCacheBase { public: explicit TextureCacheOpenGL(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - const Device& device); + const Device& device, StateTracker& state_tracker); ~TextureCacheOpenGL(); protected: @@ -139,6 +145,8 @@ protected: private: GLuint FetchPBO(std::size_t buffer_size); + StateTracker& state_tracker; + OGLFramebuffer src_framebuffer; OGLFramebuffer dst_framebuffer; std::unordered_map<u32, OGLBuffer> copy_pbo_cache; diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h index 7ed505628..89f0e04ef 100644 --- a/src/video_core/renderer_opengl/maxwell_to_gl.h +++ b/src/video_core/renderer_opengl/maxwell_to_gl.h @@ -92,8 +92,32 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) { } case Maxwell::VertexAttribute::Type::UnsignedScaled: switch (attrib.size) { + case Maxwell::VertexAttribute::Size::Size_8: case Maxwell::VertexAttribute::Size::Size_8_8: + case Maxwell::VertexAttribute::Size::Size_8_8_8: + case Maxwell::VertexAttribute::Size::Size_8_8_8_8: return GL_UNSIGNED_BYTE; + case Maxwell::VertexAttribute::Size::Size_16: + case Maxwell::VertexAttribute::Size::Size_16_16: + case Maxwell::VertexAttribute::Size::Size_16_16_16: + case 
Maxwell::VertexAttribute::Size::Size_16_16_16_16: + return GL_UNSIGNED_SHORT; + default: + LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); + return {}; + } + case Maxwell::VertexAttribute::Type::SignedScaled: + switch (attrib.size) { + case Maxwell::VertexAttribute::Size::Size_8: + case Maxwell::VertexAttribute::Size::Size_8_8: + case Maxwell::VertexAttribute::Size::Size_8_8_8: + case Maxwell::VertexAttribute::Size::Size_8_8_8_8: + return GL_BYTE; + case Maxwell::VertexAttribute::Size::Size_16: + case Maxwell::VertexAttribute::Size::Size_16_16: + case Maxwell::VertexAttribute::Size::Size_16_16_16: + case Maxwell::VertexAttribute::Size::Size_16_16_16_16: + return GL_SHORT; default: LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); return {}; @@ -401,24 +425,24 @@ inline GLenum StencilOp(Maxwell::StencilOp stencil) { return GL_KEEP; } -inline GLenum FrontFace(Maxwell::Cull::FrontFace front_face) { +inline GLenum FrontFace(Maxwell::FrontFace front_face) { switch (front_face) { - case Maxwell::Cull::FrontFace::ClockWise: + case Maxwell::FrontFace::ClockWise: return GL_CW; - case Maxwell::Cull::FrontFace::CounterClockWise: + case Maxwell::FrontFace::CounterClockWise: return GL_CCW; } LOG_ERROR(Render_OpenGL, "Unimplemented front face cull={}", static_cast<u32>(front_face)); return GL_CCW; } -inline GLenum CullFace(Maxwell::Cull::CullFace cull_face) { +inline GLenum CullFace(Maxwell::CullFace cull_face) { switch (cull_face) { - case Maxwell::Cull::CullFace::Front: + case Maxwell::CullFace::Front: return GL_FRONT; - case Maxwell::Cull::CullFace::Back: + case Maxwell::CullFace::Back: return GL_BACK; - case Maxwell::Cull::CullFace::FrontAndBack: + case Maxwell::CullFace::FrontAndBack: return GL_FRONT_AND_BACK; } LOG_ERROR(Render_OpenGL, "Unimplemented cull face={}", static_cast<u32>(cull_face)); @@ -464,5 +488,18 @@ inline GLenum LogicOp(Maxwell::LogicOperation operation) { return GL_COPY; } +inline GLenum 
PolygonMode(Maxwell::PolygonMode polygon_mode) { + switch (polygon_mode) { + case Maxwell::PolygonMode::Point: + return GL_POINT; + case Maxwell::PolygonMode::Line: + return GL_LINE; + case Maxwell::PolygonMode::Fill: + return GL_FILL; + } + UNREACHABLE_MSG("Invalid polygon mode={}", static_cast<int>(polygon_mode)); + return GL_FILL; +} + } // namespace MaxwellToGL } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index bba16afaf..fca5e3ec0 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -5,30 +5,54 @@ #include <algorithm> #include <cstddef> #include <cstdlib> +#include <cstring> #include <memory> + #include <glad/glad.h> + #include "common/assert.h" #include "common/logging/log.h" +#include "common/microprofile.h" #include "common/telemetry.h" #include "core/core.h" #include "core/core_timing.h" #include "core/frontend/emu_window.h" -#include "core/frontend/scope_acquire_window_context.h" #include "core/memory.h" #include "core/perf_stats.h" #include "core/settings.h" #include "core/telemetry_session.h" #include "video_core/morton.h" #include "video_core/renderer_opengl/gl_rasterizer.h" +#include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/renderer_opengl.h" namespace OpenGL { namespace { -constexpr char vertex_shader[] = R"( +// If the size of this is too small, it ends up creating a soft cap on FPS as the renderer will have +// to wait on available presentation frames. 
+constexpr std::size_t SWAP_CHAIN_SIZE = 3; + +struct Frame { + u32 width{}; /// Width of the frame (to detect resize) + u32 height{}; /// Height of the frame + bool color_reloaded{}; /// Texture attachment was recreated (ie: resized) + OpenGL::OGLRenderbuffer color{}; /// Buffer shared between the render/present FBO + OpenGL::OGLFramebuffer render{}; /// FBO created on the render thread + OpenGL::OGLFramebuffer present{}; /// FBO created on the present thread + GLsync render_fence{}; /// Fence created on the render thread + GLsync present_fence{}; /// Fence created on the presentation thread + bool is_srgb{}; /// Framebuffer is sRGB or RGB +}; + +constexpr char VERTEX_SHADER[] = R"( #version 430 core +out gl_PerVertex { + vec4 gl_Position; +}; + layout (location = 0) in vec2 vert_position; layout (location = 1) in vec2 vert_tex_coord; layout (location = 0) out vec2 frag_tex_coord; @@ -49,7 +73,7 @@ void main() { } )"; -constexpr char fragment_shader[] = R"( +constexpr char FRAGMENT_SHADER[] = R"( #version 430 core layout (location = 0) in vec2 frag_tex_coord; @@ -58,7 +82,7 @@ layout (location = 0) out vec4 color; layout (binding = 0) uniform sampler2D color_texture; void main() { - color = texture(color_texture, frag_tex_coord); + color = vec4(texture(color_texture, frag_tex_coord).rgb, 1.0f); } )"; @@ -67,13 +91,31 @@ constexpr GLint TexCoordLocation = 1; constexpr GLint ModelViewMatrixLocation = 0; struct ScreenRectVertex { - constexpr ScreenRectVertex(GLfloat x, GLfloat y, GLfloat u, GLfloat v) - : position{{x, y}}, tex_coord{{u, v}} {} + constexpr ScreenRectVertex(u32 x, u32 y, GLfloat u, GLfloat v) + : position{{static_cast<GLfloat>(x), static_cast<GLfloat>(y)}}, tex_coord{{u, v}} {} std::array<GLfloat, 2> position; std::array<GLfloat, 2> tex_coord; }; +/// Returns true if any debug tool is attached +bool HasDebugTool() { + const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED"); + if (nsight) { + return true; + } + + GLint 
num_extensions; + glGetIntegerv(GL_NUM_EXTENSIONS, &num_extensions); + for (GLuint index = 0; index < static_cast<GLuint>(num_extensions); ++index) { + const auto name = reinterpret_cast<const char*>(glGetStringi(GL_EXTENSIONS, index)); + if (!std::strcmp(name, "GL_EXT_debug_tool")) { + return true; + } + } + return false; +} + /** * Defines a 1:1 pixel ortographic projection matrix with (0,0) on the top-left * corner and (width, height) on the lower-bottom. @@ -157,22 +199,229 @@ void APIENTRY DebugHandler(GLenum source, GLenum type, GLuint id, GLenum severit } // Anonymous namespace +/** + * For smooth Vsync rendering, we want to always present the latest frame that the core generates, + * but also make sure that rendering happens at the pace that the frontend dictates. This is a + * helper class that the renderer uses to sync frames between the render thread and the presentation + * thread + */ +class FrameMailbox { +public: + std::mutex swap_chain_lock; + std::condition_variable present_cv; + std::array<Frame, SWAP_CHAIN_SIZE> swap_chain{}; + std::queue<Frame*> free_queue; + std::deque<Frame*> present_queue; + Frame* previous_frame{}; + + FrameMailbox() : has_debug_tool{HasDebugTool()} { + for (auto& frame : swap_chain) { + free_queue.push(&frame); + } + } + + ~FrameMailbox() { + // lock the mutex and clear out the present and free_queues and notify any people who are + // blocked to prevent deadlock on shutdown + std::scoped_lock lock{swap_chain_lock}; + std::queue<Frame*>().swap(free_queue); + present_queue.clear(); + present_cv.notify_all(); + } + + void ReloadPresentFrame(Frame* frame, u32 height, u32 width) { + frame->present.Release(); + frame->present.Create(); + GLint previous_draw_fbo{}; + glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &previous_draw_fbo); + glBindFramebuffer(GL_FRAMEBUFFER, frame->present.handle); + glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, + frame->color.handle); + if 
(glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) { + LOG_CRITICAL(Render_OpenGL, "Failed to recreate present FBO!"); + } + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, previous_draw_fbo); + frame->color_reloaded = false; + } + + void ReloadRenderFrame(Frame* frame, u32 width, u32 height) { + // Recreate the color texture attachment + frame->color.Release(); + frame->color.Create(); + const GLenum internal_format = frame->is_srgb ? GL_SRGB8 : GL_RGB8; + glNamedRenderbufferStorage(frame->color.handle, internal_format, width, height); + + // Recreate the FBO for the render target + frame->render.Release(); + frame->render.Create(); + glBindFramebuffer(GL_FRAMEBUFFER, frame->render.handle); + glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, + frame->color.handle); + if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) { + LOG_CRITICAL(Render_OpenGL, "Failed to recreate render FBO!"); + } + + frame->width = width; + frame->height = height; + frame->color_reloaded = true; + } + + Frame* GetRenderFrame() { + std::unique_lock lock{swap_chain_lock}; + + // If theres no free frames, we will reuse the oldest render frame + if (free_queue.empty()) { + auto frame = present_queue.back(); + present_queue.pop_back(); + return frame; + } + + Frame* frame = free_queue.front(); + free_queue.pop(); + return frame; + } + + void ReleaseRenderFrame(Frame* frame) { + std::unique_lock lock{swap_chain_lock}; + present_queue.push_front(frame); + present_cv.notify_one(); + + DebugNotifyNextFrame(); + } + + Frame* TryGetPresentFrame(int timeout_ms) { + DebugWaitForNextFrame(); + + std::unique_lock lock{swap_chain_lock}; + // wait for new entries in the present_queue + present_cv.wait_for(lock, std::chrono::milliseconds(timeout_ms), + [&] { return !present_queue.empty(); }); + if (present_queue.empty()) { + // timed out waiting for a frame to draw so return the previous frame + return previous_frame; + } + + // free the previous 
frame and add it back to the free queue + if (previous_frame) { + free_queue.push(previous_frame); + } + + // the newest entries are pushed to the front of the queue + Frame* frame = present_queue.front(); + present_queue.pop_front(); + // remove all old entries from the present queue and move them back to the free_queue + for (auto f : present_queue) { + free_queue.push(f); + } + present_queue.clear(); + previous_frame = frame; + return frame; + } + +private: + std::mutex debug_synch_mutex; + std::condition_variable debug_synch_condition; + std::atomic_int frame_for_debug{}; + const bool has_debug_tool; // When true, using a GPU debugger, so keep frames in lock-step + + /// Signal that a new frame is available (called from GPU thread) + void DebugNotifyNextFrame() { + if (!has_debug_tool) { + return; + } + frame_for_debug++; + std::lock_guard lock{debug_synch_mutex}; + debug_synch_condition.notify_one(); + } + + /// Wait for a new frame to be available (called from presentation thread) + void DebugWaitForNextFrame() { + if (!has_debug_tool) { + return; + } + const int last_frame = frame_for_debug; + std::unique_lock lock{debug_synch_mutex}; + debug_synch_condition.wait(lock, + [this, last_frame] { return frame_for_debug > last_frame; }); + } +}; + RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system) - : VideoCore::RendererBase{emu_window}, emu_window{emu_window}, system{system} {} + : VideoCore::RendererBase{emu_window}, emu_window{emu_window}, system{system}, + frame_mailbox{std::make_unique<FrameMailbox>()} {} RendererOpenGL::~RendererOpenGL() = default; +MICROPROFILE_DEFINE(OpenGL_RenderFrame, "OpenGL", "Render Frame", MP_RGB(128, 128, 64)); +MICROPROFILE_DEFINE(OpenGL_WaitPresent, "OpenGL", "Wait For Present", MP_RGB(128, 128, 128)); + void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { - // Maintain the rasterizer's state as a priority - OpenGLState prev_state = OpenGLState::GetCurState(); - 
state.AllDirty(); - state.Apply(); + render_window.PollEvents(); + + if (!framebuffer) { + return; + } + + PrepareRendertarget(framebuffer); + RenderScreenshot(); + + Frame* frame; + { + MICROPROFILE_SCOPE(OpenGL_WaitPresent); + + frame = frame_mailbox->GetRenderFrame(); + + // Clean up sync objects before drawing + + // INTEL driver workaround. We can't delete the previous render sync object until we are + // sure that the presentation is done + if (frame->present_fence) { + glClientWaitSync(frame->present_fence, 0, GL_TIMEOUT_IGNORED); + } + + // delete the draw fence if the frame wasn't presented + if (frame->render_fence) { + glDeleteSync(frame->render_fence); + frame->render_fence = 0; + } + + // wait for the presentation to be done + if (frame->present_fence) { + glWaitSync(frame->present_fence, 0, GL_TIMEOUT_IGNORED); + glDeleteSync(frame->present_fence); + frame->present_fence = 0; + } + } + { + MICROPROFILE_SCOPE(OpenGL_RenderFrame); + const auto& layout = render_window.GetFramebufferLayout(); + + // Recreate the frame if the size of the window has changed + if (layout.width != frame->width || layout.height != frame->height || + screen_info.display_srgb != frame->is_srgb) { + LOG_DEBUG(Render_OpenGL, "Reloading render frame"); + frame->is_srgb = screen_info.display_srgb; + frame_mailbox->ReloadRenderFrame(frame, layout.width, layout.height); + } + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, frame->render.handle); + DrawScreen(layout); + // Create a fence for the frontend to wait on and swap this frame to OffTex + frame->render_fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); + glFlush(); + frame_mailbox->ReleaseRenderFrame(frame); + m_current_frame++; + rasterizer->TickFrame(); + } +} + +void RendererOpenGL::PrepareRendertarget(const Tegra::FramebufferConfig* framebuffer) { if (framebuffer) { // If framebuffer is provided, reload it from memory to a texture if (screen_info.texture.width != static_cast<GLsizei>(framebuffer->width) || 
screen_info.texture.height != static_cast<GLsizei>(framebuffer->height) || - screen_info.texture.pixel_format != framebuffer->pixel_format) { + screen_info.texture.pixel_format != framebuffer->pixel_format || + gl_framebuffer_data.empty()) { // Reallocate texture if the framebuffer size has changed. // This is expected to not happen very often and hence should not be a // performance problem. @@ -181,22 +430,7 @@ void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { // Load the framebuffer from memory, draw it to the screen, and swap buffers LoadFBToScreenInfo(*framebuffer); - - if (renderer_settings.screenshot_requested) - CaptureScreenshot(); - - DrawScreen(render_window.GetFramebufferLayout()); - - rasterizer->TickFrame(); - - render_window.SwapBuffers(); } - - render_window.PollEvents(); - - // Restore the rasterizer state - prev_state.AllDirty(); - prev_state.Apply(); } void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuffer) { @@ -249,31 +483,24 @@ void RendererOpenGL::InitOpenGLObjects() { glClearColor(Settings::values.bg_red, Settings::values.bg_green, Settings::values.bg_blue, 0.0f); - // Link shaders and get variable locations - shader.CreateFromSource(vertex_shader, nullptr, fragment_shader); - state.draw.shader_program = shader.handle; - state.AllDirty(); - state.Apply(); + // Create shader programs + OGLShader vertex_shader; + vertex_shader.Create(VERTEX_SHADER, GL_VERTEX_SHADER); + + OGLShader fragment_shader; + fragment_shader.Create(FRAGMENT_SHADER, GL_FRAGMENT_SHADER); + + vertex_program.Create(true, false, vertex_shader.handle); + fragment_program.Create(true, false, fragment_shader.handle); + + // Create program pipeline + program_manager.Create(); // Generate VBO handle for drawing vertex_buffer.Create(); - // Generate VAO - vertex_array.Create(); - state.draw.vertex_array = vertex_array.handle; - // Attach vertex data to VAO glNamedBufferData(vertex_buffer.handle, sizeof(ScreenRectVertex) * 
4, nullptr, GL_STREAM_DRAW); - glVertexArrayAttribFormat(vertex_array.handle, PositionLocation, 2, GL_FLOAT, GL_FALSE, - offsetof(ScreenRectVertex, position)); - glVertexArrayAttribFormat(vertex_array.handle, TexCoordLocation, 2, GL_FLOAT, GL_FALSE, - offsetof(ScreenRectVertex, tex_coord)); - glVertexArrayAttribBinding(vertex_array.handle, PositionLocation, 0); - glVertexArrayAttribBinding(vertex_array.handle, TexCoordLocation, 0); - glEnableVertexArrayAttrib(vertex_array.handle, PositionLocation); - glEnableVertexArrayAttrib(vertex_array.handle, TexCoordLocation); - glVertexArrayVertexBuffer(vertex_array.handle, 0, vertex_buffer.handle, 0, - sizeof(ScreenRectVertex)); // Allocate textures for the screen screen_info.texture.resource.Create(GL_TEXTURE_2D); @@ -306,7 +533,8 @@ void RendererOpenGL::CreateRasterizer() { if (rasterizer) { return; } - rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, screen_info); + rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, screen_info, + program_manager, state_tracker); } void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture, @@ -345,8 +573,19 @@ void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture, glTextureStorage2D(texture.resource.handle, 1, internal_format, texture.width, texture.height); } -void RendererOpenGL::DrawScreenTriangles(const ScreenInfo& screen_info, float x, float y, float w, - float h) { +void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { + if (renderer_settings.set_background_color) { + // Update background color before drawing + glClearColor(Settings::values.bg_red, Settings::values.bg_green, Settings::values.bg_blue, + 0.0f); + } + + // Set projection matrix + const std::array ortho_matrix = + MakeOrthographicMatrix(static_cast<float>(layout.width), static_cast<float>(layout.height)); + glProgramUniformMatrix3x2fv(vertex_program.handle, ModelViewMatrixLocation, 1, GL_FALSE, + std::data(ortho_matrix)); + const auto& 
texcoords = screen_info.display_texcoords; auto left = texcoords.left; auto right = texcoords.right; @@ -378,60 +617,129 @@ void RendererOpenGL::DrawScreenTriangles(const ScreenInfo& screen_info, float x, static_cast<f32>(screen_info.texture.height); } + const auto& screen = layout.screen; const std::array vertices = { - ScreenRectVertex(x, y, texcoords.top * scale_u, left * scale_v), - ScreenRectVertex(x + w, y, texcoords.bottom * scale_u, left * scale_v), - ScreenRectVertex(x, y + h, texcoords.top * scale_u, right * scale_v), - ScreenRectVertex(x + w, y + h, texcoords.bottom * scale_u, right * scale_v), + ScreenRectVertex(screen.left, screen.top, texcoords.top * scale_u, left * scale_v), + ScreenRectVertex(screen.right, screen.top, texcoords.bottom * scale_u, left * scale_v), + ScreenRectVertex(screen.left, screen.bottom, texcoords.top * scale_u, right * scale_v), + ScreenRectVertex(screen.right, screen.bottom, texcoords.bottom * scale_u, right * scale_v), }; - - state.textures[0] = screen_info.display_texture; - state.framebuffer_srgb.enabled = screen_info.display_srgb; - state.AllDirty(); - state.Apply(); glNamedBufferSubData(vertex_buffer.handle, 0, sizeof(vertices), std::data(vertices)); - glDrawArrays(GL_TRIANGLE_STRIP, 0, 4); - // Restore default state - state.framebuffer_srgb.enabled = false; - state.textures[0] = 0; - state.AllDirty(); - state.Apply(); -} -void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { - if (renderer_settings.set_background_color) { - // Update background color before drawing - glClearColor(Settings::values.bg_red, Settings::values.bg_green, Settings::values.bg_blue, - 0.0f); + // TODO: Signal state tracker about these changes + state_tracker.NotifyScreenDrawVertexArray(); + state_tracker.NotifyPolygonModes(); + state_tracker.NotifyViewport0(); + state_tracker.NotifyScissor0(); + state_tracker.NotifyColorMask0(); + state_tracker.NotifyBlend0(); + state_tracker.NotifyFramebuffer(); + 
state_tracker.NotifyFrontFace(); + state_tracker.NotifyCullTest(); + state_tracker.NotifyDepthTest(); + state_tracker.NotifyStencilTest(); + state_tracker.NotifyPolygonOffset(); + state_tracker.NotifyRasterizeEnable(); + state_tracker.NotifyFramebufferSRGB(); + state_tracker.NotifyLogicOp(); + state_tracker.NotifyClipControl(); + state_tracker.NotifyAlphaTest(); + + program_manager.UseVertexShader(vertex_program.handle); + program_manager.UseGeometryShader(0); + program_manager.UseFragmentShader(fragment_program.handle); + program_manager.BindGraphicsPipeline(); + + glEnable(GL_CULL_FACE); + if (screen_info.display_srgb) { + glEnable(GL_FRAMEBUFFER_SRGB); + } else { + glDisable(GL_FRAMEBUFFER_SRGB); } + glDisable(GL_COLOR_LOGIC_OP); + glDisable(GL_DEPTH_TEST); + glDisable(GL_STENCIL_TEST); + glDisable(GL_POLYGON_OFFSET_FILL); + glDisable(GL_RASTERIZER_DISCARD); + glDisable(GL_ALPHA_TEST); + glDisablei(GL_BLEND, 0); + glDisablei(GL_SCISSOR_TEST, 0); + glPolygonMode(GL_FRONT_AND_BACK, GL_FILL); + glCullFace(GL_BACK); + glFrontFace(GL_CW); + glColorMaski(0, GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); + glClipControl(GL_LOWER_LEFT, GL_ZERO_TO_ONE); + glViewportIndexedf(0, 0.0f, 0.0f, static_cast<GLfloat>(layout.width), + static_cast<GLfloat>(layout.height)); + glDepthRangeIndexed(0, 0.0, 0.0); + + glEnableVertexAttribArray(PositionLocation); + glEnableVertexAttribArray(TexCoordLocation); + glVertexAttribDivisor(PositionLocation, 0); + glVertexAttribDivisor(TexCoordLocation, 0); + glVertexAttribFormat(PositionLocation, 2, GL_FLOAT, GL_FALSE, + offsetof(ScreenRectVertex, position)); + glVertexAttribFormat(TexCoordLocation, 2, GL_FLOAT, GL_FALSE, + offsetof(ScreenRectVertex, tex_coord)); + glVertexAttribBinding(PositionLocation, 0); + glVertexAttribBinding(TexCoordLocation, 0); + glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex)); + + glBindTextureUnit(0, screen_info.display_texture); + glBindSampler(0, 0); - const auto& screen = layout.screen; - - 
glViewport(0, 0, layout.width, layout.height); glClear(GL_COLOR_BUFFER_BIT); + glDrawArrays(GL_TRIANGLE_STRIP, 0, 4); +} - // Set projection matrix - const std::array ortho_matrix = - MakeOrthographicMatrix(static_cast<float>(layout.width), static_cast<float>(layout.height)); - glUniformMatrix3x2fv(ModelViewMatrixLocation, 1, GL_FALSE, ortho_matrix.data()); +void RendererOpenGL::TryPresent(int timeout_ms) { + const auto& layout = render_window.GetFramebufferLayout(); + auto frame = frame_mailbox->TryGetPresentFrame(timeout_ms); + if (!frame) { + LOG_DEBUG(Render_OpenGL, "TryGetPresentFrame returned no frame to present"); + return; + } - DrawScreenTriangles(screen_info, static_cast<float>(screen.left), - static_cast<float>(screen.top), static_cast<float>(screen.GetWidth()), - static_cast<float>(screen.GetHeight())); + // Clearing before a full overwrite of a fbo can signal to drivers that they can avoid a + // readback since we won't be doing any blending + glClear(GL_COLOR_BUFFER_BIT); - m_current_frame++; + // Recreate the presentation FBO if the color attachment was changed + if (frame->color_reloaded) { + LOG_DEBUG(Render_OpenGL, "Reloading present frame"); + frame_mailbox->ReloadPresentFrame(frame, layout.width, layout.height); + } + glWaitSync(frame->render_fence, 0, GL_TIMEOUT_IGNORED); + // INTEL workaround. 
+ // Normally we could just delete the draw fence here, but due to driver bugs, we can just delete + // it on the emulation thread without too much penalty + // glDeleteSync(frame.render_sync); + // frame.render_sync = 0; + + glBindFramebuffer(GL_READ_FRAMEBUFFER, frame->present.handle); + glBlitFramebuffer(0, 0, frame->width, frame->height, 0, 0, layout.width, layout.height, + GL_COLOR_BUFFER_BIT, GL_LINEAR); + + // Insert fence for the main thread to block on + frame->present_fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); + glFlush(); + + glBindFramebuffer(GL_READ_FRAMEBUFFER, 0); } -void RendererOpenGL::UpdateFramerate() {} +void RendererOpenGL::RenderScreenshot() { + if (!renderer_settings.screenshot_requested) { + return; + } + + GLint old_read_fb; + GLint old_draw_fb; + glGetIntegerv(GL_READ_FRAMEBUFFER_BINDING, &old_read_fb); + glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &old_draw_fb); -void RendererOpenGL::CaptureScreenshot() { // Draw the current frame to the screenshot framebuffer screenshot_framebuffer.Create(); - GLuint old_read_fb = state.draw.read_framebuffer; - GLuint old_draw_fb = state.draw.draw_framebuffer; - state.draw.read_framebuffer = state.draw.draw_framebuffer = screenshot_framebuffer.handle; - state.AllDirty(); - state.Apply(); + glBindFramebuffer(GL_FRAMEBUFFER, screenshot_framebuffer.handle); Layout::FramebufferLayout layout{renderer_settings.screenshot_framebuffer_layout}; @@ -448,19 +756,16 @@ void RendererOpenGL::CaptureScreenshot() { renderer_settings.screenshot_bits); screenshot_framebuffer.Release(); - state.draw.read_framebuffer = old_read_fb; - state.draw.draw_framebuffer = old_draw_fb; - state.AllDirty(); - state.Apply(); glDeleteRenderbuffers(1, &renderbuffer); + glBindFramebuffer(GL_READ_FRAMEBUFFER, old_read_fb); + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, old_draw_fb); + renderer_settings.screenshot_complete_callback(); renderer_settings.screenshot_requested = false; } bool RendererOpenGL::Init() { - 
Core::Frontend::ScopeAcquireWindowContext acquire_context{render_window}; - if (GLAD_GL_KHR_debug) { glEnable(GL_DEBUG_OUTPUT); glDebugMessageCallback(DebugHandler, nullptr); diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h index b56328a7f..33073ce5b 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.h +++ b/src/video_core/renderer_opengl/renderer_opengl.h @@ -10,7 +10,8 @@ #include "common/math_util.h" #include "video_core/renderer_base.h" #include "video_core/renderer_opengl/gl_resource_manager.h" -#include "video_core/renderer_opengl/gl_state.h" +#include "video_core/renderer_opengl/gl_shader_manager.h" +#include "video_core/renderer_opengl/gl_state_tracker.h" namespace Core { class System; @@ -44,19 +45,23 @@ struct ScreenInfo { TextureInfo texture; }; +struct PresentationTexture { + u32 width = 0; + u32 height = 0; + OGLTexture texture; +}; + +class FrameMailbox; + class RendererOpenGL final : public VideoCore::RendererBase { public: explicit RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system); ~RendererOpenGL() override; - /// Swap buffers (render frame) - void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override; - - /// Initialize the renderer bool Init() override; - - /// Shutdown the renderer void ShutDown() override; + void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override; + void TryPresent(int timeout_ms) override; private: /// Initializes the OpenGL state and creates persistent objects. @@ -72,12 +77,7 @@ private: /// Draws the emulated screens to the emulator window. void DrawScreen(const Layout::FramebufferLayout& layout); - void DrawScreenTriangles(const ScreenInfo& screen_info, float x, float y, float w, float h); - - /// Updates the framerate. - void UpdateFramerate(); - - void CaptureScreenshot(); + void RenderScreenshot(); /// Loads framebuffer from emulated memory into the active OpenGL texture. 
void LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuffer); @@ -87,26 +87,34 @@ private: void LoadColorToActiveGLTexture(u8 color_r, u8 color_g, u8 color_b, u8 color_a, const TextureInfo& texture); + void PrepareRendertarget(const Tegra::FramebufferConfig* framebuffer); + Core::Frontend::EmuWindow& emu_window; Core::System& system; - OpenGLState state; + StateTracker state_tracker{system}; // OpenGL object IDs - OGLVertexArray vertex_array; OGLBuffer vertex_buffer; - OGLProgram shader; + OGLProgram vertex_program; + OGLProgram fragment_program; OGLFramebuffer screenshot_framebuffer; /// Display information for Switch screen ScreenInfo screen_info; + /// Global dummy shader pipeline + GLShader::ProgramManager program_manager; + /// OpenGL framebuffer data std::vector<u8> gl_framebuffer_data; /// Used for transforming the framebuffer orientation Tegra::FramebufferConfig::TransformFlags framebuffer_transform_flags; Common::Rectangle<int> framebuffer_crop_rect; + + /// Frame presentation mailbox + std::unique_ptr<FrameMailbox> frame_mailbox; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/utils.cpp b/src/video_core/renderer_opengl/utils.cpp index ac99e6385..b751086fa 100644 --- a/src/video_core/renderer_opengl/utils.cpp +++ b/src/video_core/renderer_opengl/utils.cpp @@ -9,6 +9,7 @@ #include <glad/glad.h> #include "common/common_types.h" +#include "video_core/renderer_opengl/gl_state_tracker.h" #include "video_core/renderer_opengl/utils.h" namespace OpenGL { @@ -20,12 +21,12 @@ struct VertexArrayPushBuffer::Entry { GLsizei stride{}; }; -VertexArrayPushBuffer::VertexArrayPushBuffer() = default; +VertexArrayPushBuffer::VertexArrayPushBuffer(StateTracker& state_tracker) + : state_tracker{state_tracker} {} VertexArrayPushBuffer::~VertexArrayPushBuffer() = default; -void VertexArrayPushBuffer::Setup(GLuint vao_) { - vao = vao_; +void VertexArrayPushBuffer::Setup() { index_buffer = nullptr; vertex_buffers.clear(); } @@ -41,13 +42,11 @@ void 
VertexArrayPushBuffer::SetVertexBuffer(GLuint binding_index, const GLuint* void VertexArrayPushBuffer::Bind() { if (index_buffer) { - glVertexArrayElementBuffer(vao, *index_buffer); + state_tracker.BindIndexBuffer(*index_buffer); } - // TODO(Rodrigo): Find a way to ARB_multi_bind this for (const auto& entry : vertex_buffers) { - glVertexArrayVertexBuffer(vao, entry.binding_index, *entry.buffer, entry.offset, - entry.stride); + glBindVertexBuffer(entry.binding_index, *entry.buffer, entry.offset, entry.stride); } } diff --git a/src/video_core/renderer_opengl/utils.h b/src/video_core/renderer_opengl/utils.h index 3ad7c02d4..47ee3177b 100644 --- a/src/video_core/renderer_opengl/utils.h +++ b/src/video_core/renderer_opengl/utils.h @@ -11,12 +11,14 @@ namespace OpenGL { +class StateTracker; + class VertexArrayPushBuffer final { public: - explicit VertexArrayPushBuffer(); + explicit VertexArrayPushBuffer(StateTracker& state_tracker); ~VertexArrayPushBuffer(); - void Setup(GLuint vao_); + void Setup(); void SetIndexBuffer(const GLuint* buffer); @@ -28,7 +30,8 @@ public: private: struct Entry; - GLuint vao{}; + StateTracker& state_tracker; + const GLuint* index_buffer{}; std::vector<Entry> vertex_buffers; }; diff --git a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp index 4e3ff231e..2bb376555 100644 --- a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp +++ b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp @@ -112,19 +112,18 @@ constexpr FixedPipelineState::Rasterizer GetRasterizerState(const Maxwell& regs) const auto& clip = regs.view_volume_clip_control; const bool depth_clamp_enabled = clip.depth_clamp_near == 1 || clip.depth_clamp_far == 1; - Maxwell::Cull::FrontFace front_face = regs.cull.front_face; + Maxwell::FrontFace front_face = regs.front_face; if (regs.screen_y_control.triangle_rast_flip != 0 && regs.viewport_transform[0].scale_y > 0.0f) { - if (front_face == 
Maxwell::Cull::FrontFace::CounterClockWise) - front_face = Maxwell::Cull::FrontFace::ClockWise; - else if (front_face == Maxwell::Cull::FrontFace::ClockWise) - front_face = Maxwell::Cull::FrontFace::CounterClockWise; + if (front_face == Maxwell::FrontFace::CounterClockWise) + front_face = Maxwell::FrontFace::ClockWise; + else if (front_face == Maxwell::FrontFace::ClockWise) + front_face = Maxwell::FrontFace::CounterClockWise; } const bool gl_ndc = regs.depth_mode == Maxwell::DepthMode::MinusOneToOne; - return FixedPipelineState::Rasterizer(regs.cull.enabled, depth_bias_enabled, - depth_clamp_enabled, gl_ndc, regs.cull.cull_face, - front_face); + return FixedPipelineState::Rasterizer(regs.cull_test_enabled, depth_bias_enabled, + depth_clamp_enabled, gl_ndc, regs.cull_face, front_face); } } // Anonymous namespace diff --git a/src/video_core/renderer_vulkan/fixed_pipeline_state.h b/src/video_core/renderer_vulkan/fixed_pipeline_state.h index 87056ef37..4c8ba7f90 100644 --- a/src/video_core/renderer_vulkan/fixed_pipeline_state.h +++ b/src/video_core/renderer_vulkan/fixed_pipeline_state.h @@ -171,8 +171,8 @@ struct FixedPipelineState { struct Rasterizer { constexpr Rasterizer(bool cull_enable, bool depth_bias_enable, bool depth_clamp_enable, - bool ndc_minus_one_to_one, Maxwell::Cull::CullFace cull_face, - Maxwell::Cull::FrontFace front_face) + bool ndc_minus_one_to_one, Maxwell::CullFace cull_face, + Maxwell::FrontFace front_face) : cull_enable{cull_enable}, depth_bias_enable{depth_bias_enable}, depth_clamp_enable{depth_clamp_enable}, ndc_minus_one_to_one{ndc_minus_one_to_one}, cull_face{cull_face}, front_face{front_face} {} @@ -182,8 +182,8 @@ struct FixedPipelineState { bool depth_bias_enable; bool depth_clamp_enable; bool ndc_minus_one_to_one; - Maxwell::Cull::CullFace cull_face; - Maxwell::Cull::FrontFace front_face; + Maxwell::CullFace cull_face; + Maxwell::FrontFace front_face; std::size_t Hash() const noexcept; diff --git 
a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index 5403c3ab7..f93447610 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -120,11 +120,12 @@ struct FormatTuple { {vk::Format::eA8B8G8R8UintPack32, Attachable | Storage}, // ABGR8UI {vk::Format::eB5G6R5UnormPack16, {}}, // B5G6R5U {vk::Format::eA2B10G10R10UnormPack32, Attachable | Storage}, // A2B10G10R10U - {vk::Format::eA1R5G5B5UnormPack16, Attachable | Storage}, // A1B5G5R5U (flipped with swizzle) + {vk::Format::eA1R5G5B5UnormPack16, Attachable}, // A1B5G5R5U (flipped with swizzle) {vk::Format::eR8Unorm, Attachable | Storage}, // R8U {vk::Format::eR8Uint, Attachable | Storage}, // R8UI {vk::Format::eR16G16B16A16Sfloat, Attachable | Storage}, // RGBA16F {vk::Format::eR16G16B16A16Unorm, Attachable | Storage}, // RGBA16U + {vk::Format::eR16G16B16A16Snorm, Attachable | Storage}, // RGBA16S {vk::Format::eR16G16B16A16Uint, Attachable | Storage}, // RGBA16UI {vk::Format::eB10G11R11UfloatPack32, Attachable | Storage}, // R11FG11FB10F {vk::Format::eR32G32B32A32Uint, Attachable | Storage}, // RGBA32UI @@ -159,6 +160,7 @@ struct FormatTuple { {vk::Format::eR32G32Uint, Attachable | Storage}, // RG32UI {vk::Format::eUndefined, {}}, // RGBX16F {vk::Format::eR32Uint, Attachable | Storage}, // R32UI + {vk::Format::eR32Sint, Attachable | Storage}, // R32I {vk::Format::eAstc8x8UnormBlock, {}}, // ASTC_2D_8X8 {vk::Format::eUndefined, {}}, // ASTC_2D_8X5 {vk::Format::eUndefined, {}}, // ASTC_2D_5X4 @@ -255,6 +257,8 @@ vk::ShaderStageFlagBits ShaderStage(Tegra::Engines::ShaderType stage) { return vk::ShaderStageFlagBits::eGeometry; case Tegra::Engines::ShaderType::Fragment: return vk::ShaderStageFlagBits::eFragment; + case Tegra::Engines::ShaderType::Compute: + return vk::ShaderStageFlagBits::eCompute; } UNIMPLEMENTED_MSG("Unimplemented shader stage={}", static_cast<u32>(stage)); return {}; @@ -330,6 +334,8 @@ 
vk::Format VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttr return vk::Format::eR16G16B16Unorm; case Maxwell::VertexAttribute::Size::Size_16_16_16_16: return vk::Format::eR16G16B16A16Unorm; + case Maxwell::VertexAttribute::Size::Size_10_10_10_2: + return vk::Format::eA2B10G10R10UnormPack32; default: break; } @@ -363,6 +369,10 @@ vk::Format VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttr return vk::Format::eR8G8B8A8Uint; case Maxwell::VertexAttribute::Size::Size_32: return vk::Format::eR32Uint; + case Maxwell::VertexAttribute::Size::Size_32_32: + return vk::Format::eR32G32Uint; + case Maxwell::VertexAttribute::Size::Size_32_32_32: + return vk::Format::eR32G32B32Uint; case Maxwell::VertexAttribute::Size::Size_32_32_32_32: return vk::Format::eR32G32B32A32Uint; default: @@ -370,8 +380,22 @@ vk::Format VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttr } case Maxwell::VertexAttribute::Type::UnsignedScaled: switch (size) { + case Maxwell::VertexAttribute::Size::Size_8: + return vk::Format::eR8Uscaled; case Maxwell::VertexAttribute::Size::Size_8_8: return vk::Format::eR8G8Uscaled; + case Maxwell::VertexAttribute::Size::Size_8_8_8: + return vk::Format::eR8G8B8Uscaled; + case Maxwell::VertexAttribute::Size::Size_8_8_8_8: + return vk::Format::eR8G8B8A8Uscaled; + case Maxwell::VertexAttribute::Size::Size_16: + return vk::Format::eR16Uscaled; + case Maxwell::VertexAttribute::Size::Size_16_16: + return vk::Format::eR16G16Uscaled; + case Maxwell::VertexAttribute::Size::Size_16_16_16: + return vk::Format::eR16G16B16Uscaled; + case Maxwell::VertexAttribute::Size::Size_16_16_16_16: + return vk::Format::eR16G16B16A16Uscaled; default: break; } @@ -571,24 +595,24 @@ vk::BlendFactor BlendFactor(Maxwell::Blend::Factor factor) { return {}; } -vk::FrontFace FrontFace(Maxwell::Cull::FrontFace front_face) { +vk::FrontFace FrontFace(Maxwell::FrontFace front_face) { switch (front_face) { - case Maxwell::Cull::FrontFace::ClockWise: 
+ case Maxwell::FrontFace::ClockWise: return vk::FrontFace::eClockwise; - case Maxwell::Cull::FrontFace::CounterClockWise: + case Maxwell::FrontFace::CounterClockWise: return vk::FrontFace::eCounterClockwise; } UNIMPLEMENTED_MSG("Unimplemented front face={}", static_cast<u32>(front_face)); return {}; } -vk::CullModeFlags CullFace(Maxwell::Cull::CullFace cull_face) { +vk::CullModeFlags CullFace(Maxwell::CullFace cull_face) { switch (cull_face) { - case Maxwell::Cull::CullFace::Front: + case Maxwell::CullFace::Front: return vk::CullModeFlagBits::eFront; - case Maxwell::Cull::CullFace::Back: + case Maxwell::CullFace::Back: return vk::CullModeFlagBits::eBack; - case Maxwell::Cull::CullFace::FrontAndBack: + case Maxwell::CullFace::FrontAndBack: return vk::CullModeFlagBits::eFrontAndBack; } UNIMPLEMENTED_MSG("Unimplemented cull face={}", static_cast<u32>(cull_face)); diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.h b/src/video_core/renderer_vulkan/maxwell_to_vk.h index 7e9678b7b..24f6ab544 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.h +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.h @@ -54,9 +54,9 @@ vk::BlendOp BlendEquation(Maxwell::Blend::Equation equation); vk::BlendFactor BlendFactor(Maxwell::Blend::Factor factor); -vk::FrontFace FrontFace(Maxwell::Cull::FrontFace front_face); +vk::FrontFace FrontFace(Maxwell::FrontFace front_face); -vk::CullModeFlags CullFace(Maxwell::Cull::CullFace cull_face); +vk::CullModeFlags CullFace(Maxwell::CullFace cull_face); vk::ComponentSwizzle SwizzleSource(Tegra::Texture::SwizzleSource swizzle); diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index d5032b432..42bb01418 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -27,6 +27,7 @@ #include "video_core/renderer_vulkan/vk_rasterizer.h" #include "video_core/renderer_vulkan/vk_resource_manager.h" #include 
"video_core/renderer_vulkan/vk_scheduler.h" +#include "video_core/renderer_vulkan/vk_state_tracker.h" #include "video_core/renderer_vulkan/vk_swapchain.h" namespace Vulkan { @@ -106,8 +107,14 @@ RendererVulkan::~RendererVulkan() { } void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { + render_window.PollEvents(); + + if (!framebuffer) { + return; + } + const auto& layout = render_window.GetFramebufferLayout(); - if (framebuffer && layout.width > 0 && layout.height > 0 && render_window.IsShown()) { + if (layout.width > 0 && layout.height > 0 && render_window.IsShown()) { const VAddr framebuffer_addr = framebuffer->address + framebuffer->offset; const bool use_accelerated = rasterizer->AccelerateDisplay(*framebuffer, framebuffer_addr, framebuffer->stride); @@ -128,13 +135,16 @@ void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { blit_screen->Recreate(); } - render_window.SwapBuffers(); rasterizer->TickFrame(); } render_window.PollEvents(); } +void RendererVulkan::TryPresent(int /*timeout_ms*/) { + // TODO (bunnei): ImplementMe +} + bool RendererVulkan::Init() { PFN_vkGetInstanceProcAddr vkGetInstanceProcAddr{}; render_window.RetrieveVulkanHandlers(&vkGetInstanceProcAddr, &instance, &surface); @@ -168,10 +178,13 @@ bool RendererVulkan::Init() { swapchain = std::make_unique<VKSwapchain>(surface, *device); swapchain->Create(framebuffer.width, framebuffer.height, false); - scheduler = std::make_unique<VKScheduler>(*device, *resource_manager); + state_tracker = std::make_unique<StateTracker>(system); + + scheduler = std::make_unique<VKScheduler>(*device, *resource_manager, *state_tracker); rasterizer = std::make_unique<RasterizerVulkan>(system, render_window, screen_info, *device, - *resource_manager, *memory_manager, *scheduler); + *resource_manager, *memory_manager, + *state_tracker, *scheduler); blit_screen = std::make_unique<VKBlitScreen>(system, render_window, *rasterizer, *device, *resource_manager, 
*memory_manager, *swapchain, @@ -262,4 +275,4 @@ void RendererVulkan::Report() const { telemetry_session.AddField(field, "GPU_Vulkan_Extensions", extensions); } -} // namespace Vulkan
\ No newline at end of file +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.h b/src/video_core/renderer_vulkan/renderer_vulkan.h index a472c5dc9..3da08d2e4 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.h +++ b/src/video_core/renderer_vulkan/renderer_vulkan.h @@ -4,8 +4,10 @@ #pragma once +#include <memory> #include <optional> #include <vector> + #include "video_core/renderer_base.h" #include "video_core/renderer_vulkan/declarations.h" @@ -15,6 +17,7 @@ class System; namespace Vulkan { +class StateTracker; class VKBlitScreen; class VKDevice; class VKFence; @@ -36,14 +39,10 @@ public: explicit RendererVulkan(Core::Frontend::EmuWindow& window, Core::System& system); ~RendererVulkan() override; - /// Swap buffers (render frame) - void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override; - - /// Initialize the renderer bool Init() override; - - /// Shutdown the renderer void ShutDown() override; + void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override; + void TryPresent(int timeout_ms) override; private: std::optional<vk::DebugUtilsMessengerEXT> CreateDebugCallback( @@ -65,6 +64,7 @@ private: std::unique_ptr<VKSwapchain> swapchain; std::unique_ptr<VKMemoryManager> memory_manager; std::unique_ptr<VKResourceManager> resource_manager; + std::unique_ptr<StateTracker> state_tracker; std::unique_ptr<VKScheduler> scheduler; std::unique_ptr<VKBlitScreen> blit_screen; }; diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp index 9d5b8de7a..60f57d83e 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -73,7 +73,7 @@ UniqueDescriptorUpdateTemplate VKComputePipeline::CreateDescriptorUpdateTemplate std::vector<vk::DescriptorUpdateTemplateEntry> template_entries; u32 binding = 0; u32 offset = 0; - FillDescriptorUpdateTemplateEntries(device, entries, 
binding, offset, template_entries); + FillDescriptorUpdateTemplateEntries(entries, binding, offset, template_entries); if (template_entries.empty()) { // If the shader doesn't use descriptor sets, skip template creation. return UniqueDescriptorUpdateTemplate{}; diff --git a/src/video_core/renderer_vulkan/vk_device.cpp b/src/video_core/renderer_vulkan/vk_device.cpp index d1da4f9d3..28d2fbc4f 100644 --- a/src/video_core/renderer_vulkan/vk_device.cpp +++ b/src/video_core/renderer_vulkan/vk_device.cpp @@ -107,8 +107,7 @@ bool VKDevice::Create(const vk::DispatchLoaderDynamic& dldi, vk::Instance instan features.occlusionQueryPrecise = true; features.fragmentStoresAndAtomics = true; features.shaderImageGatherExtended = true; - features.shaderStorageImageReadWithoutFormat = - is_shader_storage_img_read_without_format_supported; + features.shaderStorageImageReadWithoutFormat = is_formatless_image_load_supported; features.shaderStorageImageWriteWithoutFormat = true; features.textureCompressionASTC_LDR = is_optimal_astc_supported; @@ -148,6 +147,15 @@ bool VKDevice::Create(const vk::DispatchLoaderDynamic& dldi, vk::Instance instan LOG_INFO(Render_Vulkan, "Device doesn't support uint8 indexes"); } + vk::PhysicalDeviceTransformFeedbackFeaturesEXT transform_feedback; + if (ext_transform_feedback) { + transform_feedback.transformFeedback = true; + transform_feedback.geometryStreams = true; + SetNext(next, transform_feedback); + } else { + LOG_INFO(Render_Vulkan, "Device doesn't support transform feedbacks"); + } + if (!ext_depth_range_unrestricted) { LOG_INFO(Render_Vulkan, "Device doesn't support depth range unrestricted"); } @@ -385,7 +393,7 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami } }; - extensions.reserve(14); + extensions.reserve(15); extensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME); extensions.push_back(VK_KHR_16BIT_STORAGE_EXTENSION_NAME); extensions.push_back(VK_KHR_8BIT_STORAGE_EXTENSION_NAME); @@ -397,18 +405,22 @@ 
std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami [[maybe_unused]] const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED"); - bool khr_shader_float16_int8{}; - bool ext_subgroup_size_control{}; + bool has_khr_shader_float16_int8{}; + bool has_ext_subgroup_size_control{}; + bool has_ext_transform_feedback{}; for (const auto& extension : physical.enumerateDeviceExtensionProperties(nullptr, dldi)) { Test(extension, khr_uniform_buffer_standard_layout, VK_KHR_UNIFORM_BUFFER_STANDARD_LAYOUT_EXTENSION_NAME, true); - Test(extension, khr_shader_float16_int8, VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME, false); + Test(extension, has_khr_shader_float16_int8, VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME, + false); Test(extension, ext_depth_range_unrestricted, VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME, true); Test(extension, ext_index_type_uint8, VK_EXT_INDEX_TYPE_UINT8_EXTENSION_NAME, true); Test(extension, ext_shader_viewport_index_layer, VK_EXT_SHADER_VIEWPORT_INDEX_LAYER_EXTENSION_NAME, true); - Test(extension, ext_subgroup_size_control, VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME, + Test(extension, has_ext_subgroup_size_control, VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME, + false); + Test(extension, has_ext_transform_feedback, VK_EXT_TRANSFORM_FEEDBACK_EXTENSION_NAME, false); if (Settings::values.renderer_debug) { Test(extension, nv_device_diagnostic_checkpoints, @@ -416,13 +428,13 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami } } - if (khr_shader_float16_int8) { + if (has_khr_shader_float16_int8) { is_float16_supported = GetFeatures<vk::PhysicalDeviceFloat16Int8FeaturesKHR>(physical, dldi).shaderFloat16; extensions.push_back(VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME); } - if (ext_subgroup_size_control) { + if (has_ext_subgroup_size_control) { const auto features = GetFeatures<vk::PhysicalDeviceSubgroupSizeControlFeaturesEXT>(physical, dldi); const auto properties = @@ 
-439,6 +451,20 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami is_warp_potentially_bigger = true; } + if (has_ext_transform_feedback) { + const auto features = + GetFeatures<vk::PhysicalDeviceTransformFeedbackFeaturesEXT>(physical, dldi); + const auto properties = + GetProperties<vk::PhysicalDeviceTransformFeedbackPropertiesEXT>(physical, dldi); + + if (features.transformFeedback && features.geometryStreams && + properties.maxTransformFeedbackStreams >= 4 && properties.maxTransformFeedbackBuffers && + properties.transformFeedbackQueries && properties.transformFeedbackDraw) { + extensions.push_back(VK_EXT_TRANSFORM_FEEDBACK_EXTENSION_NAME); + ext_transform_feedback = true; + } + } + return extensions; } @@ -467,8 +493,7 @@ void VKDevice::SetupFamilies(const vk::DispatchLoaderDynamic& dldi, vk::SurfaceK void VKDevice::SetupFeatures(const vk::DispatchLoaderDynamic& dldi) { const auto supported_features{physical.getFeatures(dldi)}; - is_shader_storage_img_read_without_format_supported = - supported_features.shaderStorageImageReadWithoutFormat; + is_formatless_image_load_supported = supported_features.shaderStorageImageReadWithoutFormat; is_optimal_astc_supported = IsOptimalAstcSupported(supported_features, dldi); } @@ -510,6 +535,7 @@ std::unordered_map<vk::Format, vk::FormatProperties> VKDevice::GetFormatProperti vk::Format::eR32G32Sfloat, vk::Format::eR32G32Uint, vk::Format::eR16G16B16A16Uint, + vk::Format::eR16G16B16A16Snorm, vk::Format::eR16G16B16A16Unorm, vk::Format::eR16G16Unorm, vk::Format::eR16G16Snorm, @@ -523,6 +549,7 @@ std::unordered_map<vk::Format, vk::FormatProperties> VKDevice::GetFormatProperti vk::Format::eB10G11R11UfloatPack32, vk::Format::eR32Sfloat, vk::Format::eR32Uint, + vk::Format::eR32Sint, vk::Format::eR16Sfloat, vk::Format::eR16G16B16A16Sfloat, vk::Format::eB8G8R8A8Unorm, diff --git a/src/video_core/renderer_vulkan/vk_device.h b/src/video_core/renderer_vulkan/vk_device.h index 2c27ad730..6e656517f 100644 --- 
a/src/video_core/renderer_vulkan/vk_device.h +++ b/src/video_core/renderer_vulkan/vk_device.h @@ -122,11 +122,6 @@ public: return properties.limits.maxPushConstantsSize; } - /// Returns true if Shader storage Image Read Without Format supported. - bool IsShaderStorageImageReadWithoutFormatSupported() const { - return is_shader_storage_img_read_without_format_supported; - } - /// Returns true if ASTC is natively supported. bool IsOptimalAstcSupported() const { return is_optimal_astc_supported; @@ -147,6 +142,11 @@ public: return (guest_warp_stages & stage) != vk::ShaderStageFlags{}; } + /// Returns true if formatless image load is supported. + bool IsFormatlessImageLoadSupported() const { + return is_formatless_image_load_supported; + } + /// Returns true if the device supports VK_EXT_scalar_block_layout. bool IsKhrUniformBufferStandardLayoutSupported() const { return khr_uniform_buffer_standard_layout; @@ -167,6 +167,11 @@ public: return ext_shader_viewport_index_layer; } + /// Returns true if the device supports VK_EXT_transform_feedback. + bool IsExtTransformFeedbackSupported() const { + return ext_transform_feedback; + } + /// Returns true if the device supports VK_NV_device_diagnostic_checkpoints. bool IsNvDeviceDiagnosticCheckpoints() const { return nv_device_diagnostic_checkpoints; @@ -214,26 +219,26 @@ private: static std::unordered_map<vk::Format, vk::FormatProperties> GetFormatProperties( const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDevice physical); - const vk::PhysicalDevice physical; ///< Physical device. - vk::DispatchLoaderDynamic dld; ///< Device function pointers. - vk::PhysicalDeviceProperties properties; ///< Device properties. - UniqueDevice logical; ///< Logical device. - vk::Queue graphics_queue; ///< Main graphics queue. - vk::Queue present_queue; ///< Main present queue. - u32 graphics_family{}; ///< Main graphics queue family index. - u32 present_family{}; ///< Main present queue family index. 
- vk::DriverIdKHR driver_id{}; ///< Driver ID. - vk::ShaderStageFlags guest_warp_stages{}; ///< Stages where the guest warp size can be forced. - bool is_optimal_astc_supported{}; ///< Support for native ASTC. - bool is_float16_supported{}; ///< Support for float16 arithmetics. - bool is_warp_potentially_bigger{}; ///< Host warp size can be bigger than guest. + const vk::PhysicalDevice physical; ///< Physical device. + vk::DispatchLoaderDynamic dld; ///< Device function pointers. + vk::PhysicalDeviceProperties properties; ///< Device properties. + UniqueDevice logical; ///< Logical device. + vk::Queue graphics_queue; ///< Main graphics queue. + vk::Queue present_queue; ///< Main present queue. + u32 graphics_family{}; ///< Main graphics queue family index. + u32 present_family{}; ///< Main present queue family index. + vk::DriverIdKHR driver_id{}; ///< Driver ID. + vk::ShaderStageFlags guest_warp_stages{}; ///< Stages where the guest warp size can be forced. + bool is_optimal_astc_supported{}; ///< Support for native ASTC. + bool is_float16_supported{}; ///< Support for float16 arithmetics. + bool is_warp_potentially_bigger{}; ///< Host warp size can be bigger than guest. + bool is_formatless_image_load_supported{}; ///< Support for shader image read without format. bool khr_uniform_buffer_standard_layout{}; ///< Support for std430 on UBOs. bool ext_index_type_uint8{}; ///< Support for VK_EXT_index_type_uint8. bool ext_depth_range_unrestricted{}; ///< Support for VK_EXT_depth_range_unrestricted. bool ext_shader_viewport_index_layer{}; ///< Support for VK_EXT_shader_viewport_index_layer. + bool ext_transform_feedback{}; ///< Support for VK_EXT_transform_feedback. bool nv_device_diagnostic_checkpoints{}; ///< Support for VK_NV_device_diagnostic_checkpoints. - bool is_shader_storage_img_read_without_format_supported{}; ///< Support for shader storage - ///< image read without format // Telemetry parameters std::string vendor_name; ///< Device's driver name. 
diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index b155dfb49..6a02403c1 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -97,8 +97,7 @@ UniqueDescriptorUpdateTemplate VKGraphicsPipeline::CreateDescriptorUpdateTemplat u32 offset = 0; for (const auto& stage : program) { if (stage) { - FillDescriptorUpdateTemplateEntries(device, stage->entries, binding, offset, - template_entries); + FillDescriptorUpdateTemplateEntries(stage->entries, binding, offset, template_entries); } } if (template_entries.empty()) { diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 7ddf7d3ee..557b9d662 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -36,6 +36,13 @@ using Tegra::Engines::ShaderType; namespace { +// C++20's using enum +constexpr auto eUniformBuffer = vk::DescriptorType::eUniformBuffer; +constexpr auto eStorageBuffer = vk::DescriptorType::eStorageBuffer; +constexpr auto eUniformTexelBuffer = vk::DescriptorType::eUniformTexelBuffer; +constexpr auto eCombinedImageSampler = vk::DescriptorType::eCombinedImageSampler; +constexpr auto eStorageImage = vk::DescriptorType::eStorageImage; + constexpr VideoCommon::Shader::CompilerSettings compiler_settings{ VideoCommon::Shader::CompileDepth::FullDecompile}; @@ -119,23 +126,32 @@ ShaderType GetShaderType(Maxwell::ShaderProgram program) { } } +template <vk::DescriptorType descriptor_type, class Container> +void AddBindings(std::vector<vk::DescriptorSetLayoutBinding>& bindings, u32& binding, + vk::ShaderStageFlags stage_flags, const Container& container) { + const u32 num_entries = static_cast<u32>(std::size(container)); + for (std::size_t i = 0; i < num_entries; ++i) { + u32 count = 1; + if constexpr (descriptor_type == 
eCombinedImageSampler) { + // Combined image samplers can be arrayed. + count = container[i].Size(); + } + bindings.emplace_back(binding++, descriptor_type, count, stage_flags, nullptr); + } +} + u32 FillDescriptorLayout(const ShaderEntries& entries, std::vector<vk::DescriptorSetLayoutBinding>& bindings, Maxwell::ShaderProgram program_type, u32 base_binding) { const ShaderType stage = GetStageFromProgram(program_type); - const vk::ShaderStageFlags stage_flags = MaxwellToVK::ShaderStage(stage); + const vk::ShaderStageFlags flags = MaxwellToVK::ShaderStage(stage); u32 binding = base_binding; - const auto AddBindings = [&](vk::DescriptorType descriptor_type, std::size_t num_entries) { - for (std::size_t i = 0; i < num_entries; ++i) { - bindings.emplace_back(binding++, descriptor_type, 1, stage_flags, nullptr); - } - }; - AddBindings(vk::DescriptorType::eUniformBuffer, entries.const_buffers.size()); - AddBindings(vk::DescriptorType::eStorageBuffer, entries.global_buffers.size()); - AddBindings(vk::DescriptorType::eUniformTexelBuffer, entries.texel_buffers.size()); - AddBindings(vk::DescriptorType::eCombinedImageSampler, entries.samplers.size()); - AddBindings(vk::DescriptorType::eStorageImage, entries.images.size()); + AddBindings<eUniformBuffer>(bindings, binding, flags, entries.const_buffers); + AddBindings<eStorageBuffer>(bindings, binding, flags, entries.global_buffers); + AddBindings<eUniformTexelBuffer>(bindings, binding, flags, entries.texel_buffers); + AddBindings<eCombinedImageSampler>(bindings, binding, flags, entries.samplers); + AddBindings<eStorageImage>(bindings, binding, flags, entries.images); return binding; } @@ -145,8 +161,8 @@ CachedShader::CachedShader(Core::System& system, Tegra::Engines::ShaderType stag GPUVAddr gpu_addr, VAddr cpu_addr, u8* host_ptr, ProgramCode program_code, u32 main_offset) : RasterizerCacheObject{host_ptr}, gpu_addr{gpu_addr}, cpu_addr{cpu_addr}, - program_code{std::move(program_code)}, locker{stage, GetEngine(system, 
stage)}, - shader_ir{this->program_code, main_offset, compiler_settings, locker}, + program_code{std::move(program_code)}, registry{stage, GetEngine(system, stage)}, + shader_ir{this->program_code, main_offset, compiler_settings, registry}, entries{GenerateShaderEntries(shader_ir)} {} CachedShader::~CachedShader() = default; @@ -163,24 +179,19 @@ Tegra::Engines::ConstBufferEngineInterface& CachedShader::GetEngine( VKPipelineCache::VKPipelineCache(Core::System& system, RasterizerVulkan& rasterizer, const VKDevice& device, VKScheduler& scheduler, VKDescriptorPool& descriptor_pool, - VKUpdateDescriptorQueue& update_descriptor_queue) + VKUpdateDescriptorQueue& update_descriptor_queue, + VKRenderPassCache& renderpass_cache) : RasterizerCache{rasterizer}, system{system}, device{device}, scheduler{scheduler}, descriptor_pool{descriptor_pool}, update_descriptor_queue{update_descriptor_queue}, - renderpass_cache(device) {} + renderpass_cache{renderpass_cache} {} VKPipelineCache::~VKPipelineCache() = default; std::array<Shader, Maxwell::MaxShaderProgram> VKPipelineCache::GetShaders() { const auto& gpu = system.GPU().Maxwell3D(); - auto& dirty = system.GPU().Maxwell3D().dirty.shaders; - if (!dirty) { - return last_shaders; - } - dirty = false; std::array<Shader, Maxwell::MaxShaderProgram> shaders; for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) { - const auto& shader_config = gpu.regs.shader_config[index]; const auto program{static_cast<Maxwell::ShaderProgram>(index)}; // Skip stages that are not enabled @@ -262,9 +273,9 @@ VKComputePipeline& VKPipelineCache::GetComputePipeline(const ComputePipelineCach specialization.workgroup_size = key.workgroup_size; specialization.shared_memory_size = key.shared_memory_size; - const SPIRVShader spirv_shader{ - Decompile(device, shader->GetIR(), ShaderType::Compute, specialization), - shader->GetEntries()}; + const SPIRVShader spirv_shader{Decompile(device, shader->GetIR(), ShaderType::Compute, + 
shader->GetRegistry(), specialization), + shader->GetEntries()}; entry = std::make_unique<VKComputePipeline>(device, scheduler, descriptor_pool, update_descriptor_queue, spirv_shader); return *entry; @@ -313,8 +324,7 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) { const auto& gpu = system.GPU().Maxwell3D(); Specialization specialization; - specialization.primitive_topology = fixed_state.input_assembly.topology; - if (specialization.primitive_topology == Maxwell::PrimitiveTopology::Points) { + if (fixed_state.input_assembly.topology == Maxwell::PrimitiveTopology::Points) { ASSERT(fixed_state.input_assembly.point_size != 0.0f); specialization.point_size = fixed_state.input_assembly.point_size; } @@ -322,9 +332,6 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) { specialization.attribute_types[i] = fixed_state.vertex_input.attributes[i].type; } specialization.ndc_minus_one_to_one = fixed_state.rasterizer.ndc_minus_one_to_one; - specialization.tessellation.primitive = fixed_state.tessellation.primitive; - specialization.tessellation.spacing = fixed_state.tessellation.spacing; - specialization.tessellation.clockwise = fixed_state.tessellation.clockwise; SPIRVProgram program; std::vector<vk::DescriptorSetLayoutBinding> bindings; @@ -345,8 +352,9 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) { const std::size_t stage = index == 0 ? 
0 : index - 1; // Stage indices are 0 - 5 const auto program_type = GetShaderType(program_enum); const auto& entries = shader->GetEntries(); - program[stage] = {Decompile(device, shader->GetIR(), program_type, specialization), - entries}; + program[stage] = { + Decompile(device, shader->GetIR(), program_type, shader->GetRegistry(), specialization), + entries}; if (program_enum == Maxwell::ShaderProgram::VertexA) { // VertexB was combined with VertexA, so we skip the VertexB iteration @@ -361,32 +369,45 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) { return {std::move(program), std::move(bindings)}; } -void FillDescriptorUpdateTemplateEntries( - const VKDevice& device, const ShaderEntries& entries, u32& binding, u32& offset, - std::vector<vk::DescriptorUpdateTemplateEntry>& template_entries) { - static constexpr auto entry_size = static_cast<u32>(sizeof(DescriptorUpdateEntry)); - const auto AddEntry = [&](vk::DescriptorType descriptor_type, std::size_t count_) { - const u32 count = static_cast<u32>(count_); - if (descriptor_type == vk::DescriptorType::eUniformTexelBuffer && - device.GetDriverID() == vk::DriverIdKHR::eNvidiaProprietary) { - // Nvidia has a bug where updating multiple uniform texels at once causes the driver to - // crash. 
- for (u32 i = 0; i < count; ++i) { - template_entries.emplace_back(binding + i, 0, 1, descriptor_type, - offset + i * entry_size, entry_size); - } - } else if (count != 0) { - template_entries.emplace_back(binding, 0, count, descriptor_type, offset, entry_size); +template <vk::DescriptorType descriptor_type, class Container> +void AddEntry(std::vector<vk::DescriptorUpdateTemplateEntry>& template_entries, u32& binding, + u32& offset, const Container& container) { + static constexpr u32 entry_size = static_cast<u32>(sizeof(DescriptorUpdateEntry)); + const u32 count = static_cast<u32>(std::size(container)); + + if constexpr (descriptor_type == eCombinedImageSampler) { + for (u32 i = 0; i < count; ++i) { + const u32 num_samplers = container[i].Size(); + template_entries.emplace_back(binding, 0, num_samplers, descriptor_type, offset, + entry_size); + ++binding; + offset += num_samplers * entry_size; } - offset += count * entry_size; - binding += count; - }; + return; + } - AddEntry(vk::DescriptorType::eUniformBuffer, entries.const_buffers.size()); - AddEntry(vk::DescriptorType::eStorageBuffer, entries.global_buffers.size()); - AddEntry(vk::DescriptorType::eUniformTexelBuffer, entries.texel_buffers.size()); - AddEntry(vk::DescriptorType::eCombinedImageSampler, entries.samplers.size()); - AddEntry(vk::DescriptorType::eStorageImage, entries.images.size()); + if constexpr (descriptor_type == eUniformTexelBuffer) { + // Nvidia has a bug where updating multiple uniform texels at once causes the driver to + // crash. 
+ for (u32 i = 0; i < count; ++i) { + template_entries.emplace_back(binding + i, 0, 1, descriptor_type, + offset + i * entry_size, entry_size); + } + } else if (count > 0) { + template_entries.emplace_back(binding, 0, count, descriptor_type, offset, entry_size); + } + offset += count * entry_size; + binding += count; +} + +void FillDescriptorUpdateTemplateEntries( + const ShaderEntries& entries, u32& binding, u32& offset, + std::vector<vk::DescriptorUpdateTemplateEntry>& template_entries) { + AddEntry<eUniformBuffer>(template_entries, offset, binding, entries.const_buffers); + AddEntry<eStorageBuffer>(template_entries, offset, binding, entries.global_buffers); + AddEntry<eUniformTexelBuffer>(template_entries, offset, binding, entries.texel_buffers); + AddEntry<eCombinedImageSampler>(template_entries, offset, binding, entries.samplers); + AddEntry<eStorageImage>(template_entries, offset, binding, entries.images); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.h b/src/video_core/renderer_vulkan/vk_pipeline_cache.h index 8678fc9c3..c4c112290 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.h +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.h @@ -25,7 +25,7 @@ #include "video_core/renderer_vulkan/vk_renderpass_cache.h" #include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_shader_decompiler.h" -#include "video_core/shader/const_buffer_locker.h" +#include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" #include "video_core/surface.h" @@ -132,6 +132,10 @@ public: return shader_ir; } + const VideoCommon::Shader::Registry& GetRegistry() const { + return registry; + } + const VideoCommon::Shader::ShaderIR& GetIR() const { return shader_ir; } @@ -147,7 +151,7 @@ private: GPUVAddr gpu_addr{}; VAddr cpu_addr{}; ProgramCode program_code; - VideoCommon::Shader::ConstBufferLocker locker; + VideoCommon::Shader::Registry registry; 
VideoCommon::Shader::ShaderIR shader_ir; ShaderEntries entries; }; @@ -157,7 +161,8 @@ public: explicit VKPipelineCache(Core::System& system, RasterizerVulkan& rasterizer, const VKDevice& device, VKScheduler& scheduler, VKDescriptorPool& descriptor_pool, - VKUpdateDescriptorQueue& update_descriptor_queue); + VKUpdateDescriptorQueue& update_descriptor_queue, + VKRenderPassCache& renderpass_cache); ~VKPipelineCache(); std::array<Shader, Maxwell::MaxShaderProgram> GetShaders(); @@ -180,8 +185,7 @@ private: VKScheduler& scheduler; VKDescriptorPool& descriptor_pool; VKUpdateDescriptorQueue& update_descriptor_queue; - - VKRenderPassCache renderpass_cache; + VKRenderPassCache& renderpass_cache; std::array<Shader, Maxwell::MaxShaderProgram> last_shaders; @@ -194,7 +198,7 @@ private: }; void FillDescriptorUpdateTemplateEntries( - const VKDevice& device, const ShaderEntries& entries, u32& binding, u32& offset, + const ShaderEntries& entries, u32& binding, u32& offset, std::vector<vk::DescriptorUpdateTemplateEntry>& template_entries); } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 31c078f6a..58c69b786 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -36,6 +36,7 @@ #include "video_core/renderer_vulkan/vk_sampler_cache.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" +#include "video_core/renderer_vulkan/vk_state_tracker.h" #include "video_core/renderer_vulkan/vk_texture_cache.h" #include "video_core/renderer_vulkan/vk_update_descriptor.h" @@ -105,17 +106,20 @@ void TransitionImages(const std::vector<ImageView>& views, vk::PipelineStageFlag template <typename Engine, typename Entry> Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry, - std::size_t stage) { + std::size_t stage, std::size_t index = 0) { const 
auto stage_type = static_cast<Tegra::Engines::ShaderType>(stage); if (entry.IsBindless()) { const Tegra::Texture::TextureHandle tex_handle = engine.AccessConstBuffer32(stage_type, entry.GetBuffer(), entry.GetOffset()); return engine.GetTextureInfo(tex_handle); } + const auto& gpu_profile = engine.AccessGuestDriverProfile(); + const u32 entry_offset = static_cast<u32>(index * gpu_profile.GetTextureHandlerSize()); + const u32 offset = entry.GetOffset() + entry_offset; if constexpr (std::is_same_v<Engine, Tegra::Engines::Maxwell3D>) { - return engine.GetStageTexture(stage_type, entry.GetOffset()); + return engine.GetStageTexture(stage_type, offset); } else { - return engine.GetTexture(entry.GetOffset()); + return engine.GetTexture(offset); } } @@ -277,17 +281,19 @@ void RasterizerVulkan::DrawParameters::Draw(vk::CommandBuffer cmdbuf, RasterizerVulkan::RasterizerVulkan(Core::System& system, Core::Frontend::EmuWindow& renderer, VKScreenInfo& screen_info, const VKDevice& device, VKResourceManager& resource_manager, - VKMemoryManager& memory_manager, VKScheduler& scheduler) + VKMemoryManager& memory_manager, StateTracker& state_tracker, + VKScheduler& scheduler) : RasterizerAccelerated{system.Memory()}, system{system}, render_window{renderer}, screen_info{screen_info}, device{device}, resource_manager{resource_manager}, - memory_manager{memory_manager}, scheduler{scheduler}, + memory_manager{memory_manager}, state_tracker{state_tracker}, scheduler{scheduler}, staging_pool(device, memory_manager, scheduler), descriptor_pool(device), - update_descriptor_queue(device, scheduler), + update_descriptor_queue(device, scheduler), renderpass_cache(device), quad_array_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue), uint8_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue), texture_cache(system, *this, device, resource_manager, memory_manager, scheduler, staging_pool), - pipeline_cache(system, *this, device, scheduler, 
descriptor_pool, update_descriptor_queue), + pipeline_cache(system, *this, device, scheduler, descriptor_pool, update_descriptor_queue, + renderpass_cache), buffer_cache(*this, system, device, memory_manager, scheduler, staging_pool), sampler_cache(device), query_cache(system, *this, device, scheduler) { scheduler.SetQueryCache(query_cache); @@ -342,6 +348,8 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) { [&pipeline](auto cmdbuf, auto& dld) { cmdbuf.setCheckpointNV(&pipeline, dld); }); } + BeginTransformFeedback(); + const auto pipeline_layout = pipeline.GetLayout(); const auto descriptor_set = pipeline.CommitDescriptorSet(); scheduler.Record([pipeline_layout, descriptor_set, draw_params](auto cmdbuf, auto& dld) { @@ -351,18 +359,23 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) { } draw_params.Draw(cmdbuf, dld); }); + + EndTransformFeedback(); } void RasterizerVulkan::Clear() { MICROPROFILE_SCOPE(Vulkan_Clearing); - query_cache.UpdateCounters(); - const auto& gpu = system.GPU().Maxwell3D(); if (!system.GPU().Maxwell3D().ShouldExecute()) { return; } + sampled_views.clear(); + image_views.clear(); + + query_cache.UpdateCounters(); + const auto& regs = gpu.regs; const bool use_color = regs.clear_buffers.R || regs.clear_buffers.G || regs.clear_buffers.B || regs.clear_buffers.A; @@ -371,52 +384,54 @@ void RasterizerVulkan::Clear() { if (!use_color && !use_depth && !use_stencil) { return; } - // Clearing images requires to be out of a renderpass - scheduler.RequestOutsideRenderPassOperationContext(); - // TODO(Rodrigo): Implement clears rendering a quad or using beginning a renderpass. 
+ [[maybe_unused]] const auto texceptions = UpdateAttachments(); + DEBUG_ASSERT(texceptions.none()); + SetupImageTransitions(0, color_attachments, zeta_attachment); - if (use_color) { - View color_view; - { - MICROPROFILE_SCOPE(Vulkan_RenderTargets); - color_view = texture_cache.GetColorBufferSurface(regs.clear_buffers.RT.Value(), false); - } + const vk::RenderPass renderpass = renderpass_cache.GetRenderPass(GetRenderPassParams(0)); + const auto [framebuffer, render_area] = ConfigureFramebuffers(renderpass); + scheduler.RequestRenderpass({renderpass, framebuffer, {{0, 0}, render_area}, 0, nullptr}); - color_view->Transition(vk::ImageLayout::eTransferDstOptimal, - vk::PipelineStageFlagBits::eTransfer, - vk::AccessFlagBits::eTransferWrite); + const auto& scissor = regs.scissor_test[0]; + const vk::Offset2D scissor_offset(scissor.min_x, scissor.min_y); + vk::Extent2D scissor_extent{scissor.max_x - scissor.min_x, scissor.max_y - scissor.min_y}; + scissor_extent.width = std::min(scissor_extent.width, render_area.width); + scissor_extent.height = std::min(scissor_extent.height, render_area.height); + const u32 layer = regs.clear_buffers.layer; + const vk::ClearRect clear_rect({scissor_offset, scissor_extent}, layer, 1); + + if (use_color) { const std::array clear_color = {regs.clear_color[0], regs.clear_color[1], regs.clear_color[2], regs.clear_color[3]}; - const vk::ClearColorValue clear(clear_color); - scheduler.Record([image = color_view->GetImage(), - subresource = color_view->GetImageSubresourceRange(), - clear](auto cmdbuf, auto& dld) { - cmdbuf.clearColorImage(image, vk::ImageLayout::eTransferDstOptimal, clear, subresource, - dld); + const vk::ClearValue clear_value{clear_color}; + const u32 color_attachment = regs.clear_buffers.RT; + scheduler.Record([color_attachment, clear_value, clear_rect](auto cmdbuf, auto& dld) { + const vk::ClearAttachment attachment(vk::ImageAspectFlagBits::eColor, color_attachment, + clear_value); + cmdbuf.clearAttachments(1, 
&attachment, 1, &clear_rect, dld); }); } - if (use_depth || use_stencil) { - View zeta_surface; - { - MICROPROFILE_SCOPE(Vulkan_RenderTargets); - zeta_surface = texture_cache.GetDepthBufferSurface(false); - } - zeta_surface->Transition(vk::ImageLayout::eTransferDstOptimal, - vk::PipelineStageFlagBits::eTransfer, - vk::AccessFlagBits::eTransferWrite); - - const vk::ClearDepthStencilValue clear(regs.clear_depth, - static_cast<u32>(regs.clear_stencil)); - scheduler.Record([image = zeta_surface->GetImage(), - subresource = zeta_surface->GetImageSubresourceRange(), - clear](auto cmdbuf, auto& dld) { - cmdbuf.clearDepthStencilImage(image, vk::ImageLayout::eTransferDstOptimal, clear, - subresource, dld); - }); + if (!use_depth && !use_stencil) { + return; + } + vk::ImageAspectFlags aspect_flags; + if (use_depth) { + aspect_flags |= vk::ImageAspectFlagBits::eDepth; + } + if (use_stencil) { + aspect_flags |= vk::ImageAspectFlagBits::eStencil; } + + scheduler.Record([clear_depth = regs.clear_depth, clear_stencil = regs.clear_stencil, + clear_rect, aspect_flags](auto cmdbuf, auto& dld) { + const vk::ClearDepthStencilValue clear_zeta(clear_depth, clear_stencil); + const vk::ClearValue clear_value{clear_zeta}; + const vk::ClearAttachment attachment(aspect_flags, 0, clear_value); + cmdbuf.clearAttachments(1, &attachment, 1, &clear_rect, dld); + }); } void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) { @@ -533,8 +548,6 @@ bool RasterizerVulkan::AccelerateDisplay(const Tegra::FramebufferConfig& config, // Verify that the cached surface is the same size and format as the requested framebuffer const auto& params{surface->GetSurfaceParams()}; - const auto& pixel_format{ - VideoCore::Surface::PixelFormatFromGPUPixelFormat(config.pixel_format)}; ASSERT_MSG(params.width == config.width, "Framebuffer width is different"); ASSERT_MSG(params.height == config.height, "Framebuffer height is different"); @@ -545,6 +558,10 @@ bool RasterizerVulkan::AccelerateDisplay(const 
Tegra::FramebufferConfig& config, return true; } +void RasterizerVulkan::SetupDirtyFlags() { + state_tracker.Initialize(); +} + void RasterizerVulkan::FlushWork() { static constexpr u32 DRAWS_TO_DISPATCH = 4096; @@ -568,9 +585,9 @@ void RasterizerVulkan::FlushWork() { RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() { MICROPROFILE_SCOPE(Vulkan_RenderTargets); - auto& dirty = system.GPU().Maxwell3D().dirty; - const bool update_rendertargets = dirty.render_settings; - dirty.render_settings = false; + auto& dirty = system.GPU().Maxwell3D().dirty.flags; + const bool update_rendertargets = dirty[VideoCommon::Dirty::RenderTargets]; + dirty[VideoCommon::Dirty::RenderTargets] = false; texture_cache.GuardRenderTargets(true); @@ -611,33 +628,34 @@ bool RasterizerVulkan::WalkAttachmentOverlaps(const CachedSurfaceView& attachmen std::tuple<vk::Framebuffer, vk::Extent2D> RasterizerVulkan::ConfigureFramebuffers( vk::RenderPass renderpass) { FramebufferCacheKey key{renderpass, std::numeric_limits<u32>::max(), - std::numeric_limits<u32>::max()}; + std::numeric_limits<u32>::max(), std::numeric_limits<u32>::max()}; - const auto MarkAsModifiedAndPush = [&](const View& view) { - if (view == nullptr) { + const auto try_push = [&](const View& view) { + if (!view) { return false; } key.views.push_back(view->GetHandle()); key.width = std::min(key.width, view->GetWidth()); key.height = std::min(key.height, view->GetHeight()); + key.layers = std::min(key.layers, view->GetNumLayers()); return true; }; for (std::size_t index = 0; index < std::size(color_attachments); ++index) { - if (MarkAsModifiedAndPush(color_attachments[index])) { + if (try_push(color_attachments[index])) { texture_cache.MarkColorBufferInUse(index); } } - if (MarkAsModifiedAndPush(zeta_attachment)) { + if (try_push(zeta_attachment)) { texture_cache.MarkDepthBufferInUse(); } const auto [fbentry, is_cache_miss] = framebuffer_cache.try_emplace(key); auto& framebuffer = fbentry->second; if (is_cache_miss) { 
- const vk::FramebufferCreateInfo framebuffer_ci({}, key.renderpass, - static_cast<u32>(key.views.size()), - key.views.data(), key.width, key.height, 1); + const vk::FramebufferCreateInfo framebuffer_ci( + {}, key.renderpass, static_cast<u32>(key.views.size()), key.views.data(), key.width, + key.height, key.layers); const auto dev = device.GetLogical(); const auto& dld = device.GetDispatchLoader(); framebuffer = dev.createFramebufferUnique(framebuffer_ci, nullptr, dld); @@ -719,13 +737,51 @@ void RasterizerVulkan::SetupImageTransitions( } void RasterizerVulkan::UpdateDynamicStates() { - auto& gpu = system.GPU().Maxwell3D(); - UpdateViewportsState(gpu); - UpdateScissorsState(gpu); - UpdateDepthBias(gpu); - UpdateBlendConstants(gpu); - UpdateDepthBounds(gpu); - UpdateStencilFaces(gpu); + auto& regs = system.GPU().Maxwell3D().regs; + UpdateViewportsState(regs); + UpdateScissorsState(regs); + UpdateDepthBias(regs); + UpdateBlendConstants(regs); + UpdateDepthBounds(regs); + UpdateStencilFaces(regs); +} + +void RasterizerVulkan::BeginTransformFeedback() { + const auto& regs = system.GPU().Maxwell3D().regs; + if (regs.tfb_enabled == 0) { + return; + } + + UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) || + regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) || + regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry)); + + UNIMPLEMENTED_IF(regs.tfb_bindings[1].buffer_enable); + UNIMPLEMENTED_IF(regs.tfb_bindings[2].buffer_enable); + UNIMPLEMENTED_IF(regs.tfb_bindings[3].buffer_enable); + + const auto& binding = regs.tfb_bindings[0]; + UNIMPLEMENTED_IF(binding.buffer_enable == 0); + UNIMPLEMENTED_IF(binding.buffer_offset != 0); + + const GPUVAddr gpu_addr = binding.Address(); + const std::size_t size = binding.buffer_size; + const auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true); + + scheduler.Record([buffer = *buffer, offset = offset, size](auto cmdbuf, auto& dld) { + 
cmdbuf.bindTransformFeedbackBuffersEXT(0, {buffer}, {offset}, {size}, dld); + cmdbuf.beginTransformFeedbackEXT(0, {}, {}, dld); + }); +} + +void RasterizerVulkan::EndTransformFeedback() { + const auto& regs = system.GPU().Maxwell3D().regs; + if (regs.tfb_enabled == 0) { + return; + } + + scheduler.Record( + [](auto cmdbuf, auto& dld) { cmdbuf.endTransformFeedbackEXT(0, {}, {}, dld); }); } void RasterizerVulkan::SetupVertexArrays(FixedPipelineState::VertexInput& vertex_input, @@ -835,14 +891,16 @@ void RasterizerVulkan::SetupGraphicsTextures(const ShaderEntries& entries, std:: MICROPROFILE_SCOPE(Vulkan_Textures); const auto& gpu = system.GPU().Maxwell3D(); for (const auto& entry : entries.samplers) { - const auto texture = GetTextureInfo(gpu, entry, stage); - SetupTexture(texture, entry); + for (std::size_t i = 0; i < entry.Size(); ++i) { + const auto texture = GetTextureInfo(gpu, entry, stage, i); + SetupTexture(texture, entry); + } } } void RasterizerVulkan::SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage) { MICROPROFILE_SCOPE(Vulkan_Images); - const auto& gpu = system.GPU().KeplerCompute(); + const auto& gpu = system.GPU().Maxwell3D(); for (const auto& entry : entries.images) { const auto tic = GetTextureInfo(gpu, entry, stage).tic; SetupImage(tic, entry); @@ -885,8 +943,10 @@ void RasterizerVulkan::SetupComputeTextures(const ShaderEntries& entries) { MICROPROFILE_SCOPE(Vulkan_Textures); const auto& gpu = system.GPU().KeplerCompute(); for (const auto& entry : entries.samplers) { - const auto texture = GetTextureInfo(gpu, entry, ComputeShaderIndex); - SetupTexture(texture, entry); + for (std::size_t i = 0; i < entry.Size(); ++i) { + const auto texture = GetTextureInfo(gpu, entry, ComputeShaderIndex, i); + SetupTexture(texture, entry); + } } } @@ -901,6 +961,13 @@ void RasterizerVulkan::SetupComputeImages(const ShaderEntries& entries) { void RasterizerVulkan::SetupConstBuffer(const ConstBufferEntry& entry, const Tegra::Engines::ConstBufferInfo& 
buffer) { + if (!buffer.enabled) { + // Set values to zero to unbind buffers + update_descriptor_queue.AddBuffer(buffer_cache.GetEmptyBuffer(sizeof(float)), 0, + sizeof(float)); + return; + } + // Align the size to avoid bad std140 interactions const std::size_t size = Common::AlignUp(CalculateConstBufferSize(entry, buffer), 4 * sizeof(float)); @@ -971,12 +1038,10 @@ void RasterizerVulkan::SetupImage(const Tegra::Texture::TICEntry& tic, const Ima image_views.push_back(ImageView{std::move(view), image_layout}); } -void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D& gpu) { - if (!gpu.dirty.viewport_transform && scheduler.TouchViewports()) { +void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs) { + if (!state_tracker.TouchViewports()) { return; } - gpu.dirty.viewport_transform = false; - const auto& regs = gpu.regs; const std::array viewports{ GetViewportState(device, regs, 0), GetViewportState(device, regs, 1), GetViewportState(device, regs, 2), GetViewportState(device, regs, 3), @@ -991,12 +1056,10 @@ void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D& gpu) { }); } -void RasterizerVulkan::UpdateScissorsState(Tegra::Engines::Maxwell3D& gpu) { - if (!gpu.dirty.scissor_test && scheduler.TouchScissors()) { +void RasterizerVulkan::UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs) { + if (!state_tracker.TouchScissors()) { return; } - gpu.dirty.scissor_test = false; - const auto& regs = gpu.regs; const std::array scissors = { GetScissorState(regs, 0), GetScissorState(regs, 1), GetScissorState(regs, 2), GetScissorState(regs, 3), GetScissorState(regs, 4), GetScissorState(regs, 5), @@ -1009,46 +1072,39 @@ void RasterizerVulkan::UpdateScissorsState(Tegra::Engines::Maxwell3D& gpu) { }); } -void RasterizerVulkan::UpdateDepthBias(Tegra::Engines::Maxwell3D& gpu) { - if (!gpu.dirty.polygon_offset && scheduler.TouchDepthBias()) { +void RasterizerVulkan::UpdateDepthBias(Tegra::Engines::Maxwell3D::Regs& regs) 
{ + if (!state_tracker.TouchDepthBias()) { return; } - gpu.dirty.polygon_offset = false; - const auto& regs = gpu.regs; scheduler.Record([constant = regs.polygon_offset_units, clamp = regs.polygon_offset_clamp, factor = regs.polygon_offset_factor](auto cmdbuf, auto& dld) { cmdbuf.setDepthBias(constant, clamp, factor / 2.0f, dld); }); } -void RasterizerVulkan::UpdateBlendConstants(Tegra::Engines::Maxwell3D& gpu) { - if (!gpu.dirty.blend_state && scheduler.TouchBlendConstants()) { +void RasterizerVulkan::UpdateBlendConstants(Tegra::Engines::Maxwell3D::Regs& regs) { + if (!state_tracker.TouchBlendConstants()) { return; } - gpu.dirty.blend_state = false; - const std::array blend_color = {gpu.regs.blend_color.r, gpu.regs.blend_color.g, - gpu.regs.blend_color.b, gpu.regs.blend_color.a}; + const std::array blend_color = {regs.blend_color.r, regs.blend_color.g, regs.blend_color.b, + regs.blend_color.a}; scheduler.Record([blend_color](auto cmdbuf, auto& dld) { cmdbuf.setBlendConstants(blend_color.data(), dld); }); } -void RasterizerVulkan::UpdateDepthBounds(Tegra::Engines::Maxwell3D& gpu) { - if (!gpu.dirty.depth_bounds_values && scheduler.TouchDepthBounds()) { +void RasterizerVulkan::UpdateDepthBounds(Tegra::Engines::Maxwell3D::Regs& regs) { + if (!state_tracker.TouchDepthBounds()) { return; } - gpu.dirty.depth_bounds_values = false; - const auto& regs = gpu.regs; scheduler.Record([min = regs.depth_bounds[0], max = regs.depth_bounds[1]]( auto cmdbuf, auto& dld) { cmdbuf.setDepthBounds(min, max, dld); }); } -void RasterizerVulkan::UpdateStencilFaces(Tegra::Engines::Maxwell3D& gpu) { - if (!gpu.dirty.stencil_test && scheduler.TouchStencilValues()) { +void RasterizerVulkan::UpdateStencilFaces(Tegra::Engines::Maxwell3D::Regs& regs) { + if (!state_tracker.TouchStencilProperties()) { return; } - gpu.dirty.stencil_test = false; - const auto& regs = gpu.regs; if (regs.stencil_two_side_enable) { // Separate values per face scheduler.Record( @@ -1099,7 +1155,7 @@ std::size_t 
RasterizerVulkan::CalculateVertexArraysSize() const { // This implementation assumes that all attributes are used in the shader. const GPUVAddr start{regs.vertex_array[index].StartAddress()}; const GPUVAddr end{regs.vertex_array_limit[index].LimitAddress()}; - DEBUG_ASSERT(end > start); + DEBUG_ASSERT(end >= start); size += (end - start + 1) * regs.vertex_array[index].enable; } diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 138903d60..3185868e9 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -56,6 +56,7 @@ struct FramebufferCacheKey { vk::RenderPass renderpass{}; u32 width = 0; u32 height = 0; + u32 layers = 0; ImageViewsPack views; std::size_t Hash() const noexcept { @@ -66,12 +67,17 @@ struct FramebufferCacheKey { } boost::hash_combine(hash, width); boost::hash_combine(hash, height); + boost::hash_combine(hash, layers); return hash; } bool operator==(const FramebufferCacheKey& rhs) const noexcept { - return std::tie(renderpass, views, width, height) == - std::tie(rhs.renderpass, rhs.views, rhs.width, rhs.height); + return std::tie(renderpass, views, width, height, layers) == + std::tie(rhs.renderpass, rhs.views, rhs.width, rhs.height, rhs.layers); + } + + bool operator!=(const FramebufferCacheKey& rhs) const noexcept { + return !operator==(rhs); } }; @@ -90,6 +96,7 @@ struct hash<Vulkan::FramebufferCacheKey> { namespace Vulkan { +class StateTracker; class BufferBindings; struct ImageView { @@ -102,7 +109,7 @@ public: explicit RasterizerVulkan(Core::System& system, Core::Frontend::EmuWindow& render_window, VKScreenInfo& screen_info, const VKDevice& device, VKResourceManager& resource_manager, VKMemoryManager& memory_manager, - VKScheduler& scheduler); + StateTracker& state_tracker, VKScheduler& scheduler); ~RasterizerVulkan() override; void Draw(bool is_indexed, bool is_instanced) override; @@ -121,6 +128,7 @@ public: const 
Tegra::Engines::Fermi2D::Config& copy_config) override; bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, u32 pixel_stride) override; + void SetupDirtyFlags() override; /// Maximum supported size that a constbuffer can have in bytes. static constexpr std::size_t MaxConstbufferSize = 0x10000; @@ -161,6 +169,10 @@ private: void UpdateDynamicStates(); + void BeginTransformFeedback(); + + void EndTransformFeedback(); + bool WalkAttachmentOverlaps(const CachedSurfaceView& attachment); void SetupVertexArrays(FixedPipelineState::VertexInput& vertex_input, @@ -209,12 +221,12 @@ private: void SetupImage(const Tegra::Texture::TICEntry& tic, const ImageEntry& entry); - void UpdateViewportsState(Tegra::Engines::Maxwell3D& gpu); - void UpdateScissorsState(Tegra::Engines::Maxwell3D& gpu); - void UpdateDepthBias(Tegra::Engines::Maxwell3D& gpu); - void UpdateBlendConstants(Tegra::Engines::Maxwell3D& gpu); - void UpdateDepthBounds(Tegra::Engines::Maxwell3D& gpu); - void UpdateStencilFaces(Tegra::Engines::Maxwell3D& gpu); + void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs); + void UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs); + void UpdateDepthBias(Tegra::Engines::Maxwell3D::Regs& regs); + void UpdateBlendConstants(Tegra::Engines::Maxwell3D::Regs& regs); + void UpdateDepthBounds(Tegra::Engines::Maxwell3D::Regs& regs); + void UpdateStencilFaces(Tegra::Engines::Maxwell3D::Regs& regs); std::size_t CalculateGraphicsStreamBufferSize(bool is_indexed) const; @@ -235,11 +247,13 @@ private: const VKDevice& device; VKResourceManager& resource_manager; VKMemoryManager& memory_manager; + StateTracker& state_tracker; VKScheduler& scheduler; VKStagingBufferPool staging_pool; VKDescriptorPool descriptor_pool; VKUpdateDescriptorQueue update_descriptor_queue; + VKRenderPassCache renderpass_cache; QuadArrayPass quad_array_pass; Uint8Pass uint8_pass; diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp 
b/src/video_core/renderer_vulkan/vk_scheduler.cpp index 92bd6c344..b61d4fe63 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -2,6 +2,12 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include <memory> +#include <mutex> +#include <optional> +#include <thread> +#include <utility> + #include "common/assert.h" #include "common/microprofile.h" #include "video_core/renderer_vulkan/declarations.h" @@ -9,6 +15,7 @@ #include "video_core/renderer_vulkan/vk_query_cache.h" #include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_scheduler.h" +#include "video_core/renderer_vulkan/vk_state_tracker.h" namespace Vulkan { @@ -29,9 +36,10 @@ void VKScheduler::CommandChunk::ExecuteAll(vk::CommandBuffer cmdbuf, last = nullptr; } -VKScheduler::VKScheduler(const VKDevice& device, VKResourceManager& resource_manager) - : device{device}, resource_manager{resource_manager}, next_fence{ - &resource_manager.CommitFence()} { +VKScheduler::VKScheduler(const VKDevice& device, VKResourceManager& resource_manager, + StateTracker& state_tracker) + : device{device}, resource_manager{resource_manager}, state_tracker{state_tracker}, + next_fence{&resource_manager.CommitFence()} { AcquireNewChunk(); AllocateNewContext(); worker_thread = std::thread(&VKScheduler::WorkerThread, this); @@ -157,12 +165,7 @@ void VKScheduler::AllocateNewContext() { void VKScheduler::InvalidateState() { state.graphics_pipeline = nullptr; - state.viewports = false; - state.scissors = false; - state.depth_bias = false; - state.blend_constants = false; - state.depth_bounds = false; - state.stencil_values = false; + state_tracker.InvalidateCommandBufferState(); } void VKScheduler::EndPendingOperations() { diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index 62fd7858b..c7cc291c3 100644 --- 
a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -17,6 +17,7 @@ namespace Vulkan { +class StateTracker; class VKDevice; class VKFence; class VKQueryCache; @@ -43,7 +44,8 @@ private: /// OpenGL-like operations on Vulkan command buffers. class VKScheduler { public: - explicit VKScheduler(const VKDevice& device, VKResourceManager& resource_manager); + explicit VKScheduler(const VKDevice& device, VKResourceManager& resource_manager, + StateTracker& state_tracker); ~VKScheduler(); /// Sends the current execution context to the GPU. @@ -74,36 +76,6 @@ public: query_cache = &query_cache_; } - /// Returns true when viewports have been set in the current command buffer. - bool TouchViewports() { - return std::exchange(state.viewports, true); - } - - /// Returns true when scissors have been set in the current command buffer. - bool TouchScissors() { - return std::exchange(state.scissors, true); - } - - /// Returns true when depth bias have been set in the current command buffer. - bool TouchDepthBias() { - return std::exchange(state.depth_bias, true); - } - - /// Returns true when blend constants have been set in the current command buffer. - bool TouchBlendConstants() { - return std::exchange(state.blend_constants, true); - } - - /// Returns true when depth bounds have been set in the current command buffer. - bool TouchDepthBounds() { - return std::exchange(state.depth_bounds, true); - } - - /// Returns true when stencil values have been set in the current command buffer. - bool TouchStencilValues() { - return std::exchange(state.stencil_values, true); - } - /// Send work to a separate thread. 
template <typename T> void Record(T&& command) { @@ -217,6 +189,8 @@ private: const VKDevice& device; VKResourceManager& resource_manager; + StateTracker& state_tracker; + VKQueryCache* query_cache = nullptr; vk::CommandBuffer current_cmdbuf; @@ -226,12 +200,6 @@ private: struct State { std::optional<vk::RenderPassBeginInfo> renderpass; vk::Pipeline graphics_pipeline; - bool viewports = false; - bool scissors = false; - bool depth_bias = false; - bool blend_constants = false; - bool depth_bounds = false; - bool stencil_values = false; } state; std::unique_ptr<CommandChunk> chunk; diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index 2da622d15..51ecb5567 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ -5,7 +5,9 @@ #include <functional> #include <limits> #include <map> +#include <optional> #include <type_traits> +#include <unordered_map> #include <utility> #include <fmt/format.h> @@ -24,6 +26,7 @@ #include "video_core/renderer_vulkan/vk_shader_decompiler.h" #include "video_core/shader/node.h" #include "video_core/shader/shader_ir.h" +#include "video_core/shader/transform_feedback.h" namespace Vulkan { @@ -69,8 +72,9 @@ struct TexelBuffer { struct SampledImage { Id image_type{}; - Id sampled_image_type{}; - Id sampler{}; + Id sampler_type{}; + Id sampler_pointer_type{}; + Id variable{}; }; struct StorageImage { @@ -92,6 +96,12 @@ struct VertexIndices { std::optional<u32> clip_distances; }; +struct GenericVaryingDescription { + Id id = nullptr; + u32 first_element = 0; + bool is_scalar = false; +}; + spv::Dim GetSamplerDim(const Sampler& sampler) { ASSERT(!sampler.IsBuffer()); switch (sampler.GetType()) { @@ -265,9 +275,13 @@ bool IsPrecise(Operation operand) { class SPIRVDecompiler final : public Sirit::Module { public: explicit SPIRVDecompiler(const VKDevice& device, const ShaderIR& ir, ShaderType stage, 
- const Specialization& specialization) + const Registry& registry, const Specialization& specialization) : Module(0x00010300), device{device}, ir{ir}, stage{stage}, header{ir.GetHeader()}, - specialization{specialization} { + registry{registry}, specialization{specialization} { + if (stage != ShaderType::Compute) { + transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo()); + } + AddCapability(spv::Capability::Shader); AddCapability(spv::Capability::UniformAndStorageBuffer16BitAccess); AddCapability(spv::Capability::ImageQuery); @@ -285,6 +299,15 @@ public: AddExtension("SPV_KHR_variable_pointers"); AddExtension("SPV_KHR_shader_draw_parameters"); + if (!transform_feedback.empty()) { + if (device.IsExtTransformFeedbackSupported()) { + AddCapability(spv::Capability::TransformFeedback); + } else { + LOG_ERROR(Render_Vulkan, "Shader requires transform feedbacks but these are not " + "supported on this device"); + } + } + if (ir.UsesLayer() || ir.UsesViewportIndex()) { if (ir.UsesViewportIndex()) { AddCapability(spv::Capability::MultiViewport); @@ -295,7 +318,7 @@ public: } } - if (device.IsShaderStorageImageReadWithoutFormatSupported()) { + if (device.IsFormatlessImageLoadSupported()) { AddCapability(spv::Capability::StorageImageReadWithoutFormat); } @@ -317,25 +340,29 @@ public: AddExecutionMode(main, spv::ExecutionMode::OutputVertices, header.common2.threads_per_input_primitive); break; - case ShaderType::TesselationEval: + case ShaderType::TesselationEval: { + const auto& info = registry.GetGraphicsInfo(); AddCapability(spv::Capability::Tessellation); AddEntryPoint(spv::ExecutionModel::TessellationEvaluation, main, "main", interfaces); - AddExecutionMode(main, GetExecutionMode(specialization.tessellation.primitive)); - AddExecutionMode(main, GetExecutionMode(specialization.tessellation.spacing)); - AddExecutionMode(main, specialization.tessellation.clockwise + AddExecutionMode(main, GetExecutionMode(info.tessellation_primitive)); + 
AddExecutionMode(main, GetExecutionMode(info.tessellation_spacing)); + AddExecutionMode(main, info.tessellation_clockwise ? spv::ExecutionMode::VertexOrderCw : spv::ExecutionMode::VertexOrderCcw); break; - case ShaderType::Geometry: + } + case ShaderType::Geometry: { + const auto& info = registry.GetGraphicsInfo(); AddCapability(spv::Capability::Geometry); AddEntryPoint(spv::ExecutionModel::Geometry, main, "main", interfaces); - AddExecutionMode(main, GetExecutionMode(specialization.primitive_topology)); + AddExecutionMode(main, GetExecutionMode(info.primitive_topology)); AddExecutionMode(main, GetExecutionMode(header.common3.output_topology)); AddExecutionMode(main, spv::ExecutionMode::OutputVertices, header.common4.max_output_vertices); // TODO(Rodrigo): Where can we get this info from? AddExecutionMode(main, spv::ExecutionMode::Invocations, 1U); break; + } case ShaderType::Fragment: AddEntryPoint(spv::ExecutionModel::Fragment, main, "main", interfaces); AddExecutionMode(main, spv::ExecutionMode::OriginUpperLeft); @@ -544,7 +571,8 @@ private: if (stage != ShaderType::Geometry) { return; } - const u32 num_input = GetNumPrimitiveTopologyVertices(specialization.primitive_topology); + const auto& info = registry.GetGraphicsInfo(); + const u32 num_input = GetNumPrimitiveTopologyVertices(info.primitive_topology); DeclareInputVertexArray(num_input); DeclareOutputVertex(); } @@ -741,12 +769,34 @@ private: } void DeclareOutputAttributes() { + if (stage == ShaderType::Compute || stage == ShaderType::Fragment) { + return; + } + + UNIMPLEMENTED_IF(registry.GetGraphicsInfo().tfb_enabled && stage != ShaderType::Vertex); for (const auto index : ir.GetOutputAttributes()) { if (!IsGenericAttribute(index)) { continue; } - const u32 location = GetGenericAttributeLocation(index); - Id type = t_float4; + DeclareOutputAttribute(index); + } + } + + void DeclareOutputAttribute(Attribute::Index index) { + static constexpr std::string_view swizzle = "xyzw"; + + const u32 location = 
GetGenericAttributeLocation(index); + u8 element = 0; + while (element < 4) { + const std::size_t remainder = 4 - element; + + std::size_t num_components = remainder; + const std::optional tfb = GetTransformFeedbackInfo(index, element); + if (tfb) { + num_components = tfb->components; + } + + Id type = GetTypeVectorDefinitionLut(Type::Float).at(num_components - 1); Id varying_default = v_varying_default; if (IsOutputAttributeArray()) { const u32 num = GetNumOutputVertices(); @@ -759,15 +809,47 @@ private: } type = TypePointer(spv::StorageClass::Output, type); + std::string name = fmt::format("out_attr{}", location); + if (num_components < 4 || element > 0) { + name = fmt::format("{}_{}", name, swizzle.substr(element, num_components)); + } + const Id id = OpVariable(type, spv::StorageClass::Output, varying_default); - Name(AddGlobalVariable(id), fmt::format("out_attr{}", location)); - output_attributes.emplace(index, id); + Name(AddGlobalVariable(id), name); + + GenericVaryingDescription description; + description.id = id; + description.first_element = element; + description.is_scalar = num_components == 1; + for (u32 i = 0; i < num_components; ++i) { + const u8 offset = static_cast<u8>(static_cast<u32>(index) * 4 + element + i); + output_attributes.emplace(offset, description); + } interfaces.push_back(id); Decorate(id, spv::Decoration::Location, location); + if (element > 0) { + Decorate(id, spv::Decoration::Component, static_cast<u32>(element)); + } + if (tfb && device.IsExtTransformFeedbackSupported()) { + Decorate(id, spv::Decoration::XfbBuffer, static_cast<u32>(tfb->buffer)); + Decorate(id, spv::Decoration::XfbStride, static_cast<u32>(tfb->stride)); + Decorate(id, spv::Decoration::Offset, static_cast<u32>(tfb->offset)); + } + + element = static_cast<u8>(static_cast<std::size_t>(element) + num_components); } } + std::optional<VaryingTFB> GetTransformFeedbackInfo(Attribute::Index index, u8 element = 0) { + const u8 location = 
static_cast<u8>(static_cast<u32>(index) * 4 + element); + const auto it = transform_feedback.find(location); + if (it == transform_feedback.end()) { + return {}; + } + return it->second; + } + u32 DeclareConstantBuffers(u32 binding) { for (const auto& [index, size] : ir.GetConstantBuffers()) { const Id type = device.IsKhrUniformBufferStandardLayoutSupported() ? t_cbuf_scalar_ubo @@ -833,16 +915,20 @@ private: constexpr int sampled = 1; constexpr auto format = spv::ImageFormat::Unknown; const Id image_type = TypeImage(t_float, dim, depth, arrayed, ms, sampled, format); - const Id sampled_image_type = TypeSampledImage(image_type); - const Id pointer_type = - TypePointer(spv::StorageClass::UniformConstant, sampled_image_type); + const Id sampler_type = TypeSampledImage(image_type); + const Id sampler_pointer_type = + TypePointer(spv::StorageClass::UniformConstant, sampler_type); + const Id type = sampler.IsIndexed() + ? TypeArray(sampler_type, Constant(t_uint, sampler.Size())) + : sampler_type; + const Id pointer_type = TypePointer(spv::StorageClass::UniformConstant, type); const Id id = OpVariable(pointer_type, spv::StorageClass::UniformConstant); AddGlobalVariable(Name(id, fmt::format("sampler_{}", sampler.GetIndex()))); Decorate(id, spv::Decoration::Binding, binding++); Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET); - sampled_images.emplace(sampler.GetIndex(), - SampledImage{image_type, sampled_image_type, id}); + sampled_images.emplace(sampler.GetIndex(), SampledImage{image_type, sampler_type, + sampler_pointer_type, id}); } return binding; } @@ -893,7 +979,7 @@ private: u32 GetNumInputVertices() const { switch (stage) { case ShaderType::Geometry: - return GetNumPrimitiveTopologyVertices(specialization.primitive_topology); + return GetNumPrimitiveTopologyVertices(registry.GetGraphicsInfo().primitive_topology); case ShaderType::TesselationControl: case ShaderType::TesselationEval: return NumInputPatches; @@ -1341,8 +1427,14 @@ private: } default: if 
(IsGenericAttribute(attribute)) { - const Id composite = output_attributes.at(attribute); - return {ArrayPass(t_out_float, composite, {element}), Type::Float}; + const u8 offset = static_cast<u8>(static_cast<u8>(attribute) * 4 + element); + const GenericVaryingDescription description = output_attributes.at(offset); + const Id composite = description.id; + std::vector<u32> indices; + if (!description.is_scalar) { + indices.push_back(element - description.first_element); + } + return {ArrayPass(t_out_float, composite, indices), Type::Float}; } UNIMPLEMENTED_MSG("Unhandled output attribute: {}", static_cast<u32>(attribute)); @@ -1525,7 +1617,12 @@ private: ASSERT(!meta.sampler.IsBuffer()); const auto& entry = sampled_images.at(meta.sampler.GetIndex()); - return OpLoad(entry.sampled_image_type, entry.sampler); + Id sampler = entry.variable; + if (meta.sampler.IsIndexed()) { + const Id index = AsInt(Visit(meta.index)); + sampler = OpAccessChain(entry.sampler_pointer_type, sampler, index); + } + return OpLoad(entry.sampler_type, sampler); } Id GetTextureImage(Operation operation) { @@ -1783,7 +1880,7 @@ private: } Expression ImageLoad(Operation operation) { - if (!device.IsShaderStorageImageReadWithoutFormatSupported()) { + if (!device.IsFormatlessImageLoadSupported()) { return {v_float_zero, Type::Float}; } @@ -2211,16 +2308,14 @@ private: switch (specialization.attribute_types.at(location)) { case Maxwell::VertexAttribute::Type::SignedNorm: case Maxwell::VertexAttribute::Type::UnsignedNorm: + case Maxwell::VertexAttribute::Type::UnsignedScaled: + case Maxwell::VertexAttribute::Type::SignedScaled: case Maxwell::VertexAttribute::Type::Float: return {Type::Float, t_in_float, t_in_float4}; case Maxwell::VertexAttribute::Type::SignedInt: return {Type::Int, t_in_int, t_in_int4}; case Maxwell::VertexAttribute::Type::UnsignedInt: return {Type::Uint, t_in_uint, t_in_uint4}; - case Maxwell::VertexAttribute::Type::UnsignedScaled: - case 
Maxwell::VertexAttribute::Type::SignedScaled: - UNIMPLEMENTED(); - return {Type::Float, t_in_float, t_in_float4}; default: UNREACHABLE(); return {Type::Float, t_in_float, t_in_float4}; @@ -2250,11 +2345,11 @@ private: std::array<Id, 4> GetTypeVectorDefinitionLut(Type type) const { switch (type) { case Type::Float: - return {nullptr, t_float2, t_float3, t_float4}; + return {t_float, t_float2, t_float3, t_float4}; case Type::Int: - return {nullptr, t_int2, t_int3, t_int4}; + return {t_int, t_int2, t_int3, t_int4}; case Type::Uint: - return {nullptr, t_uint2, t_uint3, t_uint4}; + return {t_uint, t_uint2, t_uint3, t_uint4}; default: UNIMPLEMENTED(); return {}; @@ -2487,7 +2582,9 @@ private: const ShaderIR& ir; const ShaderType stage; const Tegra::Shader::Header header; + const Registry& registry; const Specialization& specialization; + std::unordered_map<u8, VaryingTFB> transform_feedback; const Id t_void = Name(TypeVoid(), "void"); @@ -2576,7 +2673,7 @@ private: Id shared_memory{}; std::array<Id, INTERNAL_FLAGS_COUNT> internal_flags{}; std::map<Attribute::Index, Id> input_attributes; - std::map<Attribute::Index, Id> output_attributes; + std::unordered_map<u8, GenericVaryingDescription> output_attributes; std::map<u32, Id> constant_buffers; std::map<GlobalMemoryBase, Id> global_buffers; std::map<u32, TexelBuffer> texel_buffers; @@ -2862,8 +2959,9 @@ ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir) { } std::vector<u32> Decompile(const VKDevice& device, const VideoCommon::Shader::ShaderIR& ir, - ShaderType stage, const Specialization& specialization) { - return SPIRVDecompiler(device, ir, stage, specialization).Assemble(); + ShaderType stage, const VideoCommon::Shader::Registry& registry, + const Specialization& specialization) { + return SPIRVDecompiler(device, ir, stage, registry, specialization).Assemble(); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.h 
b/src/video_core/renderer_vulkan/vk_shader_decompiler.h index f5dc14d9e..ffea4709e 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.h +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.h @@ -15,6 +15,7 @@ #include "common/common_types.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/engines/shader_type.h" +#include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" namespace Vulkan { @@ -91,17 +92,9 @@ struct Specialization final { u32 shared_memory_size{}; // Graphics specific - Maxwell::PrimitiveTopology primitive_topology{}; std::optional<float> point_size{}; std::array<Maxwell::VertexAttribute::Type, Maxwell::NumVertexAttributes> attribute_types{}; bool ndc_minus_one_to_one{}; - - // Tessellation specific - struct { - Maxwell::TessellationPrimitive primitive{}; - Maxwell::TessellationSpacing spacing{}; - bool clockwise{}; - } tessellation; }; // Old gcc versions don't consider this trivially copyable. // static_assert(std::is_trivially_copyable_v<Specialization>); @@ -114,6 +107,8 @@ struct SPIRVShader { ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir); std::vector<u32> Decompile(const VKDevice& device, const VideoCommon::Shader::ShaderIR& ir, - Tegra::Engines::ShaderType stage, const Specialization& specialization); + Tegra::Engines::ShaderType stage, + const VideoCommon::Shader::Registry& registry, + const Specialization& specialization); } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp index 171d78afc..374959f82 100644 --- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp +++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp @@ -73,7 +73,8 @@ VKBuffer* VKStagingBufferPool::TryGetReservedBuffer(std::size_t size, bool host_ VKBuffer& VKStagingBufferPool::CreateStagingBuffer(std::size_t size, bool host_visible) { const auto usage = 
vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst | - vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eIndexBuffer; + vk::BufferUsageFlagBits::eUniformBuffer | vk::BufferUsageFlagBits::eStorageBuffer | + vk::BufferUsageFlagBits::eIndexBuffer; const u32 log2 = Common::Log2Ceil64(size); const vk::BufferCreateInfo buffer_ci({}, 1ULL << log2, usage, vk::SharingMode::eExclusive, 0, nullptr); @@ -99,7 +100,6 @@ void VKStagingBufferPool::ReleaseCache(bool host_visible) { } u64 VKStagingBufferPool::ReleaseLevel(StagingBuffersCache& cache, std::size_t log2) { - static constexpr u64 epochs_to_destroy = 180; static constexpr std::size_t deletions_per_tick = 16; auto& staging = cache[log2]; @@ -107,6 +107,7 @@ u64 VKStagingBufferPool::ReleaseLevel(StagingBuffersCache& cache, std::size_t lo const std::size_t old_size = entries.size(); const auto is_deleteable = [this](const auto& entry) { + static constexpr u64 epochs_to_destroy = 180; return entry.last_epoch + epochs_to_destroy < epoch && !entry.watch.IsUsed(); }; const std::size_t begin_offset = staging.delete_index; diff --git a/src/video_core/renderer_vulkan/vk_state_tracker.cpp b/src/video_core/renderer_vulkan/vk_state_tracker.cpp new file mode 100644 index 000000000..94a89e388 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_state_tracker.cpp @@ -0,0 +1,99 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include <algorithm> +#include <cstddef> +#include <iterator> + +#include "common/common_types.h" +#include "core/core.h" +#include "video_core/dirty_flags.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/gpu.h" +#include "video_core/renderer_vulkan/vk_state_tracker.h" + +#define OFF(field_name) MAXWELL3D_REG_INDEX(field_name) +#define NUM(field_name) (sizeof(Maxwell3D::Regs::field_name) / sizeof(u32)) + +namespace Vulkan { + +namespace { + +using namespace Dirty; +using namespace VideoCommon::Dirty; +using Tegra::Engines::Maxwell3D; +using Regs = Maxwell3D::Regs; +using Tables = Maxwell3D::DirtyState::Tables; +using Table = Maxwell3D::DirtyState::Table; +using Flags = Maxwell3D::DirtyState::Flags; + +Flags MakeInvalidationFlags() { + Flags flags{}; + flags[Viewports] = true; + flags[Scissors] = true; + flags[DepthBias] = true; + flags[BlendConstants] = true; + flags[DepthBounds] = true; + flags[StencilProperties] = true; + return flags; +} + +void SetupDirtyViewports(Tables& tables) { + FillBlock(tables[0], OFF(viewport_transform), NUM(viewport_transform), Viewports); + FillBlock(tables[0], OFF(viewports), NUM(viewports), Viewports); + tables[0][OFF(viewport_transform_enabled)] = Viewports; +} + +void SetupDirtyScissors(Tables& tables) { + FillBlock(tables[0], OFF(scissor_test), NUM(scissor_test), Scissors); +} + +void SetupDirtyDepthBias(Tables& tables) { + auto& table = tables[0]; + table[OFF(polygon_offset_units)] = DepthBias; + table[OFF(polygon_offset_clamp)] = DepthBias; + table[OFF(polygon_offset_factor)] = DepthBias; +} + +void SetupDirtyBlendConstants(Tables& tables) { + FillBlock(tables[0], OFF(blend_color), NUM(blend_color), BlendConstants); +} + +void SetupDirtyDepthBounds(Tables& tables) { + FillBlock(tables[0], OFF(depth_bounds), NUM(depth_bounds), DepthBounds); +} + +void SetupDirtyStencilProperties(Tables& tables) { + auto& table = tables[0]; + table[OFF(stencil_two_side_enable)] = StencilProperties; + 
table[OFF(stencil_front_func_ref)] = StencilProperties; + table[OFF(stencil_front_mask)] = StencilProperties; + table[OFF(stencil_front_func_mask)] = StencilProperties; + table[OFF(stencil_back_func_ref)] = StencilProperties; + table[OFF(stencil_back_mask)] = StencilProperties; + table[OFF(stencil_back_func_mask)] = StencilProperties; +} + +} // Anonymous namespace + +StateTracker::StateTracker(Core::System& system) + : system{system}, invalidation_flags{MakeInvalidationFlags()} {} + +void StateTracker::Initialize() { + auto& dirty = system.GPU().Maxwell3D().dirty; + auto& tables = dirty.tables; + SetupDirtyRenderTargets(tables); + SetupDirtyViewports(tables); + SetupDirtyScissors(tables); + SetupDirtyDepthBias(tables); + SetupDirtyBlendConstants(tables); + SetupDirtyDepthBounds(tables); + SetupDirtyStencilProperties(tables); +} + +void StateTracker::InvalidateCommandBufferState() { + system.GPU().Maxwell3D().dirty.flags |= invalidation_flags; +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_state_tracker.h b/src/video_core/renderer_vulkan/vk_state_tracker.h new file mode 100644 index 000000000..03bc415b2 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_state_tracker.h @@ -0,0 +1,79 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include <cstddef> +#include <limits> + +#include "common/common_types.h" +#include "core/core.h" +#include "video_core/dirty_flags.h" +#include "video_core/engines/maxwell_3d.h" + +namespace Vulkan { + +namespace Dirty { + +enum : u8 { + First = VideoCommon::Dirty::LastCommonEntry, + + Viewports, + Scissors, + DepthBias, + BlendConstants, + DepthBounds, + StencilProperties, + + Last +}; +static_assert(Last <= std::numeric_limits<u8>::max()); + +} // namespace Dirty + +class StateTracker { +public: + explicit StateTracker(Core::System& system); + + void Initialize(); + + void InvalidateCommandBufferState(); + + bool TouchViewports() { + return Exchange(Dirty::Viewports, false); + } + + bool TouchScissors() { + return Exchange(Dirty::Scissors, false); + } + + bool TouchDepthBias() { + return Exchange(Dirty::DepthBias, false); + } + + bool TouchBlendConstants() { + return Exchange(Dirty::BlendConstants, false); + } + + bool TouchDepthBounds() { + return Exchange(Dirty::DepthBounds, false); + } + + bool TouchStencilProperties() { + return Exchange(Dirty::StencilProperties, false); + } + +private: + bool Exchange(std::size_t id, bool new_value) const noexcept { + auto& flags = system.GPU().Maxwell3D().dirty.flags; + const bool is_dirty = flags[id]; + flags[id] = new_value; + return is_dirty; + } + + Core::System& system; + Tegra::Engines::Maxwell3D::DirtyState::Flags invalidation_flags; +}; + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_swapchain.cpp b/src/video_core/renderer_vulkan/vk_swapchain.cpp index f47b691a8..9e73fa9cd 100644 --- a/src/video_core/renderer_vulkan/vk_swapchain.cpp +++ b/src/video_core/renderer_vulkan/vk_swapchain.cpp @@ -141,11 +141,6 @@ void VKSwapchain::CreateSwapchain(const vk::SurfaceCapabilitiesKHR& capabilities const vk::SurfaceFormatKHR surface_format{ChooseSwapSurfaceFormat(formats, srgb)}; const vk::PresentModeKHR present_mode{ChooseSwapPresentMode(present_modes)}; - extent = 
ChooseSwapExtent(capabilities, width, height); - - current_width = extent.width; - current_height = extent.height; - current_srgb = srgb; u32 requested_image_count{capabilities.minImageCount + 1}; if (capabilities.maxImageCount > 0 && requested_image_count > capabilities.maxImageCount) { @@ -153,10 +148,9 @@ void VKSwapchain::CreateSwapchain(const vk::SurfaceCapabilitiesKHR& capabilities } vk::SwapchainCreateInfoKHR swapchain_ci( - {}, surface, requested_image_count, surface_format.format, surface_format.colorSpace, - extent, 1, vk::ImageUsageFlagBits::eColorAttachment, {}, {}, {}, - capabilities.currentTransform, vk::CompositeAlphaFlagBitsKHR::eOpaque, present_mode, false, - {}); + {}, surface, requested_image_count, surface_format.format, surface_format.colorSpace, {}, 1, + vk::ImageUsageFlagBits::eColorAttachment, {}, {}, {}, capabilities.currentTransform, + vk::CompositeAlphaFlagBitsKHR::eOpaque, present_mode, false, {}); const u32 graphics_family{device.GetGraphicsFamily()}; const u32 present_family{device.GetPresentFamily()}; @@ -169,9 +163,18 @@ void VKSwapchain::CreateSwapchain(const vk::SurfaceCapabilitiesKHR& capabilities swapchain_ci.imageSharingMode = vk::SharingMode::eExclusive; } + // Request the size again to reduce the possibility of a TOCTOU race condition. + const auto updated_capabilities = physical_device.getSurfaceCapabilitiesKHR(surface, dld); + swapchain_ci.imageExtent = ChooseSwapExtent(updated_capabilities, width, height); + // Don't add code within this and the swapchain creation. 
const auto dev{device.GetLogical()}; swapchain = dev.createSwapchainKHRUnique(swapchain_ci, nullptr, dld); + extent = swapchain_ci.imageExtent; + current_width = extent.width; + current_height = extent.height; + current_srgb = srgb; + images = dev.getSwapchainImagesKHR(*swapchain, dld); image_count = static_cast<u32>(images.size()); image_format = surface_format.format; diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 51b0d38a6..26175921b 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -22,6 +22,7 @@ #include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_memory_manager.h" #include "video_core/renderer_vulkan/vk_rasterizer.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" #include "video_core/renderer_vulkan/vk_texture_cache.h" #include "video_core/surface.h" @@ -51,6 +52,9 @@ vk::ImageType SurfaceTargetToImage(SurfaceTarget target) { return vk::ImageType::e2D; case SurfaceTarget::Texture3D: return vk::ImageType::e3D; + case SurfaceTarget::TextureBuffer: + UNREACHABLE(); + return {}; } UNREACHABLE_MSG("Unknown texture target={}", static_cast<u32>(target)); return {}; @@ -272,7 +276,6 @@ void CachedSurface::UploadImage(const std::vector<u8>& staging_buffer) { for (u32 level = 0; level < params.num_levels; ++level) { vk::BufferImageCopy copy = GetBufferImageCopy(level); - const auto& dld = device.GetDispatchLoader(); if (image->GetAspectMask() == (vk::ImageAspectFlagBits::eDepth | vk::ImageAspectFlagBits::eStencil)) { vk::BufferImageCopy depth = copy; @@ -421,7 +424,6 @@ void VKTextureCache::ImageCopy(Surface& src_surface, Surface& dst_surface, dst_base_layer, num_layers, copy_params.dest_level, 1, vk::PipelineStageFlagBits::eTransfer, vk::AccessFlagBits::eTransferWrite, vk::ImageLayout::eTransferDstOptimal); - 
const auto& dld{device.GetDispatchLoader()}; const vk::ImageSubresourceLayers src_subresource( src_surface->GetAspectMask(), copy_params.source_level, copy_params.source_z, num_layers); const vk::ImageSubresourceLayers dst_subresource( @@ -457,7 +459,6 @@ void VKTextureCache::ImageBlit(View& src_view, View& dst_view, dst_view->GetImageSubresourceLayers(), {dst_top_left, dst_bot_right}); const bool is_linear = copy_config.filter == Tegra::Engines::Fermi2D::Filter::Linear; - const auto& dld{device.GetDispatchLoader()}; scheduler.Record([src_image = src_view->GetImage(), dst_image = dst_view->GetImage(), blit, is_linear](auto cmdbuf, auto& dld) { cmdbuf.blitImage(src_image, vk::ImageLayout::eTransferSrcOptimal, dst_image, diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index d3edbe80c..22e3d34de 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -151,6 +151,10 @@ public: return params.GetMipHeight(base_level); } + u32 GetNumLayers() const { + return num_layers; + } + bool IsBufferView() const { return buffer_view; } diff --git a/src/video_core/shader/const_buffer_locker.cpp b/src/video_core/shader/const_buffer_locker.cpp deleted file mode 100644 index 0638be8cb..000000000 --- a/src/video_core/shader/const_buffer_locker.cpp +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. 
- -#include <algorithm> -#include <tuple> - -#include "common/common_types.h" -#include "video_core/engines/maxwell_3d.h" -#include "video_core/engines/shader_type.h" -#include "video_core/shader/const_buffer_locker.h" - -namespace VideoCommon::Shader { - -using Tegra::Engines::SamplerDescriptor; - -ConstBufferLocker::ConstBufferLocker(Tegra::Engines::ShaderType shader_stage) - : stage{shader_stage} {} - -ConstBufferLocker::ConstBufferLocker(Tegra::Engines::ShaderType shader_stage, - Tegra::Engines::ConstBufferEngineInterface& engine) - : stage{shader_stage}, engine{&engine} {} - -ConstBufferLocker::~ConstBufferLocker() = default; - -std::optional<u32> ConstBufferLocker::ObtainKey(u32 buffer, u32 offset) { - const std::pair<u32, u32> key = {buffer, offset}; - const auto iter = keys.find(key); - if (iter != keys.end()) { - return iter->second; - } - if (!engine) { - return std::nullopt; - } - const u32 value = engine->AccessConstBuffer32(stage, buffer, offset); - keys.emplace(key, value); - return value; -} - -std::optional<SamplerDescriptor> ConstBufferLocker::ObtainBoundSampler(u32 offset) { - const u32 key = offset; - const auto iter = bound_samplers.find(key); - if (iter != bound_samplers.end()) { - return iter->second; - } - if (!engine) { - return std::nullopt; - } - const SamplerDescriptor value = engine->AccessBoundSampler(stage, offset); - bound_samplers.emplace(key, value); - return value; -} - -std::optional<Tegra::Engines::SamplerDescriptor> ConstBufferLocker::ObtainBindlessSampler( - u32 buffer, u32 offset) { - const std::pair key = {buffer, offset}; - const auto iter = bindless_samplers.find(key); - if (iter != bindless_samplers.end()) { - return iter->second; - } - if (!engine) { - return std::nullopt; - } - const SamplerDescriptor value = engine->AccessBindlessSampler(stage, buffer, offset); - bindless_samplers.emplace(key, value); - return value; -} - -std::optional<u32> ConstBufferLocker::ObtainBoundBuffer() { - if (bound_buffer_saved) { - return 
bound_buffer; - } - if (!engine) { - return std::nullopt; - } - bound_buffer_saved = true; - bound_buffer = engine->GetBoundBuffer(); - return bound_buffer; -} - -void ConstBufferLocker::InsertKey(u32 buffer, u32 offset, u32 value) { - keys.insert_or_assign({buffer, offset}, value); -} - -void ConstBufferLocker::InsertBoundSampler(u32 offset, SamplerDescriptor sampler) { - bound_samplers.insert_or_assign(offset, sampler); -} - -void ConstBufferLocker::InsertBindlessSampler(u32 buffer, u32 offset, SamplerDescriptor sampler) { - bindless_samplers.insert_or_assign({buffer, offset}, sampler); -} - -void ConstBufferLocker::SetBoundBuffer(u32 buffer) { - bound_buffer_saved = true; - bound_buffer = buffer; -} - -bool ConstBufferLocker::IsConsistent() const { - if (!engine) { - return false; - } - return std::all_of(keys.begin(), keys.end(), - [this](const auto& pair) { - const auto [cbuf, offset] = pair.first; - const auto value = pair.second; - return value == engine->AccessConstBuffer32(stage, cbuf, offset); - }) && - std::all_of(bound_samplers.begin(), bound_samplers.end(), - [this](const auto& sampler) { - const auto [key, value] = sampler; - return value == engine->AccessBoundSampler(stage, key); - }) && - std::all_of(bindless_samplers.begin(), bindless_samplers.end(), - [this](const auto& sampler) { - const auto [cbuf, offset] = sampler.first; - const auto value = sampler.second; - return value == engine->AccessBindlessSampler(stage, cbuf, offset); - }); -} - -bool ConstBufferLocker::HasEqualKeys(const ConstBufferLocker& rhs) const { - return std::tie(keys, bound_samplers, bindless_samplers) == - std::tie(rhs.keys, rhs.bound_samplers, rhs.bindless_samplers); -} - -} // namespace VideoCommon::Shader diff --git a/src/video_core/shader/const_buffer_locker.h b/src/video_core/shader/const_buffer_locker.h deleted file mode 100644 index d3ea11087..000000000 --- a/src/video_core/shader/const_buffer_locker.h +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright 2019 yuzu Emulator 
Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <optional> -#include <unordered_map> -#include "common/common_types.h" -#include "common/hash.h" -#include "video_core/engines/const_buffer_engine_interface.h" -#include "video_core/engines/shader_type.h" -#include "video_core/guest_driver.h" - -namespace VideoCommon::Shader { - -using KeyMap = std::unordered_map<std::pair<u32, u32>, u32, Common::PairHash>; -using BoundSamplerMap = std::unordered_map<u32, Tegra::Engines::SamplerDescriptor>; -using BindlessSamplerMap = - std::unordered_map<std::pair<u32, u32>, Tegra::Engines::SamplerDescriptor, Common::PairHash>; - -/** - * The ConstBufferLocker is a class use to interface the 3D and compute engines with the shader - * compiler. with it, the shader can obtain required data from GPU state and store it for disk - * shader compilation. - */ -class ConstBufferLocker { -public: - explicit ConstBufferLocker(Tegra::Engines::ShaderType shader_stage); - - explicit ConstBufferLocker(Tegra::Engines::ShaderType shader_stage, - Tegra::Engines::ConstBufferEngineInterface& engine); - - ~ConstBufferLocker(); - - /// Retrieves a key from the locker, if it's registered, it will give the registered value, if - /// not it will obtain it from maxwell3d and register it. - std::optional<u32> ObtainKey(u32 buffer, u32 offset); - - std::optional<Tegra::Engines::SamplerDescriptor> ObtainBoundSampler(u32 offset); - - std::optional<Tegra::Engines::SamplerDescriptor> ObtainBindlessSampler(u32 buffer, u32 offset); - - std::optional<u32> ObtainBoundBuffer(); - - /// Inserts a key. - void InsertKey(u32 buffer, u32 offset, u32 value); - - /// Inserts a bound sampler key. - void InsertBoundSampler(u32 offset, Tegra::Engines::SamplerDescriptor sampler); - - /// Inserts a bindless sampler key. 
- void InsertBindlessSampler(u32 buffer, u32 offset, Tegra::Engines::SamplerDescriptor sampler); - - /// Set the bound buffer for this locker. - void SetBoundBuffer(u32 buffer); - - /// Checks keys and samplers against engine's current const buffers. Returns true if they are - /// the same value, false otherwise; - bool IsConsistent() const; - - /// Returns true if the keys are equal to the other ones in the locker. - bool HasEqualKeys(const ConstBufferLocker& rhs) const; - - /// Gives an getter to the const buffer keys in the database. - const KeyMap& GetKeys() const { - return keys; - } - - /// Gets samplers database. - const BoundSamplerMap& GetBoundSamplers() const { - return bound_samplers; - } - - /// Gets bindless samplers database. - const BindlessSamplerMap& GetBindlessSamplers() const { - return bindless_samplers; - } - - /// Gets bound buffer used on this shader - u32 GetBoundBuffer() const { - return bound_buffer; - } - - /// Obtains access to the guest driver's profile. - VideoCore::GuestDriverProfile* AccessGuestDriverProfile() const { - if (engine) { - return &engine->AccessGuestDriverProfile(); - } - return nullptr; - } - -private: - const Tegra::Engines::ShaderType stage; - Tegra::Engines::ConstBufferEngineInterface* engine = nullptr; - KeyMap keys; - BoundSamplerMap bound_samplers; - BindlessSamplerMap bindless_samplers; - bool bound_buffer_saved{}; - u32 bound_buffer{}; -}; - -} // namespace VideoCommon::Shader diff --git a/src/video_core/shader/control_flow.cpp b/src/video_core/shader/control_flow.cpp index 0229733b6..2e2711350 100644 --- a/src/video_core/shader/control_flow.cpp +++ b/src/video_core/shader/control_flow.cpp @@ -13,6 +13,7 @@ #include "common/common_types.h" #include "video_core/shader/ast.h" #include "video_core/shader/control_flow.h" +#include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" namespace VideoCommon::Shader { @@ -64,11 +65,11 @@ struct BlockInfo { }; struct CFGRebuildState { - explicit 
CFGRebuildState(const ProgramCode& program_code, u32 start, ConstBufferLocker& locker) - : program_code{program_code}, locker{locker}, start{start} {} + explicit CFGRebuildState(const ProgramCode& program_code, u32 start, Registry& registry) + : program_code{program_code}, registry{registry}, start{start} {} const ProgramCode& program_code; - ConstBufferLocker& locker; + Registry& registry; u32 start{}; std::vector<BlockInfo> block_info; std::list<u32> inspect_queries; @@ -438,7 +439,7 @@ std::pair<ParseResult, ParseInfo> ParseCode(CFGRebuildState& state, u32 address) const s32 pc_target = offset + result.relative_position; std::vector<CaseBranch> branches; for (u32 i = 0; i < result.entries; i++) { - auto key = state.locker.ObtainKey(result.buffer, result.offset + i * 4); + auto key = state.registry.ObtainKey(result.buffer, result.offset + i * 4); if (!key) { return {ParseResult::AbnormalFlow, parse_info}; } @@ -656,14 +657,14 @@ void DecompileShader(CFGRebuildState& state) { std::unique_ptr<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u32 start_address, const CompilerSettings& settings, - ConstBufferLocker& locker) { + Registry& registry) { auto result_out = std::make_unique<ShaderCharacteristics>(); if (settings.depth == CompileDepth::BruteForce) { result_out->settings.depth = CompileDepth::BruteForce; return result_out; } - CFGRebuildState state{program_code, start_address, locker}; + CFGRebuildState state{program_code, start_address, registry}; // Inspect Code and generate blocks state.labels.clear(); state.labels.emplace(start_address); diff --git a/src/video_core/shader/control_flow.h b/src/video_core/shader/control_flow.h index 5304998b9..62a3510d8 100644 --- a/src/video_core/shader/control_flow.h +++ b/src/video_core/shader/control_flow.h @@ -12,6 +12,7 @@ #include "video_core/engines/shader_bytecode.h" #include "video_core/shader/ast.h" #include "video_core/shader/compiler_settings.h" +#include "video_core/shader/registry.h" #include 
"video_core/shader/shader_ir.h" namespace VideoCommon::Shader { @@ -111,6 +112,6 @@ struct ShaderCharacteristics { std::unique_ptr<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u32 start_address, const CompilerSettings& settings, - ConstBufferLocker& locker); + Registry& registry); } // namespace VideoCommon::Shader diff --git a/src/video_core/shader/decode.cpp b/src/video_core/shader/decode.cpp index 6b697ed5d..87ac9ac6c 100644 --- a/src/video_core/shader/decode.cpp +++ b/src/video_core/shader/decode.cpp @@ -34,13 +34,9 @@ constexpr bool IsSchedInstruction(u32 offset, u32 main_offset) { return (absolute_offset % SchedPeriod) == 0; } -void DeduceTextureHandlerSize(VideoCore::GuestDriverProfile* gpu_driver, +void DeduceTextureHandlerSize(VideoCore::GuestDriverProfile& gpu_driver, const std::list<Sampler>& used_samplers) { - if (gpu_driver == nullptr) { - LOG_CRITICAL(HW_GPU, "GPU driver profile has not been created yet"); - return; - } - if (gpu_driver->TextureHandlerSizeKnown() || used_samplers.size() <= 1) { + if (gpu_driver.IsTextureHandlerSizeKnown() || used_samplers.size() <= 1) { return; } u32 count{}; @@ -53,17 +49,13 @@ void DeduceTextureHandlerSize(VideoCore::GuestDriverProfile* gpu_driver, bound_offsets.emplace_back(sampler.GetOffset()); } if (count > 1) { - gpu_driver->DeduceTextureHandlerSize(std::move(bound_offsets)); + gpu_driver.DeduceTextureHandlerSize(std::move(bound_offsets)); } } std::optional<u32> TryDeduceSamplerSize(const Sampler& sampler_to_deduce, - VideoCore::GuestDriverProfile* gpu_driver, + VideoCore::GuestDriverProfile& gpu_driver, const std::list<Sampler>& used_samplers) { - if (gpu_driver == nullptr) { - LOG_CRITICAL(HW_GPU, "GPU Driver profile has not been created yet"); - return std::nullopt; - } const u32 base_offset = sampler_to_deduce.GetOffset(); u32 max_offset{std::numeric_limits<u32>::max()}; for (const auto& sampler : used_samplers) { @@ -77,7 +69,7 @@ std::optional<u32> TryDeduceSamplerSize(const Sampler& 
sampler_to_deduce, if (max_offset == std::numeric_limits<u32>::max()) { return std::nullopt; } - return ((max_offset - base_offset) * 4) / gpu_driver->GetTextureHandlerSize(); + return ((max_offset - base_offset) * 4) / gpu_driver.GetTextureHandlerSize(); } } // Anonymous namespace @@ -149,7 +141,7 @@ void ShaderIR::Decode() { std::memcpy(&header, program_code.data(), sizeof(Tegra::Shader::Header)); decompiled = false; - auto info = ScanFlow(program_code, main_offset, settings, locker); + auto info = ScanFlow(program_code, main_offset, settings, registry); auto& shader_info = *info; coverage_begin = shader_info.start; coverage_end = shader_info.end; @@ -364,7 +356,7 @@ u32 ShaderIR::DecodeInstr(NodeBlock& bb, u32 pc) { void ShaderIR::PostDecode() { // Deduce texture handler size if needed - auto gpu_driver = locker.AccessGuestDriverProfile(); + auto gpu_driver = registry.AccessGuestDriverProfile(); DeduceTextureHandlerSize(gpu_driver, used_samplers); // Deduce Indexed Samplers if (!uses_indexed_samplers) { diff --git a/src/video_core/shader/decode/arithmetic.cpp b/src/video_core/shader/decode/arithmetic.cpp index 90240c765..478394682 100644 --- a/src/video_core/shader/decode/arithmetic.cpp +++ b/src/video_core/shader/decode/arithmetic.cpp @@ -53,29 +53,24 @@ u32 ShaderIR::DecodeArithmetic(NodeBlock& bb, u32 pc) { op_b = GetOperandAbsNegFloat(op_b, false, instr.fmul.negate_b); - // TODO(Rodrigo): Should precise be used when there's a postfactor? - Node value = Operation(OperationCode::FMul, PRECISE, op_a, op_b); + static constexpr std::array FmulPostFactor = { + 1.000f, // None + 0.500f, // Divide 2 + 0.250f, // Divide 4 + 0.125f, // Divide 8 + 8.000f, // Mul 8 + 4.000f, // Mul 4 + 2.000f, // Mul 2 + }; if (instr.fmul.postfactor != 0) { - auto postfactor = static_cast<s32>(instr.fmul.postfactor); - - // Postfactor encoded as 3-bit 1's complement in instruction, interpreted with below - // logic. 
- if (postfactor >= 4) { - postfactor = 7 - postfactor; - } else { - postfactor = 0 - postfactor; - } - - if (postfactor > 0) { - value = Operation(OperationCode::FMul, NO_PRECISE, value, - Immediate(static_cast<f32>(1 << postfactor))); - } else { - value = Operation(OperationCode::FDiv, NO_PRECISE, value, - Immediate(static_cast<f32>(1 << -postfactor))); - } + op_a = Operation(OperationCode::FMul, NO_PRECISE, op_a, + Immediate(FmulPostFactor[instr.fmul.postfactor])); } + // TODO(Rodrigo): Should precise be used when there's a postfactor? + Node value = Operation(OperationCode::FMul, PRECISE, op_a, op_b); + value = GetSaturatedFloat(value, instr.alu.saturate_d); SetInternalFlagsFromFloat(bb, value, instr.generates_cc); diff --git a/src/video_core/shader/decode/arithmetic_integer.cpp b/src/video_core/shader/decode/arithmetic_integer.cpp index 21366869d..2fe787d6f 100644 --- a/src/video_core/shader/decode/arithmetic_integer.cpp +++ b/src/video_core/shader/decode/arithmetic_integer.cpp @@ -293,44 +293,66 @@ u32 ShaderIR::DecodeArithmeticInteger(NodeBlock& bb, u32 pc) { void ShaderIR::WriteLop3Instruction(NodeBlock& bb, Register dest, Node op_a, Node op_b, Node op_c, Node imm_lut, bool sets_cc) { - constexpr u32 lop_iterations = 32; - const Node one = Immediate(1); - const Node two = Immediate(2); - - Node value; - for (u32 i = 0; i < lop_iterations; ++i) { - const Node shift_amount = Immediate(i); - - const Node a = Operation(OperationCode::ILogicalShiftRight, NO_PRECISE, op_c, shift_amount); - const Node pack_0 = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, a, one); - - const Node b = Operation(OperationCode::ILogicalShiftRight, NO_PRECISE, op_b, shift_amount); - const Node c = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, b, one); - const Node pack_1 = Operation(OperationCode::ILogicalShiftLeft, NO_PRECISE, c, one); - - const Node d = Operation(OperationCode::ILogicalShiftRight, NO_PRECISE, op_a, shift_amount); - const Node e = 
Operation(OperationCode::IBitwiseAnd, NO_PRECISE, d, one); - const Node pack_2 = Operation(OperationCode::ILogicalShiftLeft, NO_PRECISE, e, two); - - const Node pack_01 = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, pack_0, pack_1); - const Node pack_012 = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, pack_01, pack_2); - - const Node shifted_bit = - Operation(OperationCode::ILogicalShiftRight, NO_PRECISE, imm_lut, pack_012); - const Node bit = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, shifted_bit, one); - - const Node right = - Operation(OperationCode::ILogicalShiftLeft, NO_PRECISE, bit, shift_amount); - - if (i > 0) { - value = Operation(OperationCode::IBitwiseOr, NO_PRECISE, value, right); - } else { - value = right; + const Node lop3_fast = [&](const Node na, const Node nb, const Node nc, const Node ttbl) { + Node value = Immediate(0); + const ImmediateNode imm = std::get<ImmediateNode>(*ttbl); + if (imm.GetValue() & 0x01) { + const Node a = Operation(OperationCode::IBitwiseNot, na); + const Node b = Operation(OperationCode::IBitwiseNot, nb); + const Node c = Operation(OperationCode::IBitwiseNot, nc); + Node r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, a, b); + r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, r, c); + value = Operation(OperationCode::IBitwiseOr, value, r); } - } + if (imm.GetValue() & 0x02) { + const Node a = Operation(OperationCode::IBitwiseNot, na); + const Node b = Operation(OperationCode::IBitwiseNot, nb); + Node r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, a, b); + r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, r, nc); + value = Operation(OperationCode::IBitwiseOr, value, r); + } + if (imm.GetValue() & 0x04) { + const Node a = Operation(OperationCode::IBitwiseNot, na); + const Node c = Operation(OperationCode::IBitwiseNot, nc); + Node r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, a, nb); + r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, r, c); + value = 
Operation(OperationCode::IBitwiseOr, value, r); + } + if (imm.GetValue() & 0x08) { + const Node a = Operation(OperationCode::IBitwiseNot, na); + Node r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, a, nb); + r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, r, nc); + value = Operation(OperationCode::IBitwiseOr, value, r); + } + if (imm.GetValue() & 0x10) { + const Node b = Operation(OperationCode::IBitwiseNot, nb); + const Node c = Operation(OperationCode::IBitwiseNot, nc); + Node r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, na, b); + r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, r, c); + value = Operation(OperationCode::IBitwiseOr, value, r); + } + if (imm.GetValue() & 0x20) { + const Node b = Operation(OperationCode::IBitwiseNot, nb); + Node r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, na, b); + r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, r, nc); + value = Operation(OperationCode::IBitwiseOr, value, r); + } + if (imm.GetValue() & 0x40) { + const Node c = Operation(OperationCode::IBitwiseNot, nc); + Node r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, na, nb); + r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, r, c); + value = Operation(OperationCode::IBitwiseOr, value, r); + } + if (imm.GetValue() & 0x80) { + Node r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, na, nb); + r = Operation(OperationCode::IBitwiseAnd, NO_PRECISE, r, nc); + value = Operation(OperationCode::IBitwiseOr, value, r); + } + return value; + }(op_a, op_b, op_c, imm_lut); - SetInternalFlagsFromInteger(bb, value, sets_cc); - SetRegister(bb, dest, value); + SetInternalFlagsFromInteger(bb, lop3_fast, sets_cc); + SetRegister(bb, dest, lop3_fast); } } // namespace VideoCommon::Shader diff --git a/src/video_core/shader/decode/bfe.cpp b/src/video_core/shader/decode/bfe.cpp index e02bcd097..8e3b46e8e 100644 --- a/src/video_core/shader/decode/bfe.cpp +++ b/src/video_core/shader/decode/bfe.cpp @@ -17,33 +17,60 @@ u32 
ShaderIR::DecodeBfe(NodeBlock& bb, u32 pc) { const Instruction instr = {program_code[pc]}; const auto opcode = OpCode::Decode(instr); - UNIMPLEMENTED_IF(instr.bfe.negate_b); - Node op_a = GetRegister(instr.gpr8); - op_a = GetOperandAbsNegInteger(op_a, false, instr.bfe.negate_a, false); - - switch (opcode->get().GetId()) { - case OpCode::Id::BFE_IMM: { - UNIMPLEMENTED_IF_MSG(instr.generates_cc, - "Condition codes generation in BFE is not implemented"); + Node op_b = [&] { + switch (opcode->get().GetId()) { + case OpCode::Id::BFE_R: + return GetRegister(instr.gpr20); + case OpCode::Id::BFE_C: + return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()); + case OpCode::Id::BFE_IMM: + return Immediate(instr.alu.GetSignedImm20_20()); + default: + UNREACHABLE(); + return Immediate(0); + } + }(); - const Node inner_shift_imm = Immediate(static_cast<u32>(instr.bfe.GetLeftShiftValue())); - const Node outer_shift_imm = - Immediate(static_cast<u32>(instr.bfe.GetLeftShiftValue() + instr.bfe.shift_position)); + UNIMPLEMENTED_IF_MSG(instr.bfe.rd_cc, "Condition codes in BFE is not implemented"); - const Node inner_shift = - Operation(OperationCode::ILogicalShiftLeft, NO_PRECISE, op_a, inner_shift_imm); - const Node outer_shift = - Operation(OperationCode::ILogicalShiftRight, NO_PRECISE, inner_shift, outer_shift_imm); + const bool is_signed = instr.bfe.is_signed; - SetInternalFlagsFromInteger(bb, outer_shift, instr.generates_cc); - SetRegister(bb, instr.gpr0, outer_shift); - break; - } - default: - UNIMPLEMENTED_MSG("Unhandled BFE instruction: {}", opcode->get().GetName()); + // using reverse parallel method in + // https://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel + // note for later if possible to implement faster method. 
+ if (instr.bfe.brev) { + const auto swap = [&](u32 s, u32 mask) { + Node v1 = + SignedOperation(OperationCode::ILogicalShiftRight, is_signed, op_a, Immediate(s)); + if (mask != 0) { + v1 = SignedOperation(OperationCode::IBitwiseAnd, is_signed, std::move(v1), + Immediate(mask)); + } + Node v2 = op_a; + if (mask != 0) { + v2 = SignedOperation(OperationCode::IBitwiseAnd, is_signed, std::move(v2), + Immediate(mask)); + } + v2 = SignedOperation(OperationCode::ILogicalShiftLeft, is_signed, std::move(v2), + Immediate(s)); + return SignedOperation(OperationCode::IBitwiseOr, is_signed, std::move(v1), + std::move(v2)); + }; + op_a = swap(1, 0x55555555U); + op_a = swap(2, 0x33333333U); + op_a = swap(4, 0x0F0F0F0FU); + op_a = swap(8, 0x00FF00FFU); + op_a = swap(16, 0); } + const auto offset = SignedOperation(OperationCode::IBitfieldExtract, is_signed, op_b, + Immediate(0), Immediate(8)); + const auto bits = SignedOperation(OperationCode::IBitfieldExtract, is_signed, op_b, + Immediate(8), Immediate(8)); + auto result = SignedOperation(OperationCode::IBitfieldExtract, is_signed, op_a, offset, bits); + SetRegister(bb, instr.gpr0, std::move(result)); + return pc; } diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp index bee7d8cad..48350e042 100644 --- a/src/video_core/shader/decode/texture.cpp +++ b/src/video_core/shader/decode/texture.cpp @@ -12,6 +12,7 @@ #include "common/logging/log.h" #include "video_core/engines/shader_bytecode.h" #include "video_core/shader/node_helper.h" +#include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" namespace VideoCommon::Shader { @@ -359,8 +360,8 @@ ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo(std::optional<SamplerInfo> sample if (sampler_info) { return *sampler_info; } - const auto sampler = - buffer ? locker.ObtainBindlessSampler(*buffer, offset) : locker.ObtainBoundSampler(offset); + const auto sampler = buffer ? 
registry.ObtainBindlessSampler(*buffer, offset) + : registry.ObtainBoundSampler(offset); if (!sampler) { LOG_WARNING(HW_GPU, "Unknown sampler info"); return SamplerInfo{TextureType::Texture2D, false, false, false}; diff --git a/src/video_core/shader/decode/xmad.cpp b/src/video_core/shader/decode/xmad.cpp index 206961909..6191ffba1 100644 --- a/src/video_core/shader/decode/xmad.cpp +++ b/src/video_core/shader/decode/xmad.cpp @@ -12,6 +12,7 @@ namespace VideoCommon::Shader { using Tegra::Shader::Instruction; using Tegra::Shader::OpCode; +using Tegra::Shader::PredCondition; u32 ShaderIR::DecodeXmad(NodeBlock& bb, u32 pc) { const Instruction instr = {program_code[pc]}; @@ -30,7 +31,7 @@ u32 ShaderIR::DecodeXmad(NodeBlock& bb, u32 pc) { const bool is_signed_b = instr.xmad.sign_b == 1; const bool is_signed_c = is_signed_a; - auto [is_merge, is_psl, is_high_b, mode, op_b, + auto [is_merge, is_psl, is_high_b, mode, op_b_binding, op_c] = [&]() -> std::tuple<bool, bool, bool, Tegra::Shader::XmadMode, Node, Node> { switch (opcode->get().GetId()) { case OpCode::Id::XMAD_CR: @@ -63,15 +64,19 @@ u32 ShaderIR::DecodeXmad(NodeBlock& bb, u32 pc) { } }(); - op_a = BitfieldExtract(op_a, instr.xmad.high_a ? 16 : 0, 16); + op_a = SignedOperation(OperationCode::IBitfieldExtract, is_signed_a, std::move(op_a), + instr.xmad.high_a ? Immediate(16) : Immediate(0), Immediate(16)); - const Node original_b = op_b; - op_b = BitfieldExtract(op_b, is_high_b ? 16 : 0, 16); + const Node original_b = op_b_binding; + const Node op_b = + SignedOperation(OperationCode::IBitfieldExtract, is_signed_b, std::move(op_b_binding), + is_high_b ? Immediate(16) : Immediate(0), Immediate(16)); - // TODO(Rodrigo): Use an appropiate sign for this operation - Node product = Operation(OperationCode::IMul, NO_PRECISE, op_a, op_b); + // we already check sign_a and sign_b is difference or not before so just use one in here. 
+ Node product = SignedOperation(OperationCode::IMul, is_signed_a, op_a, op_b); if (is_psl) { - product = Operation(OperationCode::ILogicalShiftLeft, NO_PRECISE, product, Immediate(16)); + product = + SignedOperation(OperationCode::ILogicalShiftLeft, is_signed_a, product, Immediate(16)); } SetTemporary(bb, 0, product); product = GetTemporary(0); @@ -88,12 +93,40 @@ u32 ShaderIR::DecodeXmad(NodeBlock& bb, u32 pc) { return BitfieldExtract(original_c, 16, 16); case Tegra::Shader::XmadMode::CBcc: { const Node shifted_b = SignedOperation(OperationCode::ILogicalShiftLeft, is_signed_b, - NO_PRECISE, original_b, Immediate(16)); - return SignedOperation(OperationCode::IAdd, is_signed_c, NO_PRECISE, original_c, - shifted_b); + original_b, Immediate(16)); + return SignedOperation(OperationCode::IAdd, is_signed_c, original_c, shifted_b); + } + case Tegra::Shader::XmadMode::CSfu: { + const Node comp_a = GetPredicateComparisonInteger(PredCondition::Equal, is_signed_a, + op_a, Immediate(0)); + const Node comp_b = GetPredicateComparisonInteger(PredCondition::Equal, is_signed_b, + op_b, Immediate(0)); + const Node comp = Operation(OperationCode::LogicalOr, comp_a, comp_b); + + const Node comp_minus_a = GetPredicateComparisonInteger( + PredCondition::NotEqual, is_signed_a, + SignedOperation(OperationCode::IBitwiseAnd, is_signed_a, op_a, + Immediate(0x80000000)), + Immediate(0)); + const Node comp_minus_b = GetPredicateComparisonInteger( + PredCondition::NotEqual, is_signed_b, + SignedOperation(OperationCode::IBitwiseAnd, is_signed_b, op_b, + Immediate(0x80000000)), + Immediate(0)); + + Node new_c = Operation( + OperationCode::Select, comp_minus_a, + SignedOperation(OperationCode::IAdd, is_signed_c, original_c, Immediate(-65536)), + original_c); + new_c = Operation( + OperationCode::Select, comp_minus_b, + SignedOperation(OperationCode::IAdd, is_signed_c, new_c, Immediate(-65536)), + std::move(new_c)); + + return Operation(OperationCode::Select, comp, original_c, std::move(new_c)); } 
default: - UNIMPLEMENTED_MSG("Unhandled XMAD mode: {}", static_cast<u32>(instr.xmad.mode.Value())); + UNREACHABLE(); return Immediate(0); } }(); @@ -102,18 +135,19 @@ u32 ShaderIR::DecodeXmad(NodeBlock& bb, u32 pc) { op_c = GetTemporary(1); // TODO(Rodrigo): Use an appropiate sign for this operation - Node sum = Operation(OperationCode::IAdd, product, op_c); + Node sum = SignedOperation(OperationCode::IAdd, is_signed_a, product, std::move(op_c)); SetTemporary(bb, 2, sum); sum = GetTemporary(2); if (is_merge) { - const Node a = BitfieldExtract(sum, 0, 16); - const Node b = - Operation(OperationCode::ILogicalShiftLeft, NO_PRECISE, original_b, Immediate(16)); - sum = Operation(OperationCode::IBitwiseOr, NO_PRECISE, a, b); + const Node a = SignedOperation(OperationCode::IBitfieldExtract, is_signed_a, std::move(sum), + Immediate(0), Immediate(16)); + const Node b = SignedOperation(OperationCode::ILogicalShiftLeft, is_signed_b, original_b, + Immediate(16)); + sum = SignedOperation(OperationCode::IBitwiseOr, is_signed_a, a, b); } SetInternalFlagsFromInteger(bb, sum, instr.generates_cc); - SetRegister(bb, instr.gpr0, sum); + SetRegister(bb, instr.gpr0, std::move(sum)); return pc; } diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h index a0a7b9111..a1828546e 100644 --- a/src/video_core/shader/node.h +++ b/src/video_core/shader/node.h @@ -299,7 +299,7 @@ private: u32 index{}; ///< Emulated index given for the this sampler. u32 offset{}; ///< Offset in the const buffer from where the sampler is being read. u32 buffer{}; ///< Buffer where the bindless sampler is being read (unused on bound samplers). - u32 size{}; ///< Size of the sampler if indexed. + u32 size{1}; ///< Size of the sampler. Tegra::Shader::TextureType type{}; ///< The type used to sample this texture (Texture2D, etc) bool is_array{}; ///< Whether the texture is being sampled as an array texture or not. 
diff --git a/src/video_core/shader/node_helper.cpp b/src/video_core/shader/node_helper.cpp index b3dcd291c..76c56abb5 100644 --- a/src/video_core/shader/node_helper.cpp +++ b/src/video_core/shader/node_helper.cpp @@ -68,6 +68,8 @@ OperationCode SignedToUnsignedCode(OperationCode operation_code, bool is_signed) return OperationCode::UBitwiseXor; case OperationCode::IBitwiseNot: return OperationCode::UBitwiseNot; + case OperationCode::IBitfieldExtract: + return OperationCode::UBitfieldExtract; case OperationCode::IBitfieldInsert: return OperationCode::UBitfieldInsert; case OperationCode::IBitCount: diff --git a/src/video_core/shader/registry.cpp b/src/video_core/shader/registry.cpp new file mode 100644 index 000000000..af70b3f35 --- /dev/null +++ b/src/video_core/shader/registry.cpp @@ -0,0 +1,161 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <algorithm> +#include <tuple> + +#include "common/assert.h" +#include "common/common_types.h" +#include "video_core/engines/kepler_compute.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/engines/shader_type.h" +#include "video_core/shader/registry.h" + +namespace VideoCommon::Shader { + +using Tegra::Engines::ConstBufferEngineInterface; +using Tegra::Engines::SamplerDescriptor; +using Tegra::Engines::ShaderType; + +namespace { + +GraphicsInfo MakeGraphicsInfo(ShaderType shader_stage, ConstBufferEngineInterface& engine) { + if (shader_stage == ShaderType::Compute) { + return {}; + } + auto& graphics = static_cast<Tegra::Engines::Maxwell3D&>(engine); + + GraphicsInfo info; + info.tfb_layouts = graphics.regs.tfb_layouts; + info.tfb_varying_locs = graphics.regs.tfb_varying_locs; + info.primitive_topology = graphics.regs.draw.topology; + info.tessellation_primitive = graphics.regs.tess_mode.prim; + info.tessellation_spacing = graphics.regs.tess_mode.spacing; + info.tfb_enabled = graphics.regs.tfb_enabled; + 
info.tessellation_clockwise = graphics.regs.tess_mode.cw; + return info; +} + +ComputeInfo MakeComputeInfo(ShaderType shader_stage, ConstBufferEngineInterface& engine) { + if (shader_stage != ShaderType::Compute) { + return {}; + } + auto& compute = static_cast<Tegra::Engines::KeplerCompute&>(engine); + const auto& launch = compute.launch_description; + + ComputeInfo info; + info.workgroup_size = {launch.block_dim_x, launch.block_dim_y, launch.block_dim_z}; + info.local_memory_size_in_words = launch.local_pos_alloc; + info.shared_memory_size_in_words = launch.shared_alloc; + return info; +} + +} // Anonymous namespace + +Registry::Registry(Tegra::Engines::ShaderType shader_stage, const SerializedRegistryInfo& info) + : stage{shader_stage}, stored_guest_driver_profile{info.guest_driver_profile}, + bound_buffer{info.bound_buffer}, graphics_info{info.graphics}, compute_info{info.compute} {} + +Registry::Registry(Tegra::Engines::ShaderType shader_stage, + Tegra::Engines::ConstBufferEngineInterface& engine) + : stage{shader_stage}, engine{&engine}, bound_buffer{engine.GetBoundBuffer()}, + graphics_info{MakeGraphicsInfo(shader_stage, engine)}, compute_info{MakeComputeInfo( + shader_stage, engine)} {} + +Registry::~Registry() = default; + +std::optional<u32> Registry::ObtainKey(u32 buffer, u32 offset) { + const std::pair<u32, u32> key = {buffer, offset}; + const auto iter = keys.find(key); + if (iter != keys.end()) { + return iter->second; + } + if (!engine) { + return std::nullopt; + } + const u32 value = engine->AccessConstBuffer32(stage, buffer, offset); + keys.emplace(key, value); + return value; +} + +std::optional<SamplerDescriptor> Registry::ObtainBoundSampler(u32 offset) { + const u32 key = offset; + const auto iter = bound_samplers.find(key); + if (iter != bound_samplers.end()) { + return iter->second; + } + if (!engine) { + return std::nullopt; + } + const SamplerDescriptor value = engine->AccessBoundSampler(stage, offset); + bound_samplers.emplace(key, value); 
+ return value; +} + +std::optional<Tegra::Engines::SamplerDescriptor> Registry::ObtainBindlessSampler(u32 buffer, + u32 offset) { + const std::pair key = {buffer, offset}; + const auto iter = bindless_samplers.find(key); + if (iter != bindless_samplers.end()) { + return iter->second; + } + if (!engine) { + return std::nullopt; + } + const SamplerDescriptor value = engine->AccessBindlessSampler(stage, buffer, offset); + bindless_samplers.emplace(key, value); + return value; +} + +void Registry::InsertKey(u32 buffer, u32 offset, u32 value) { + keys.insert_or_assign({buffer, offset}, value); +} + +void Registry::InsertBoundSampler(u32 offset, SamplerDescriptor sampler) { + bound_samplers.insert_or_assign(offset, sampler); +} + +void Registry::InsertBindlessSampler(u32 buffer, u32 offset, SamplerDescriptor sampler) { + bindless_samplers.insert_or_assign({buffer, offset}, sampler); +} + +bool Registry::IsConsistent() const { + if (!engine) { + return true; + } + return std::all_of(keys.begin(), keys.end(), + [this](const auto& pair) { + const auto [cbuf, offset] = pair.first; + const auto value = pair.second; + return value == engine->AccessConstBuffer32(stage, cbuf, offset); + }) && + std::all_of(bound_samplers.begin(), bound_samplers.end(), + [this](const auto& sampler) { + const auto [key, value] = sampler; + return value == engine->AccessBoundSampler(stage, key); + }) && + std::all_of(bindless_samplers.begin(), bindless_samplers.end(), + [this](const auto& sampler) { + const auto [cbuf, offset] = sampler.first; + const auto value = sampler.second; + return value == engine->AccessBindlessSampler(stage, cbuf, offset); + }); +} + +bool Registry::HasEqualKeys(const Registry& rhs) const { + return std::tie(keys, bound_samplers, bindless_samplers) == + std::tie(rhs.keys, rhs.bound_samplers, rhs.bindless_samplers); +} + +const GraphicsInfo& Registry::GetGraphicsInfo() const { + ASSERT(stage != Tegra::Engines::ShaderType::Compute); + return graphics_info; +} + +const 
ComputeInfo& Registry::GetComputeInfo() const { + ASSERT(stage == Tegra::Engines::ShaderType::Compute); + return compute_info; +} + +} // namespace VideoCommon::Shader diff --git a/src/video_core/shader/registry.h b/src/video_core/shader/registry.h new file mode 100644 index 000000000..0c80d35fd --- /dev/null +++ b/src/video_core/shader/registry.h @@ -0,0 +1,137 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <array> +#include <optional> +#include <type_traits> +#include <unordered_map> +#include <utility> + +#include "common/common_types.h" +#include "common/hash.h" +#include "video_core/engines/const_buffer_engine_interface.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/engines/shader_type.h" +#include "video_core/guest_driver.h" + +namespace VideoCommon::Shader { + +using KeyMap = std::unordered_map<std::pair<u32, u32>, u32, Common::PairHash>; +using BoundSamplerMap = std::unordered_map<u32, Tegra::Engines::SamplerDescriptor>; +using BindlessSamplerMap = + std::unordered_map<std::pair<u32, u32>, Tegra::Engines::SamplerDescriptor, Common::PairHash>; + +struct GraphicsInfo { + using Maxwell = Tegra::Engines::Maxwell3D::Regs; + + std::array<Maxwell::TransformFeedbackLayout, Maxwell::NumTransformFeedbackBuffers> + tfb_layouts{}; + std::array<std::array<u8, 128>, Maxwell::NumTransformFeedbackBuffers> tfb_varying_locs{}; + Maxwell::PrimitiveTopology primitive_topology{}; + Maxwell::TessellationPrimitive tessellation_primitive{}; + Maxwell::TessellationSpacing tessellation_spacing{}; + bool tfb_enabled = false; + bool tessellation_clockwise = false; +}; +static_assert(std::is_trivially_copyable_v<GraphicsInfo> && + std::is_standard_layout_v<GraphicsInfo>); + +struct ComputeInfo { + std::array<u32, 3> workgroup_size{}; + u32 shared_memory_size_in_words = 0; + u32 local_memory_size_in_words = 0; +}; 
+static_assert(std::is_trivially_copyable_v<ComputeInfo> && std::is_standard_layout_v<ComputeInfo>); + +struct SerializedRegistryInfo { + VideoCore::GuestDriverProfile guest_driver_profile; + u32 bound_buffer = 0; + GraphicsInfo graphics; + ComputeInfo compute; +}; + +/** + * The Registry is a class use to interface the 3D and compute engines with the shader compiler. + * With it, the shader can obtain required data from GPU state and store it for disk shader + * compilation. + */ +class Registry { +public: + explicit Registry(Tegra::Engines::ShaderType shader_stage, const SerializedRegistryInfo& info); + + explicit Registry(Tegra::Engines::ShaderType shader_stage, + Tegra::Engines::ConstBufferEngineInterface& engine); + + ~Registry(); + + /// Retrieves a key from the registry, if it's registered, it will give the registered value, if + /// not it will obtain it from maxwell3d and register it. + std::optional<u32> ObtainKey(u32 buffer, u32 offset); + + std::optional<Tegra::Engines::SamplerDescriptor> ObtainBoundSampler(u32 offset); + + std::optional<Tegra::Engines::SamplerDescriptor> ObtainBindlessSampler(u32 buffer, u32 offset); + + /// Inserts a key. + void InsertKey(u32 buffer, u32 offset, u32 value); + + /// Inserts a bound sampler key. + void InsertBoundSampler(u32 offset, Tegra::Engines::SamplerDescriptor sampler); + + /// Inserts a bindless sampler key. + void InsertBindlessSampler(u32 buffer, u32 offset, Tegra::Engines::SamplerDescriptor sampler); + + /// Checks keys and samplers against engine's current const buffers. + /// Returns true if they are the same value, false otherwise. + bool IsConsistent() const; + + /// Returns true if the keys are equal to the other ones in the registry. 
+ bool HasEqualKeys(const Registry& rhs) const; + + /// Returns graphics information from this shader + const GraphicsInfo& GetGraphicsInfo() const; + + /// Returns compute information from this shader + const ComputeInfo& GetComputeInfo() const; + + /// Gives an getter to the const buffer keys in the database. + const KeyMap& GetKeys() const { + return keys; + } + + /// Gets samplers database. + const BoundSamplerMap& GetBoundSamplers() const { + return bound_samplers; + } + + /// Gets bindless samplers database. + const BindlessSamplerMap& GetBindlessSamplers() const { + return bindless_samplers; + } + + /// Gets bound buffer used on this shader + u32 GetBoundBuffer() const { + return bound_buffer; + } + + /// Obtains access to the guest driver's profile. + VideoCore::GuestDriverProfile& AccessGuestDriverProfile() { + return engine ? engine->AccessGuestDriverProfile() : stored_guest_driver_profile; + } + +private: + const Tegra::Engines::ShaderType stage; + VideoCore::GuestDriverProfile stored_guest_driver_profile; + Tegra::Engines::ConstBufferEngineInterface* engine = nullptr; + KeyMap keys; + BoundSamplerMap bound_samplers; + BindlessSamplerMap bindless_samplers; + u32 bound_buffer; + GraphicsInfo graphics_info; + ComputeInfo compute_info; +}; + +} // namespace VideoCommon::Shader diff --git a/src/video_core/shader/shader_ir.cpp b/src/video_core/shader/shader_ir.cpp index 3a5d280a9..baf7188d2 100644 --- a/src/video_core/shader/shader_ir.cpp +++ b/src/video_core/shader/shader_ir.cpp @@ -11,6 +11,7 @@ #include "common/logging/log.h" #include "video_core/engines/shader_bytecode.h" #include "video_core/shader/node_helper.h" +#include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" namespace VideoCommon::Shader { @@ -24,8 +25,8 @@ using Tegra::Shader::PredOperation; using Tegra::Shader::Register; ShaderIR::ShaderIR(const ProgramCode& program_code, u32 main_offset, CompilerSettings settings, - ConstBufferLocker& locker) - : 
program_code{program_code}, main_offset{main_offset}, settings{settings}, locker{locker} { + Registry& registry) + : program_code{program_code}, main_offset{main_offset}, settings{settings}, registry{registry} { Decode(); PostDecode(); } @@ -95,6 +96,7 @@ Node ShaderIR::GetPredicate(bool immediate) { } Node ShaderIR::GetInputAttribute(Attribute::Index index, u64 element, Node buffer) { + MarkAttributeUsage(index, element); used_input_attributes.emplace(index); return MakeNode<AbufNode>(index, static_cast<u32>(element), std::move(buffer)); } @@ -105,42 +107,8 @@ Node ShaderIR::GetPhysicalInputAttribute(Tegra::Shader::Register physical_addres } Node ShaderIR::GetOutputAttribute(Attribute::Index index, u64 element, Node buffer) { - if (index == Attribute::Index::LayerViewportPointSize) { - switch (element) { - case 0: - UNIMPLEMENTED(); - break; - case 1: - uses_layer = true; - break; - case 2: - uses_viewport_index = true; - break; - case 3: - uses_point_size = true; - break; - } - } - if (index == Attribute::Index::TessCoordInstanceIDVertexID) { - switch (element) { - case 2: - uses_instance_id = true; - break; - case 3: - uses_vertex_id = true; - break; - default: - break; - } - } - if (index == Attribute::Index::ClipDistances0123 || - index == Attribute::Index::ClipDistances4567) { - const auto clip_index = - static_cast<u32>((index == Attribute::Index::ClipDistances4567 ? 
1 : 0) + element); - used_clip_distances.at(clip_index) = true; - } + MarkAttributeUsage(index, element); used_output_attributes.insert(index); - return MakeNode<AbufNode>(index, static_cast<u32>(element), std::move(buffer)); } @@ -451,6 +419,54 @@ Node ShaderIR::BitfieldInsert(Node base, Node insert, u32 offset, u32 bits) { Immediate(bits)); } +void ShaderIR::MarkAttributeUsage(Attribute::Index index, u64 element) { + switch (index) { + case Attribute::Index::LayerViewportPointSize: + switch (element) { + case 0: + UNIMPLEMENTED(); + break; + case 1: + uses_layer = true; + break; + case 2: + uses_viewport_index = true; + break; + case 3: + uses_point_size = true; + break; + } + break; + case Attribute::Index::TessCoordInstanceIDVertexID: + switch (element) { + case 2: + uses_instance_id = true; + break; + case 3: + uses_vertex_id = true; + break; + } + break; + case Attribute::Index::ClipDistances0123: + case Attribute::Index::ClipDistances4567: { + const u64 clip_index = (index == Attribute::Index::ClipDistances4567 ? 
4 : 0) + element; + used_clip_distances.at(clip_index) = true; + break; + } + case Attribute::Index::FrontColor: + case Attribute::Index::FrontSecondaryColor: + case Attribute::Index::BackColor: + case Attribute::Index::BackSecondaryColor: + uses_legacy_varyings = true; + break; + default: + if (index >= Attribute::Index::TexCoord_0 && index <= Attribute::Index::TexCoord_7) { + uses_legacy_varyings = true; + } + break; + } +} + std::size_t ShaderIR::DeclareAmend(Node new_amend) { const std::size_t id = amend_code.size(); amend_code.push_back(new_amend); diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h index b0851c3be..80fc9b82c 100644 --- a/src/video_core/shader/shader_ir.h +++ b/src/video_core/shader/shader_ir.h @@ -18,8 +18,8 @@ #include "video_core/engines/shader_header.h" #include "video_core/shader/ast.h" #include "video_core/shader/compiler_settings.h" -#include "video_core/shader/const_buffer_locker.h" #include "video_core/shader/node.h" +#include "video_core/shader/registry.h" namespace VideoCommon::Shader { @@ -69,7 +69,7 @@ struct GlobalMemoryUsage { class ShaderIR final { public: explicit ShaderIR(const ProgramCode& program_code, u32 main_offset, CompilerSettings settings, - ConstBufferLocker& locker); + Registry& registry); ~ShaderIR(); const std::map<u32, NodeBlock>& GetBasicBlocks() const { @@ -137,6 +137,10 @@ public: return uses_vertex_id; } + bool UsesLegacyVaryings() const { + return uses_legacy_varyings; + } + bool UsesWarps() const { return uses_warps; } @@ -343,6 +347,9 @@ private: /// Inserts a sequence of bits from a node Node BitfieldInsert(Node base, Node insert, u32 offset, u32 bits); + /// Marks the usage of a input or output attribute. 
+ void MarkAttributeUsage(Tegra::Shader::Attribute::Index index, u64 element); + void WriteTexInstructionFloat(NodeBlock& bb, Tegra::Shader::Instruction instr, const Node4& components); @@ -414,7 +421,7 @@ private: const ProgramCode& program_code; const u32 main_offset; const CompilerSettings settings; - ConstBufferLocker& locker; + Registry& registry; bool decompiled{}; bool disable_flow_stack{}; @@ -443,6 +450,7 @@ private: bool uses_physical_attributes{}; // Shader uses AL2P or physical attribute read/writes bool uses_instance_id{}; bool uses_vertex_id{}; + bool uses_legacy_varyings{}; bool uses_warps{}; bool uses_indexed_samplers{}; diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp index face8c943..10739b37d 100644 --- a/src/video_core/shader/track.cpp +++ b/src/video_core/shader/track.cpp @@ -81,26 +81,20 @@ std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, cons MakeTrackSampler<BindlessSamplerNode>(cbuf->GetIndex(), immediate->GetValue()); return {tracked, track}; } else if (const auto operation = std::get_if<OperationNode>(&*offset)) { - auto bound_buffer = locker.ObtainBoundBuffer(); - if (!bound_buffer) { + const u32 bound_buffer = registry.GetBoundBuffer(); + if (bound_buffer != cbuf->GetIndex()) { return {}; } - if (*bound_buffer != cbuf->GetIndex()) { - return {}; - } - auto pair = DecoupleIndirectRead(*operation); + const auto pair = DecoupleIndirectRead(*operation); if (!pair) { return {}; } auto [gpr, base_offset] = *pair; const auto offset_inm = std::get_if<ImmediateNode>(&*base_offset); - auto gpu_driver = locker.AccessGuestDriverProfile(); - if (gpu_driver == nullptr) { - return {}; - } + const auto& gpu_driver = registry.AccessGuestDriverProfile(); const u32 bindless_cv = NewCustomVariable(); - const Node op = Operation(OperationCode::UDiv, NO_PRECISE, gpr, - Immediate(gpu_driver->GetTextureHandlerSize())); + const Node op = + Operation(OperationCode::UDiv, gpr, 
Immediate(gpu_driver.GetTextureHandlerSize())); const Node cv_node = GetCustomVariable(bindless_cv); Node amend_op = Operation(OperationCode::Assign, cv_node, std::move(op)); @@ -157,13 +151,21 @@ std::tuple<Node, u32, u32> ShaderIR::TrackCbuf(Node tracked, const NodeBlock& co if (gpr->GetIndex() == Tegra::Shader::Register::ZeroIndex) { return {}; } - // Reduce the cursor in one to avoid infinite loops when the instruction sets the same - // register that it uses as operand - const auto [source, new_cursor] = TrackRegister(gpr, code, cursor - 1); - if (!source) { - return {}; + s64 current_cursor = cursor; + while (current_cursor > 0) { + // Reduce the cursor in one to avoid infinite loops when the instruction sets the same + // register that it uses as operand + const auto [source, new_cursor] = TrackRegister(gpr, code, current_cursor - 1); + current_cursor = new_cursor; + if (!source) { + continue; + } + const auto [base_address, index, offset] = TrackCbuf(source, code, current_cursor); + if (base_address != nullptr) { + return {base_address, index, offset}; + } } - return TrackCbuf(source, code, new_cursor); + return {}; } if (const auto operation = std::get_if<OperationNode>(&*tracked)) { for (std::size_t i = operation->GetOperandsCount(); i > 0; --i) { diff --git a/src/video_core/shader/transform_feedback.cpp b/src/video_core/shader/transform_feedback.cpp new file mode 100644 index 000000000..22a933761 --- /dev/null +++ b/src/video_core/shader/transform_feedback.cpp @@ -0,0 +1,115 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include <algorithm> +#include <array> +#include <unordered_map> + +#include "common/assert.h" +#include "common/common_types.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/shader/registry.h" +#include "video_core/shader/transform_feedback.h" + +namespace VideoCommon::Shader { + +namespace { + +using Maxwell = Tegra::Engines::Maxwell3D::Regs; + +// TODO(Rodrigo): Change this to constexpr std::unordered_set in C++20 + +/// Attribute offsets that describe a vector +constexpr std::array VECTORS = { + 28, // gl_Position + 32, // Generic 0 + 36, // Generic 1 + 40, // Generic 2 + 44, // Generic 3 + 48, // Generic 4 + 52, // Generic 5 + 56, // Generic 6 + 60, // Generic 7 + 64, // Generic 8 + 68, // Generic 9 + 72, // Generic 10 + 76, // Generic 11 + 80, // Generic 12 + 84, // Generic 13 + 88, // Generic 14 + 92, // Generic 15 + 96, // Generic 16 + 100, // Generic 17 + 104, // Generic 18 + 108, // Generic 19 + 112, // Generic 20 + 116, // Generic 21 + 120, // Generic 22 + 124, // Generic 23 + 128, // Generic 24 + 132, // Generic 25 + 136, // Generic 26 + 140, // Generic 27 + 144, // Generic 28 + 148, // Generic 29 + 152, // Generic 30 + 156, // Generic 31 + 160, // gl_FrontColor + 164, // gl_FrontSecondaryColor + 160, // gl_BackColor + 164, // gl_BackSecondaryColor + 192, // gl_TexCoord[0] + 196, // gl_TexCoord[1] + 200, // gl_TexCoord[2] + 204, // gl_TexCoord[3] + 208, // gl_TexCoord[4] + 212, // gl_TexCoord[5] + 216, // gl_TexCoord[6] + 220, // gl_TexCoord[7] +}; +} // namespace + +std::unordered_map<u8, VaryingTFB> BuildTransformFeedback(const GraphicsInfo& info) { + + std::unordered_map<u8, VaryingTFB> tfb; + + for (std::size_t buffer = 0; buffer < Maxwell::NumTransformFeedbackBuffers; ++buffer) { + const auto& locations = info.tfb_varying_locs[buffer]; + const auto& layout = info.tfb_layouts[buffer]; + const std::size_t varying_count = layout.varying_count; + + std::size_t highest = 0; + + for (std::size_t offset = 0; offset < varying_count; 
++offset) { + const std::size_t base_offset = offset; + const u8 location = locations[offset]; + + VaryingTFB varying; + varying.buffer = layout.stream; + varying.stride = layout.stride; + varying.offset = offset * sizeof(u32); + varying.components = 1; + + if (std::find(VECTORS.begin(), VECTORS.end(), location / 4 * 4) != VECTORS.end()) { + UNIMPLEMENTED_IF_MSG(location % 4 != 0, "Unaligned TFB"); + + const u8 base_index = location / 4; + while (offset + 1 < varying_count && base_index == locations[offset + 1] / 4) { + ++offset; + ++varying.components; + } + } + + [[maybe_unused]] const bool inserted = tfb.emplace(location, varying).second; + UNIMPLEMENTED_IF_MSG(!inserted, "Varying already stored"); + + highest = std::max(highest, (base_offset + varying.components) * sizeof(u32)); + } + + UNIMPLEMENTED_IF(highest != layout.stride); + } + return tfb; +} + +} // namespace VideoCommon::Shader diff --git a/src/video_core/shader/transform_feedback.h b/src/video_core/shader/transform_feedback.h new file mode 100644 index 000000000..77d05f64c --- /dev/null +++ b/src/video_core/shader/transform_feedback.h @@ -0,0 +1,23 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include <unordered_map> + +#include "common/common_types.h" +#include "video_core/shader/registry.h" + +namespace VideoCommon::Shader { + +struct VaryingTFB { + std::size_t buffer; + std::size_t stride; + std::size_t offset; + std::size_t components; +}; + +std::unordered_map<u8, VaryingTFB> BuildTransformFeedback(const GraphicsInfo& info); + +} // namespace VideoCommon::Shader diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp index 1655ccf16..cc7181229 100644 --- a/src/video_core/surface.cpp +++ b/src/video_core/surface.cpp @@ -111,6 +111,8 @@ PixelFormat PixelFormatFromRenderTargetFormat(Tegra::RenderTargetFormat format) return PixelFormat::RGBA16F; case Tegra::RenderTargetFormat::RGBA16_UNORM: return PixelFormat::RGBA16U; + case Tegra::RenderTargetFormat::RGBA16_SNORM: + return PixelFormat::RGBA16S; case Tegra::RenderTargetFormat::RGBA16_UINT: return PixelFormat::RGBA16UI; case Tegra::RenderTargetFormat::RGBA32_FLOAT: @@ -155,6 +157,8 @@ PixelFormat PixelFormatFromRenderTargetFormat(Tegra::RenderTargetFormat format) return PixelFormat::R16I; case Tegra::RenderTargetFormat::R32_FLOAT: return PixelFormat::R32F; + case Tegra::RenderTargetFormat::R32_SINT: + return PixelFormat::R32I; case Tegra::RenderTargetFormat::R32_UINT: return PixelFormat::R32UI; case Tegra::RenderTargetFormat::RG32_UINT: diff --git a/src/video_core/surface.h b/src/video_core/surface.h index 0d17a93ed..ae8817465 100644 --- a/src/video_core/surface.h +++ b/src/video_core/surface.h @@ -25,81 +25,83 @@ enum class PixelFormat { R8UI = 7, RGBA16F = 8, RGBA16U = 9, - RGBA16UI = 10, - R11FG11FB10F = 11, - RGBA32UI = 12, - DXT1 = 13, - DXT23 = 14, - DXT45 = 15, - DXN1 = 16, // This is also known as BC4 - DXN2UNORM = 17, - DXN2SNORM = 18, - BC7U = 19, - BC6H_UF16 = 20, - BC6H_SF16 = 21, - ASTC_2D_4X4 = 22, - BGRA8 = 23, - RGBA32F = 24, - RG32F = 25, - R32F = 26, - R16F = 27, - R16U = 28, - R16S = 29, - R16UI = 30, - R16I = 31, - RG16 = 32, - RG16F = 33, - RG16UI = 
34, - RG16I = 35, - RG16S = 36, - RGB32F = 37, - RGBA8_SRGB = 38, - RG8U = 39, - RG8S = 40, - RG32UI = 41, - RGBX16F = 42, - R32UI = 43, - ASTC_2D_8X8 = 44, - ASTC_2D_8X5 = 45, - ASTC_2D_5X4 = 46, - BGRA8_SRGB = 47, - DXT1_SRGB = 48, - DXT23_SRGB = 49, - DXT45_SRGB = 50, - BC7U_SRGB = 51, - R4G4B4A4U = 52, - ASTC_2D_4X4_SRGB = 53, - ASTC_2D_8X8_SRGB = 54, - ASTC_2D_8X5_SRGB = 55, - ASTC_2D_5X4_SRGB = 56, - ASTC_2D_5X5 = 57, - ASTC_2D_5X5_SRGB = 58, - ASTC_2D_10X8 = 59, - ASTC_2D_10X8_SRGB = 60, - ASTC_2D_6X6 = 61, - ASTC_2D_6X6_SRGB = 62, - ASTC_2D_10X10 = 63, - ASTC_2D_10X10_SRGB = 64, - ASTC_2D_12X12 = 65, - ASTC_2D_12X12_SRGB = 66, - ASTC_2D_8X6 = 67, - ASTC_2D_8X6_SRGB = 68, - ASTC_2D_6X5 = 69, - ASTC_2D_6X5_SRGB = 70, - E5B9G9R9F = 71, + RGBA16S = 10, + RGBA16UI = 11, + R11FG11FB10F = 12, + RGBA32UI = 13, + DXT1 = 14, + DXT23 = 15, + DXT45 = 16, + DXN1 = 17, // This is also known as BC4 + DXN2UNORM = 18, + DXN2SNORM = 19, + BC7U = 20, + BC6H_UF16 = 21, + BC6H_SF16 = 22, + ASTC_2D_4X4 = 23, + BGRA8 = 24, + RGBA32F = 25, + RG32F = 26, + R32F = 27, + R16F = 28, + R16U = 29, + R16S = 30, + R16UI = 31, + R16I = 32, + RG16 = 33, + RG16F = 34, + RG16UI = 35, + RG16I = 36, + RG16S = 37, + RGB32F = 38, + RGBA8_SRGB = 39, + RG8U = 40, + RG8S = 41, + RG32UI = 42, + RGBX16F = 43, + R32UI = 44, + R32I = 45, + ASTC_2D_8X8 = 46, + ASTC_2D_8X5 = 47, + ASTC_2D_5X4 = 48, + BGRA8_SRGB = 49, + DXT1_SRGB = 50, + DXT23_SRGB = 51, + DXT45_SRGB = 52, + BC7U_SRGB = 53, + R4G4B4A4U = 54, + ASTC_2D_4X4_SRGB = 55, + ASTC_2D_8X8_SRGB = 56, + ASTC_2D_8X5_SRGB = 57, + ASTC_2D_5X4_SRGB = 58, + ASTC_2D_5X5 = 59, + ASTC_2D_5X5_SRGB = 60, + ASTC_2D_10X8 = 61, + ASTC_2D_10X8_SRGB = 62, + ASTC_2D_6X6 = 63, + ASTC_2D_6X6_SRGB = 64, + ASTC_2D_10X10 = 65, + ASTC_2D_10X10_SRGB = 66, + ASTC_2D_12X12 = 67, + ASTC_2D_12X12_SRGB = 68, + ASTC_2D_8X6 = 69, + ASTC_2D_8X6_SRGB = 70, + ASTC_2D_6X5 = 71, + ASTC_2D_6X5_SRGB = 72, + E5B9G9R9F = 73, MaxColorFormat, // Depth formats - Z32F = 72, - Z16 = 73, + Z32F 
= 74, + Z16 = 75, MaxDepthFormat, // DepthStencil formats - Z24S8 = 74, - S8Z24 = 75, - Z32FS8 = 76, + Z24S8 = 76, + S8Z24 = 77, + Z32FS8 = 78, MaxDepthStencilFormat, @@ -137,6 +139,7 @@ constexpr std::array<u32, MaxPixelFormat> compression_factor_shift_table = {{ 0, // R8UI 0, // RGBA16F 0, // RGBA16U + 0, // RGBA16S 0, // RGBA16UI 0, // R11FG11FB10F 0, // RGBA32UI @@ -171,6 +174,7 @@ constexpr std::array<u32, MaxPixelFormat> compression_factor_shift_table = {{ 0, // RG32UI 0, // RGBX16F 0, // R32UI + 0, // R32I 2, // ASTC_2D_8X8 2, // ASTC_2D_8X5 2, // ASTC_2D_5X4 @@ -233,6 +237,7 @@ constexpr std::array<u32, MaxPixelFormat> block_width_table = {{ 1, // R8UI 1, // RGBA16F 1, // RGBA16U + 1, // RGBA16S 1, // RGBA16UI 1, // R11FG11FB10F 1, // RGBA32UI @@ -267,6 +272,7 @@ constexpr std::array<u32, MaxPixelFormat> block_width_table = {{ 1, // RG32UI 1, // RGBX16F 1, // R32UI + 1, // R32I 8, // ASTC_2D_8X8 8, // ASTC_2D_8X5 5, // ASTC_2D_5X4 @@ -321,6 +327,7 @@ constexpr std::array<u32, MaxPixelFormat> block_height_table = {{ 1, // R8UI 1, // RGBA16F 1, // RGBA16U + 1, // RGBA16S 1, // RGBA16UI 1, // R11FG11FB10F 1, // RGBA32UI @@ -355,6 +362,7 @@ constexpr std::array<u32, MaxPixelFormat> block_height_table = {{ 1, // RG32UI 1, // RGBX16F 1, // R32UI + 1, // R32I 8, // ASTC_2D_8X8 5, // ASTC_2D_8X5 4, // ASTC_2D_5X4 @@ -409,6 +417,7 @@ constexpr std::array<u32, MaxPixelFormat> bpp_table = {{ 8, // R8UI 64, // RGBA16F 64, // RGBA16U + 64, // RGBA16S 64, // RGBA16UI 32, // R11FG11FB10F 128, // RGBA32UI @@ -443,6 +452,7 @@ constexpr std::array<u32, MaxPixelFormat> bpp_table = {{ 64, // RG32UI 64, // RGBX16F 32, // R32UI + 32, // R32I 128, // ASTC_2D_8X8 128, // ASTC_2D_8X5 128, // ASTC_2D_5X4 @@ -512,6 +522,7 @@ constexpr std::array<SurfaceCompression, MaxPixelFormat> compression_type_table SurfaceCompression::None, // R8UI SurfaceCompression::None, // RGBA16F SurfaceCompression::None, // RGBA16U + SurfaceCompression::None, // RGBA16S SurfaceCompression::None, // 
RGBA16UI SurfaceCompression::None, // R11FG11FB10F SurfaceCompression::None, // RGBA32UI @@ -546,6 +557,7 @@ constexpr std::array<SurfaceCompression, MaxPixelFormat> compression_type_table SurfaceCompression::None, // RG32UI SurfaceCompression::None, // RGBX16F SurfaceCompression::None, // R32UI + SurfaceCompression::None, // R32I SurfaceCompression::Converted, // ASTC_2D_8X8 SurfaceCompression::Converted, // ASTC_2D_8X5 SurfaceCompression::Converted, // ASTC_2D_5X4 diff --git a/src/video_core/texture_cache/format_lookup_table.cpp b/src/video_core/texture_cache/format_lookup_table.cpp index 81fb9f633..e151c26c4 100644 --- a/src/video_core/texture_cache/format_lookup_table.cpp +++ b/src/video_core/texture_cache/format_lookup_table.cpp @@ -41,7 +41,7 @@ struct Table { ComponentType alpha_component; bool is_srgb; }; -constexpr std::array<Table, 74> DefinitionTable = {{ +constexpr std::array<Table, 76> DefinitionTable = {{ {TextureFormat::A8R8G8B8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ABGR8U}, {TextureFormat::A8R8G8B8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::ABGR8S}, {TextureFormat::A8R8G8B8, C, UINT, UINT, UINT, UINT, PixelFormat::ABGR8UI}, @@ -61,6 +61,7 @@ constexpr std::array<Table, 74> DefinitionTable = {{ {TextureFormat::G8R8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::RG8U}, {TextureFormat::G8R8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::RG8S}, + {TextureFormat::R16_G16_B16_A16, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::RGBA16S}, {TextureFormat::R16_G16_B16_A16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::RGBA16U}, {TextureFormat::R16_G16_B16_A16, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::RGBA16F}, {TextureFormat::R16_G16_B16_A16, C, UINT, UINT, UINT, UINT, PixelFormat::RGBA16UI}, @@ -89,6 +90,7 @@ constexpr std::array<Table, 74> DefinitionTable = {{ {TextureFormat::R32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R32F}, {TextureFormat::R32, C, UINT, UINT, UINT, UINT, PixelFormat::R32UI}, + {TextureFormat::R32, C, SINT, SINT, SINT, SINT, 
PixelFormat::R32I}, {TextureFormat::E5B9G9R9_SHAREDEXP, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::E5B9G9R9F}, diff --git a/src/video_core/texture_cache/surface_params.cpp b/src/video_core/texture_cache/surface_params.cpp index 38b3a4ba8..9931c5ef7 100644 --- a/src/video_core/texture_cache/surface_params.cpp +++ b/src/video_core/texture_cache/surface_params.cpp @@ -84,19 +84,16 @@ SurfaceParams SurfaceParams::CreateForTexture(const FormatLookupTable& lookup_ta if (entry.IsShadow() && params.type == SurfaceType::ColorTexture) { switch (params.pixel_format) { case PixelFormat::R16U: - case PixelFormat::R16F: { + case PixelFormat::R16F: params.pixel_format = PixelFormat::Z16; break; - } - case PixelFormat::R32F: { + case PixelFormat::R32F: params.pixel_format = PixelFormat::Z32F; break; - } - default: { + default: UNIMPLEMENTED_MSG("Unimplemented shadow convert format: {}", static_cast<u32>(params.pixel_format)); } - } params.type = GetFormatType(params.pixel_format); } params.type = GetFormatType(params.pixel_format); @@ -116,8 +113,10 @@ SurfaceParams SurfaceParams::CreateForTexture(const FormatLookupTable& lookup_ta params.height = tic.Height(); params.depth = tic.Depth(); params.pitch = params.is_tiled ? 
0 : tic.Pitch(); - if (params.target == SurfaceTarget::TextureCubemap || - params.target == SurfaceTarget::TextureCubeArray) { + if (params.target == SurfaceTarget::Texture2D && params.depth > 1) { + params.depth = 1; + } else if (params.target == SurfaceTarget::TextureCubemap || + params.target == SurfaceTarget::TextureCubeArray) { params.depth *= 6; } params.num_levels = tic.max_mip_level + 1; @@ -168,27 +167,29 @@ SurfaceParams SurfaceParams::CreateForImage(const FormatLookupTable& lookup_tabl return params; } -SurfaceParams SurfaceParams::CreateForDepthBuffer( - Core::System& system, u32 zeta_width, u32 zeta_height, Tegra::DepthFormat format, - u32 block_width, u32 block_height, u32 block_depth, - Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout type) { +SurfaceParams SurfaceParams::CreateForDepthBuffer(Core::System& system) { + const auto& regs = system.GPU().Maxwell3D().regs; + regs.zeta_width, regs.zeta_height, regs.zeta.format, regs.zeta.memory_layout.type; SurfaceParams params; - params.is_tiled = type == Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout::BlockLinear; + params.is_tiled = regs.zeta.memory_layout.type == + Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout::BlockLinear; params.srgb_conversion = false; - params.block_width = std::min(block_width, 5U); - params.block_height = std::min(block_height, 5U); - params.block_depth = std::min(block_depth, 5U); + params.block_width = std::min(regs.zeta.memory_layout.block_width.Value(), 5U); + params.block_height = std::min(regs.zeta.memory_layout.block_height.Value(), 5U); + params.block_depth = std::min(regs.zeta.memory_layout.block_depth.Value(), 5U); params.tile_width_spacing = 1; - params.pixel_format = PixelFormatFromDepthFormat(format); + params.pixel_format = PixelFormatFromDepthFormat(regs.zeta.format); params.type = GetFormatType(params.pixel_format); - params.width = zeta_width; - params.height = zeta_height; - params.target = SurfaceTarget::Texture2D; - params.depth = 1; + params.width = 
regs.zeta_width; + params.height = regs.zeta_height; params.pitch = 0; params.num_levels = 1; params.emulated_levels = 1; - params.is_layered = false; + + const bool is_layered = regs.zeta_layers > 1 && params.block_depth == 0; + params.is_layered = is_layered; + params.target = is_layered ? SurfaceTarget::Texture2DArray : SurfaceTarget::Texture2D; + params.depth = is_layered ? regs.zeta_layers.Value() : 1U; return params; } @@ -214,11 +215,13 @@ SurfaceParams SurfaceParams::CreateForFramebuffer(Core::System& system, std::siz params.width = params.pitch / bpp; } params.height = config.height; - params.depth = 1; - params.target = SurfaceTarget::Texture2D; params.num_levels = 1; params.emulated_levels = 1; - params.is_layered = false; + + const bool is_layered = config.layers > 1 && params.block_depth == 0; + params.is_layered = is_layered; + params.depth = is_layered ? config.layers.Value() : 1; + params.target = is_layered ? SurfaceTarget::Texture2DArray : SurfaceTarget::Texture2D; return params; } diff --git a/src/video_core/texture_cache/surface_params.h b/src/video_core/texture_cache/surface_params.h index 9256fd6d9..995cc3818 100644 --- a/src/video_core/texture_cache/surface_params.h +++ b/src/video_core/texture_cache/surface_params.h @@ -35,10 +35,7 @@ public: const VideoCommon::Shader::Image& entry); /// Creates SurfaceCachedParams for a depth buffer configuration. - static SurfaceParams CreateForDepthBuffer( - Core::System& system, u32 zeta_width, u32 zeta_height, Tegra::DepthFormat format, - u32 block_width, u32 block_height, u32 block_depth, - Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout type); + static SurfaceParams CreateForDepthBuffer(Core::System& system); /// Creates SurfaceCachedParams from a framebuffer configuration. 
static SurfaceParams CreateForFramebuffer(Core::System& system, std::size_t index); diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 0d105d386..6cdbe63d0 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -22,6 +22,7 @@ #include "core/core.h" #include "core/memory.h" #include "core/settings.h" +#include "video_core/dirty_flags.h" #include "video_core/engines/fermi_2d.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/gpu.h" @@ -103,6 +104,11 @@ public: if (!cache_addr) { return GetNullSurface(SurfaceParams::ExpectedTarget(entry)); } + + if (!IsTypeCompatible(tic.texture_type, entry)) { + return GetNullSurface(SurfaceParams::ExpectedTarget(entry)); + } + const auto params{SurfaceParams::CreateForTexture(format_lookup_table, tic, entry)}; const auto [surface, view] = GetSurface(gpu_addr, cache_addr, params, true, false); if (guard_samplers) { @@ -142,11 +148,10 @@ public: TView GetDepthBufferSurface(bool preserve_contents) { std::lock_guard lock{mutex}; auto& maxwell3d = system.GPU().Maxwell3D(); - - if (!maxwell3d.dirty.depth_buffer) { + if (!maxwell3d.dirty.flags[VideoCommon::Dirty::ZetaBuffer]) { return depth_buffer.view; } - maxwell3d.dirty.depth_buffer = false; + maxwell3d.dirty.flags[VideoCommon::Dirty::ZetaBuffer] = false; const auto& regs{maxwell3d.regs}; const auto gpu_addr{regs.zeta.Address()}; @@ -160,10 +165,7 @@ public: SetEmptyDepthBuffer(); return {}; } - const auto depth_params{SurfaceParams::CreateForDepthBuffer( - system, regs.zeta_width, regs.zeta_height, regs.zeta.format, - regs.zeta.memory_layout.block_width, regs.zeta.memory_layout.block_height, - regs.zeta.memory_layout.block_depth, regs.zeta.memory_layout.type)}; + const auto depth_params{SurfaceParams::CreateForDepthBuffer(system)}; auto surface_view = GetSurface(gpu_addr, cache_addr, depth_params, preserve_contents, true); if (depth_buffer.target) 
depth_buffer.target->MarkAsRenderTarget(false, NO_RT); @@ -178,10 +180,10 @@ public: std::lock_guard lock{mutex}; ASSERT(index < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets); auto& maxwell3d = system.GPU().Maxwell3D(); - if (!maxwell3d.dirty.render_target[index]) { + if (!maxwell3d.dirty.flags[VideoCommon::Dirty::ColorBuffer0 + index]) { return render_targets[index].view; } - maxwell3d.dirty.render_target[index] = false; + maxwell3d.dirty.flags[VideoCommon::Dirty::ColorBuffer0 + index] = false; const auto& regs{maxwell3d.regs}; if (index >= regs.rt_control.count || regs.rt[index].Address() == 0 || @@ -323,14 +325,14 @@ protected: virtual void BufferCopy(TSurface& src_surface, TSurface& dst_surface) = 0; void ManageRenderTargetUnregister(TSurface& surface) { - auto& maxwell3d = system.GPU().Maxwell3D(); + auto& dirty = system.GPU().Maxwell3D().dirty; const u32 index = surface->GetRenderTarget(); if (index == DEPTH_RT) { - maxwell3d.dirty.depth_buffer = true; + dirty.flags[VideoCommon::Dirty::ZetaBuffer] = true; } else { - maxwell3d.dirty.render_target[index] = true; + dirty.flags[VideoCommon::Dirty::ColorBuffer0 + index] = true; } - maxwell3d.dirty.render_settings = true; + dirty.flags[VideoCommon::Dirty::RenderTargets] = true; } void Register(TSurface surface) { @@ -917,13 +919,15 @@ private: params.width = 1; params.height = 1; params.depth = 1; + if (target == SurfaceTarget::TextureCubemap || target == SurfaceTarget::TextureCubeArray) { + params.depth = 6; + } params.pitch = 4; params.num_levels = 1; params.emulated_levels = 1; - params.pixel_format = VideoCore::Surface::PixelFormat::RGBA16F; + params.pixel_format = VideoCore::Surface::PixelFormat::R8U; params.type = VideoCore::Surface::SurfaceType::ColorTexture; auto surface = CreateSurface(0ULL, params); - invalid_memory.clear(); invalid_memory.resize(surface->GetHostSizeInBytes(), 0U); surface->UploadTexture(invalid_memory); surface->MarkAsModified(false, Tick()); @@ -1085,6 +1089,36 @@ private: return 
siblings_table[static_cast<std::size_t>(format)]; } + /// Returns true the shader sampler entry is compatible with the TIC texture type. + static bool IsTypeCompatible(Tegra::Texture::TextureType tic_type, + const VideoCommon::Shader::Sampler& entry) { + const auto shader_type = entry.GetType(); + switch (tic_type) { + case Tegra::Texture::TextureType::Texture1D: + case Tegra::Texture::TextureType::Texture1DArray: + return shader_type == Tegra::Shader::TextureType::Texture1D; + case Tegra::Texture::TextureType::Texture1DBuffer: + // TODO(Rodrigo): Assume as valid for now + return true; + case Tegra::Texture::TextureType::Texture2D: + case Tegra::Texture::TextureType::Texture2DNoMipmap: + return shader_type == Tegra::Shader::TextureType::Texture2D; + case Tegra::Texture::TextureType::Texture2DArray: + return shader_type == Tegra::Shader::TextureType::Texture2D || + shader_type == Tegra::Shader::TextureType::TextureCube; + case Tegra::Texture::TextureType::Texture3D: + return shader_type == Tegra::Shader::TextureType::Texture3D; + case Tegra::Texture::TextureType::TextureCubeArray: + case Tegra::Texture::TextureType::TextureCubemap: + if (shader_type == Tegra::Shader::TextureType::TextureCube) { + return true; + } + return shader_type == Tegra::Shader::TextureType::Texture2D && entry.IsArray(); + } + UNREACHABLE(); + return true; + } + struct FramebufferTargetInfo { TSurface target; TView view; diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp index 33bd31865..062b4f252 100644 --- a/src/video_core/textures/astc.cpp +++ b/src/video_core/textures/astc.cpp @@ -17,26 +17,37 @@ #include <algorithm> #include <cassert> -#include <cstdint> #include <cstring> #include <vector> +#include "common/common_types.h" + #include "video_core/textures/astc.h" +namespace { + +/// Count the number of bits set in a number. 
+constexpr u32 Popcnt(u32 n) { + u32 c = 0; + for (; n; c++) { + n &= n - 1; + } + return c; +} + +} // Anonymous namespace + class InputBitStream { public: - explicit InputBitStream(const unsigned char* ptr, int start_offset = 0) + explicit InputBitStream(const u8* ptr, std::size_t start_offset = 0) : m_CurByte(ptr), m_NextBit(start_offset % 8) {} - ~InputBitStream() = default; - - int GetBitsRead() const { + std::size_t GetBitsRead() const { return m_BitsRead; } - int ReadBit() { - - int bit = *m_CurByte >> m_NextBit++; + u32 ReadBit() { + u32 bit = *m_CurByte >> m_NextBit++; while (m_NextBit >= 8) { m_NextBit -= 8; m_CurByte++; @@ -46,57 +57,66 @@ public: return bit & 1; } - unsigned int ReadBits(unsigned int nBits) { - unsigned int ret = 0; - for (unsigned int i = 0; i < nBits; i++) { + u32 ReadBits(std::size_t nBits) { + u32 ret = 0; + for (std::size_t i = 0; i < nBits; ++i) { + ret |= (ReadBit() & 1) << i; + } + return ret; + } + + template <std::size_t nBits> + u32 ReadBits() { + u32 ret = 0; + for (std::size_t i = 0; i < nBits; ++i) { ret |= (ReadBit() & 1) << i; } return ret; } private: - const unsigned char* m_CurByte; - int m_NextBit = 0; - int m_BitsRead = 0; + const u8* m_CurByte; + std::size_t m_NextBit = 0; + std::size_t m_BitsRead = 0; }; class OutputBitStream { public: - explicit OutputBitStream(unsigned char* ptr, int nBits = 0, int start_offset = 0) + explicit OutputBitStream(u8* ptr, s32 nBits = 0, s32 start_offset = 0) : m_NumBits(nBits), m_CurByte(ptr), m_NextBit(start_offset % 8) {} ~OutputBitStream() = default; - int GetBitsWritten() const { + s32 GetBitsWritten() const { return m_BitsWritten; } - void WriteBitsR(unsigned int val, unsigned int nBits) { - for (unsigned int i = 0; i < nBits; i++) { + void WriteBitsR(u32 val, u32 nBits) { + for (u32 i = 0; i < nBits; i++) { WriteBit((val >> (nBits - i - 1)) & 1); } } - void WriteBits(unsigned int val, unsigned int nBits) { - for (unsigned int i = 0; i < nBits; i++) { + void WriteBits(u32 val, 
u32 nBits) { + for (u32 i = 0; i < nBits; i++) { WriteBit((val >> i) & 1); } } private: - void WriteBit(int b) { + void WriteBit(s32 b) { if (done) return; - const unsigned int mask = 1 << m_NextBit++; + const u32 mask = 1 << m_NextBit++; // clear the bit - *m_CurByte &= static_cast<unsigned char>(~mask); + *m_CurByte &= static_cast<u8>(~mask); // Write the bit, if necessary if (b) - *m_CurByte |= static_cast<unsigned char>(mask); + *m_CurByte |= static_cast<u8>(mask); // Next byte? if (m_NextBit >= 8) { @@ -107,10 +127,10 @@ private: done = done || ++m_BitsWritten >= m_NumBits; } - int m_BitsWritten = 0; - const int m_NumBits; - unsigned char* m_CurByte; - int m_NextBit = 0; + s32 m_BitsWritten = 0; + const s32 m_NumBits; + u8* m_CurByte; + s32 m_NextBit = 0; bool done = false; }; @@ -123,20 +143,20 @@ public: Bits(const Bits&) = delete; Bits& operator=(const Bits&) = delete; - uint8_t operator[](uint32_t bitPos) const { - return static_cast<uint8_t>((m_Bits >> bitPos) & 1); + u8 operator[](u32 bitPos) const { + return static_cast<u8>((m_Bits >> bitPos) & 1); } - IntType operator()(uint32_t start, uint32_t end) const { + IntType operator()(u32 start, u32 end) const { if (start == end) { return (*this)[start]; } else if (start > end) { - uint32_t t = start; + u32 t = start; start = end; end = t; } - uint64_t mask = (1 << (end - start + 1)) - 1; + u64 mask = (1 << (end - start + 1)) - 1; return (m_Bits >> start) & static_cast<IntType>(mask); } @@ -144,273 +164,236 @@ private: const IntType& m_Bits; }; -enum EIntegerEncoding { eIntegerEncoding_JustBits, eIntegerEncoding_Quint, eIntegerEncoding_Trit }; - -class IntegerEncodedValue { -private: - const EIntegerEncoding m_Encoding; - const uint32_t m_NumBits; - uint32_t m_BitValue; - union { - uint32_t m_QuintValue; - uint32_t m_TritValue; - }; +enum class IntegerEncoding { JustBits, Qus32, Trit }; -public: - // Jank, but we're not doing any heavy lifting in this class, so it's - // probably OK. 
It allows us to use these in std::vectors... - IntegerEncodedValue& operator=(const IntegerEncodedValue& other) { - new (this) IntegerEncodedValue(other); - return *this; - } +struct IntegerEncodedValue { + constexpr IntegerEncodedValue() = default; - IntegerEncodedValue(EIntegerEncoding encoding, uint32_t numBits) - : m_Encoding(encoding), m_NumBits(numBits) {} + constexpr IntegerEncodedValue(IntegerEncoding encoding_, u32 num_bits_) + : encoding{encoding_}, num_bits{num_bits_} {} - EIntegerEncoding GetEncoding() const { - return m_Encoding; - } - uint32_t BaseBitLength() const { - return m_NumBits; - } - - uint32_t GetBitValue() const { - return m_BitValue; - } - void SetBitValue(uint32_t val) { - m_BitValue = val; - } - - uint32_t GetTritValue() const { - return m_TritValue; - } - void SetTritValue(uint32_t val) { - m_TritValue = val; - } - - uint32_t GetQuintValue() const { - return m_QuintValue; - } - void SetQuintValue(uint32_t val) { - m_QuintValue = val; - } - - bool MatchesEncoding(const IntegerEncodedValue& other) const { - return m_Encoding == other.m_Encoding && m_NumBits == other.m_NumBits; + constexpr bool MatchesEncoding(const IntegerEncodedValue& other) const { + return encoding == other.encoding && num_bits == other.num_bits; } // Returns the number of bits required to encode nVals values. - uint32_t GetBitLength(uint32_t nVals) const { - uint32_t totalBits = m_NumBits * nVals; - if (m_Encoding == eIntegerEncoding_Trit) { + u32 GetBitLength(u32 nVals) const { + u32 totalBits = num_bits * nVals; + if (encoding == IntegerEncoding::Trit) { totalBits += (nVals * 8 + 4) / 5; - } else if (m_Encoding == eIntegerEncoding_Quint) { + } else if (encoding == IntegerEncoding::Qus32) { totalBits += (nVals * 7 + 2) / 3; } return totalBits; } - // Count the number of bits set in a number. 
- static inline uint32_t Popcnt(uint32_t n) { - uint32_t c; - for (c = 0; n; c++) { - n &= n - 1; + IntegerEncoding encoding{}; + u32 num_bits = 0; + u32 bit_value = 0; + union { + u32 qus32_value = 0; + u32 trit_value; + }; +}; + +static void DecodeTritBlock(InputBitStream& bits, std::vector<IntegerEncodedValue>& result, + u32 nBitsPerValue) { + // Implement the algorithm in section C.2.12 + u32 m[5]; + u32 t[5]; + u32 T; + + // Read the trit encoded block according to + // table C.2.14 + m[0] = bits.ReadBits(nBitsPerValue); + T = bits.ReadBits<2>(); + m[1] = bits.ReadBits(nBitsPerValue); + T |= bits.ReadBits<2>() << 2; + m[2] = bits.ReadBits(nBitsPerValue); + T |= bits.ReadBit() << 4; + m[3] = bits.ReadBits(nBitsPerValue); + T |= bits.ReadBits<2>() << 5; + m[4] = bits.ReadBits(nBitsPerValue); + T |= bits.ReadBit() << 7; + + u32 C = 0; + + Bits<u32> Tb(T); + if (Tb(2, 4) == 7) { + C = (Tb(5, 7) << 2) | Tb(0, 1); + t[4] = t[3] = 2; + } else { + C = Tb(0, 4); + if (Tb(5, 6) == 3) { + t[4] = 2; + t[3] = Tb[7]; + } else { + t[4] = Tb[7]; + t[3] = Tb(5, 6); } - return c; } - // Returns a new instance of this struct that corresponds to the - // can take no more than maxval values - static IntegerEncodedValue CreateEncoding(uint32_t maxVal) { - while (maxVal > 0) { - uint32_t check = maxVal + 1; - - // Is maxVal a power of two? - if (!(check & (check - 1))) { - return IntegerEncodedValue(eIntegerEncoding_JustBits, Popcnt(maxVal)); - } - - // Is maxVal of the type 3*2^n - 1? - if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) { - return IntegerEncodedValue(eIntegerEncoding_Trit, Popcnt(check / 3 - 1)); - } + Bits<u32> Cb(C); + if (Cb(0, 1) == 3) { + t[2] = 2; + t[1] = Cb[4]; + t[0] = (Cb[3] << 1) | (Cb[2] & ~Cb[3]); + } else if (Cb(2, 3) == 3) { + t[2] = 2; + t[1] = 2; + t[0] = Cb(0, 1); + } else { + t[2] = Cb[4]; + t[1] = Cb(2, 3); + t[0] = (Cb[1] << 1) | (Cb[0] & ~Cb[1]); + } - // Is maxVal of the type 5*2^n - 1? 
- if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) { - return IntegerEncodedValue(eIntegerEncoding_Quint, Popcnt(check / 5 - 1)); - } + for (std::size_t i = 0; i < 5; ++i) { + IntegerEncodedValue& val = result.emplace_back(IntegerEncoding::Trit, nBitsPerValue); + val.bit_value = m[i]; + val.trit_value = t[i]; + } +} - // Apparently it can't be represented with a bounded integer sequence... - // just iterate. - maxVal--; +static void DecodeQus32Block(InputBitStream& bits, std::vector<IntegerEncodedValue>& result, + u32 nBitsPerValue) { + // Implement the algorithm in section C.2.12 + u32 m[3]; + u32 q[3]; + u32 Q; + + // Read the trit encoded block according to + // table C.2.15 + m[0] = bits.ReadBits(nBitsPerValue); + Q = bits.ReadBits<3>(); + m[1] = bits.ReadBits(nBitsPerValue); + Q |= bits.ReadBits<2>() << 3; + m[2] = bits.ReadBits(nBitsPerValue); + Q |= bits.ReadBits<2>() << 5; + + Bits<u32> Qb(Q); + if (Qb(1, 2) == 3 && Qb(5, 6) == 0) { + q[0] = q[1] = 4; + q[2] = (Qb[0] << 2) | ((Qb[4] & ~Qb[0]) << 1) | (Qb[3] & ~Qb[0]); + } else { + u32 C = 0; + if (Qb(1, 2) == 3) { + q[2] = 4; + C = (Qb(3, 4) << 3) | ((~Qb(5, 6) & 3) << 1) | Qb[0]; + } else { + q[2] = Qb(5, 6); + C = Qb(0, 4); } - return IntegerEncodedValue(eIntegerEncoding_JustBits, 0); - } - - // Fills result with the values that are encoded in the given - // bitstream. We must know beforehand what the maximum possible - // value is, and how many values we're decoding. 
- static void DecodeIntegerSequence(std::vector<IntegerEncodedValue>& result, - InputBitStream& bits, uint32_t maxRange, uint32_t nValues) { - // Determine encoding parameters - IntegerEncodedValue val = IntegerEncodedValue::CreateEncoding(maxRange); - - // Start decoding - uint32_t nValsDecoded = 0; - while (nValsDecoded < nValues) { - switch (val.GetEncoding()) { - case eIntegerEncoding_Quint: - DecodeQuintBlock(bits, result, val.BaseBitLength()); - nValsDecoded += 3; - break; - case eIntegerEncoding_Trit: - DecodeTritBlock(bits, result, val.BaseBitLength()); - nValsDecoded += 5; - break; - - case eIntegerEncoding_JustBits: - val.SetBitValue(bits.ReadBits(val.BaseBitLength())); - result.push_back(val); - nValsDecoded++; - break; - } + Bits<u32> Cb(C); + if (Cb(0, 2) == 5) { + q[1] = 4; + q[0] = Cb(3, 4); + } else { + q[1] = Cb(3, 4); + q[0] = Cb(0, 2); } } -private: - static void DecodeTritBlock(InputBitStream& bits, std::vector<IntegerEncodedValue>& result, - uint32_t nBitsPerValue) { - // Implement the algorithm in section C.2.12 - uint32_t m[5]; - uint32_t t[5]; - uint32_t T; - - // Read the trit encoded block according to - // table C.2.14 - m[0] = bits.ReadBits(nBitsPerValue); - T = bits.ReadBits(2); - m[1] = bits.ReadBits(nBitsPerValue); - T |= bits.ReadBits(2) << 2; - m[2] = bits.ReadBits(nBitsPerValue); - T |= bits.ReadBit() << 4; - m[3] = bits.ReadBits(nBitsPerValue); - T |= bits.ReadBits(2) << 5; - m[4] = bits.ReadBits(nBitsPerValue); - T |= bits.ReadBit() << 7; - - uint32_t C = 0; - - Bits<uint32_t> Tb(T); - if (Tb(2, 4) == 7) { - C = (Tb(5, 7) << 2) | Tb(0, 1); - t[4] = t[3] = 2; - } else { - C = Tb(0, 4); - if (Tb(5, 6) == 3) { - t[4] = 2; - t[3] = Tb[7]; - } else { - t[4] = Tb[7]; - t[3] = Tb(5, 6); - } + for (std::size_t i = 0; i < 3; ++i) { + IntegerEncodedValue& val = result.emplace_back(IntegerEncoding::Qus32, nBitsPerValue); + val.bit_value = m[i]; + val.qus32_value = q[i]; + } +} + +// Returns a new instance of this struct that corresponds to 
the +// can take no more than maxval values +static constexpr IntegerEncodedValue CreateEncoding(u32 maxVal) { + while (maxVal > 0) { + u32 check = maxVal + 1; + + // Is maxVal a power of two? + if (!(check & (check - 1))) { + return IntegerEncodedValue(IntegerEncoding::JustBits, Popcnt(maxVal)); } - Bits<uint32_t> Cb(C); - if (Cb(0, 1) == 3) { - t[2] = 2; - t[1] = Cb[4]; - t[0] = (Cb[3] << 1) | (Cb[2] & ~Cb[3]); - } else if (Cb(2, 3) == 3) { - t[2] = 2; - t[1] = 2; - t[0] = Cb(0, 1); - } else { - t[2] = Cb[4]; - t[1] = Cb(2, 3); - t[0] = (Cb[1] << 1) | (Cb[0] & ~Cb[1]); + // Is maxVal of the type 3*2^n - 1? + if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) { + return IntegerEncodedValue(IntegerEncoding::Trit, Popcnt(check / 3 - 1)); } - for (uint32_t i = 0; i < 5; i++) { - IntegerEncodedValue val(eIntegerEncoding_Trit, nBitsPerValue); - val.SetBitValue(m[i]); - val.SetTritValue(t[i]); - result.push_back(val); + // Is maxVal of the type 5*2^n - 1? + if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) { + return IntegerEncodedValue(IntegerEncoding::Qus32, Popcnt(check / 5 - 1)); } + + // Apparently it can't be represented with a bounded integer sequence... + // just iterate. 
+ maxVal--; } + return IntegerEncodedValue(IntegerEncoding::JustBits, 0); +} - static void DecodeQuintBlock(InputBitStream& bits, std::vector<IntegerEncodedValue>& result, - uint32_t nBitsPerValue) { - // Implement the algorithm in section C.2.12 - uint32_t m[3]; - uint32_t q[3]; - uint32_t Q; - - // Read the trit encoded block according to - // table C.2.15 - m[0] = bits.ReadBits(nBitsPerValue); - Q = bits.ReadBits(3); - m[1] = bits.ReadBits(nBitsPerValue); - Q |= bits.ReadBits(2) << 3; - m[2] = bits.ReadBits(nBitsPerValue); - Q |= bits.ReadBits(2) << 5; - - Bits<uint32_t> Qb(Q); - if (Qb(1, 2) == 3 && Qb(5, 6) == 0) { - q[0] = q[1] = 4; - q[2] = (Qb[0] << 2) | ((Qb[4] & ~Qb[0]) << 1) | (Qb[3] & ~Qb[0]); - } else { - uint32_t C = 0; - if (Qb(1, 2) == 3) { - q[2] = 4; - C = (Qb(3, 4) << 3) | ((~Qb(5, 6) & 3) << 1) | Qb[0]; - } else { - q[2] = Qb(5, 6); - C = Qb(0, 4); - } +static constexpr std::array<IntegerEncodedValue, 256> MakeEncodedValues() { + std::array<IntegerEncodedValue, 256> encodings{}; + for (std::size_t i = 0; i < encodings.size(); ++i) { + encodings[i] = CreateEncoding(static_cast<u32>(i)); + } + return encodings; +} - Bits<uint32_t> Cb(C); - if (Cb(0, 2) == 5) { - q[1] = 4; - q[0] = Cb(3, 4); - } else { - q[1] = Cb(3, 4); - q[0] = Cb(0, 2); - } - } +static constexpr std::array EncodingsValues = MakeEncodedValues(); + +// Fills result with the values that are encoded in the given +// bitstream. We must know beforehand what the maximum possible +// value is, and how many values we're decoding. 
+static void DecodeIntegerSequence(std::vector<IntegerEncodedValue>& result, InputBitStream& bits, + u32 maxRange, u32 nValues) { + // Determine encoding parameters + IntegerEncodedValue val = EncodingsValues[maxRange]; + + // Start decoding + u32 nValsDecoded = 0; + while (nValsDecoded < nValues) { + switch (val.encoding) { + case IntegerEncoding::Qus32: + DecodeQus32Block(bits, result, val.num_bits); + nValsDecoded += 3; + break; + + case IntegerEncoding::Trit: + DecodeTritBlock(bits, result, val.num_bits); + nValsDecoded += 5; + break; - for (uint32_t i = 0; i < 3; i++) { - IntegerEncodedValue val(eIntegerEncoding_Quint, nBitsPerValue); - val.m_BitValue = m[i]; - val.m_QuintValue = q[i]; + case IntegerEncoding::JustBits: + val.bit_value = bits.ReadBits(val.num_bits); result.push_back(val); + nValsDecoded++; + break; } } -}; +} namespace ASTCC { struct TexelWeightParams { - uint32_t m_Width = 0; - uint32_t m_Height = 0; + u32 m_Width = 0; + u32 m_Height = 0; bool m_bDualPlane = false; - uint32_t m_MaxWeight = 0; + u32 m_MaxWeight = 0; bool m_bError = false; bool m_bVoidExtentLDR = false; bool m_bVoidExtentHDR = false; - uint32_t GetPackedBitSize() const { + u32 GetPackedBitSize() const { // How many indices do we have? - uint32_t nIdxs = m_Height * m_Width; + u32 nIdxs = m_Height * m_Width; if (m_bDualPlane) { nIdxs *= 2; } - return IntegerEncodedValue::CreateEncoding(m_MaxWeight).GetBitLength(nIdxs); + return EncodingsValues[m_MaxWeight].GetBitLength(nIdxs); } - uint32_t GetNumWeightValues() const { - uint32_t ret = m_Width * m_Height; + u32 GetNumWeightValues() const { + u32 ret = m_Width * m_Height; if (m_bDualPlane) { ret *= 2; } @@ -422,7 +405,7 @@ static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) { TexelWeightParams params; // Read the entire block mode all at once - uint16_t modeBits = static_cast<uint16_t>(strm.ReadBits(11)); + u16 modeBits = static_cast<u16>(strm.ReadBits<11>()); // Does this match the void extent block mode? 
if ((modeBits & 0x01FF) == 0x1FC) { @@ -457,7 +440,7 @@ static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) { // of the block mode. Layout is determined by a number // between 0 and 9 corresponding to table C.2.8 of the // ASTC spec. - uint32_t layout = 0; + u32 layout = 0; if ((modeBits & 0x1) || (modeBits & 0x2)) { // layout is in [0-4] @@ -509,7 +492,7 @@ static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) { assert(layout < 10); // Determine R - uint32_t R = !!(modeBits & 0x10); + u32 R = !!(modeBits & 0x10); if (layout < 5) { R |= (modeBits & 0x3) << 1; } else { @@ -520,54 +503,54 @@ static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) { // Determine width & height switch (layout) { case 0: { - uint32_t A = (modeBits >> 5) & 0x3; - uint32_t B = (modeBits >> 7) & 0x3; + u32 A = (modeBits >> 5) & 0x3; + u32 B = (modeBits >> 7) & 0x3; params.m_Width = B + 4; params.m_Height = A + 2; break; } case 1: { - uint32_t A = (modeBits >> 5) & 0x3; - uint32_t B = (modeBits >> 7) & 0x3; + u32 A = (modeBits >> 5) & 0x3; + u32 B = (modeBits >> 7) & 0x3; params.m_Width = B + 8; params.m_Height = A + 2; break; } case 2: { - uint32_t A = (modeBits >> 5) & 0x3; - uint32_t B = (modeBits >> 7) & 0x3; + u32 A = (modeBits >> 5) & 0x3; + u32 B = (modeBits >> 7) & 0x3; params.m_Width = A + 2; params.m_Height = B + 8; break; } case 3: { - uint32_t A = (modeBits >> 5) & 0x3; - uint32_t B = (modeBits >> 7) & 0x1; + u32 A = (modeBits >> 5) & 0x3; + u32 B = (modeBits >> 7) & 0x1; params.m_Width = A + 2; params.m_Height = B + 6; break; } case 4: { - uint32_t A = (modeBits >> 5) & 0x3; - uint32_t B = (modeBits >> 7) & 0x1; + u32 A = (modeBits >> 5) & 0x3; + u32 B = (modeBits >> 7) & 0x1; params.m_Width = B + 2; params.m_Height = A + 2; break; } case 5: { - uint32_t A = (modeBits >> 5) & 0x3; + u32 A = (modeBits >> 5) & 0x3; params.m_Width = 12; params.m_Height = A + 2; break; } case 6: { - uint32_t A = (modeBits >> 5) & 0x3; + u32 A = (modeBits >> 5) & 0x3; 
params.m_Width = A + 2; params.m_Height = 12; break; @@ -586,15 +569,15 @@ static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) { } case 9: { - uint32_t A = (modeBits >> 5) & 0x3; - uint32_t B = (modeBits >> 9) & 0x3; + u32 A = (modeBits >> 5) & 0x3; + u32 B = (modeBits >> 9) & 0x3; params.m_Width = A + 6; params.m_Height = B + 6; break; } default: - assert(!"Don't know this layout..."); + assert(false && "Don't know this layout..."); params.m_bError = true; break; } @@ -605,10 +588,10 @@ static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) { bool H = (layout != 9) && (modeBits & 0x200); if (H) { - const uint32_t maxWeights[6] = {9, 11, 15, 19, 23, 31}; + const u32 maxWeights[6] = {9, 11, 15, 19, 23, 31}; params.m_MaxWeight = maxWeights[R - 2]; } else { - const uint32_t maxWeights[6] = {1, 2, 3, 4, 5, 7}; + const u32 maxWeights[6] = {1, 2, 3, 4, 5, 7}; params.m_MaxWeight = maxWeights[R - 2]; } @@ -617,32 +600,32 @@ static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) { return params; } -static void FillVoidExtentLDR(InputBitStream& strm, uint32_t* const outBuf, uint32_t blockWidth, - uint32_t blockHeight) { +static void FillVoidExtentLDR(InputBitStream& strm, u32* const outBuf, u32 blockWidth, + u32 blockHeight) { // Don't actually care about the void extent, just read the bits... 
- for (int i = 0; i < 4; ++i) { - strm.ReadBits(13); + for (s32 i = 0; i < 4; ++i) { + strm.ReadBits<13>(); } // Decode the RGBA components and renormalize them to the range [0, 255] - uint16_t r = static_cast<uint16_t>(strm.ReadBits(16)); - uint16_t g = static_cast<uint16_t>(strm.ReadBits(16)); - uint16_t b = static_cast<uint16_t>(strm.ReadBits(16)); - uint16_t a = static_cast<uint16_t>(strm.ReadBits(16)); + u16 r = static_cast<u16>(strm.ReadBits<16>()); + u16 g = static_cast<u16>(strm.ReadBits<16>()); + u16 b = static_cast<u16>(strm.ReadBits<16>()); + u16 a = static_cast<u16>(strm.ReadBits<16>()); - uint32_t rgba = (r >> 8) | (g & 0xFF00) | (static_cast<uint32_t>(b) & 0xFF00) << 8 | - (static_cast<uint32_t>(a) & 0xFF00) << 16; + u32 rgba = (r >> 8) | (g & 0xFF00) | (static_cast<u32>(b) & 0xFF00) << 8 | + (static_cast<u32>(a) & 0xFF00) << 16; - for (uint32_t j = 0; j < blockHeight; j++) { - for (uint32_t i = 0; i < blockWidth; i++) { + for (u32 j = 0; j < blockHeight; j++) { + for (u32 i = 0; i < blockWidth; i++) { outBuf[j * blockWidth + i] = rgba; } } } -static void FillError(uint32_t* outBuf, uint32_t blockWidth, uint32_t blockHeight) { - for (uint32_t j = 0; j < blockHeight; j++) { - for (uint32_t i = 0; i < blockWidth; i++) { +static void FillError(u32* outBuf, u32 blockWidth, u32 blockHeight) { + for (u32 j = 0; j < blockHeight; j++) { + for (u32 i = 0; i < blockWidth; i++) { outBuf[j * blockWidth + i] = 0xFFFF00FF; } } @@ -651,18 +634,18 @@ static void FillError(uint32_t* outBuf, uint32_t blockWidth, uint32_t blockHeigh // Replicates low numBits such that [(toBit - 1):(toBit - 1 - fromBit)] // is the same as [(numBits - 1):0] and repeats all the way down. 
template <typename IntType> -static IntType Replicate(const IntType& val, uint32_t numBits, uint32_t toBit) { +static IntType Replicate(IntType val, u32 numBits, u32 toBit) { if (numBits == 0) return 0; if (toBit == 0) return 0; IntType v = val & static_cast<IntType>((1 << numBits) - 1); IntType res = v; - uint32_t reslen = numBits; + u32 reslen = numBits; while (reslen < toBit) { - uint32_t comp = 0; + u32 comp = 0; if (numBits > toBit - reslen) { - uint32_t newshift = toBit - reslen; + u32 newshift = toBit - reslen; comp = numBits - newshift; numBits = newshift; } @@ -675,14 +658,14 @@ static IntType Replicate(const IntType& val, uint32_t numBits, uint32_t toBit) { class Pixel { protected: - using ChannelType = int16_t; - uint8_t m_BitDepth[4] = {8, 8, 8, 8}; - int16_t color[4] = {}; + using ChannelType = s16; + u8 m_BitDepth[4] = {8, 8, 8, 8}; + s16 color[4] = {}; public: Pixel() = default; - Pixel(uint32_t a, uint32_t r, uint32_t g, uint32_t b, unsigned bitDepth = 8) - : m_BitDepth{uint8_t(bitDepth), uint8_t(bitDepth), uint8_t(bitDepth), uint8_t(bitDepth)}, + Pixel(u32 a, u32 r, u32 g, u32 b, u32 bitDepth = 8) + : m_BitDepth{u8(bitDepth), u8(bitDepth), u8(bitDepth), u8(bitDepth)}, color{static_cast<ChannelType>(a), static_cast<ChannelType>(r), static_cast<ChannelType>(g), static_cast<ChannelType>(b)} {} @@ -691,22 +674,22 @@ public: // significant bits when going from larger to smaller bit depth // or by repeating the most significant bits when going from // smaller to larger bit depths. 
- void ChangeBitDepth(const uint8_t (&depth)[4]) { - for (uint32_t i = 0; i < 4; i++) { + void ChangeBitDepth(const u8 (&depth)[4]) { + for (u32 i = 0; i < 4; i++) { Component(i) = ChangeBitDepth(Component(i), m_BitDepth[i], depth[i]); m_BitDepth[i] = depth[i]; } } template <typename IntType> - static float ConvertChannelToFloat(IntType channel, uint8_t bitDepth) { + static float ConvertChannelToFloat(IntType channel, u8 bitDepth) { float denominator = static_cast<float>((1 << bitDepth) - 1); return static_cast<float>(channel) / denominator; } // Changes the bit depth of a single component. See the comment // above for how we do this. - static ChannelType ChangeBitDepth(Pixel::ChannelType val, uint8_t oldDepth, uint8_t newDepth) { + static ChannelType ChangeBitDepth(Pixel::ChannelType val, u8 oldDepth, u8 newDepth) { assert(newDepth <= 8); assert(oldDepth <= 8); @@ -722,16 +705,15 @@ public: if (newDepth == 0) { return 0xFF; } else { - uint8_t bitsWasted = static_cast<uint8_t>(oldDepth - newDepth); - uint16_t v = static_cast<uint16_t>(val); - v = static_cast<uint16_t>((v + (1 << (bitsWasted - 1))) >> bitsWasted); - v = ::std::min<uint16_t>(::std::max<uint16_t>(0, v), - static_cast<uint16_t>((1 << newDepth) - 1)); - return static_cast<uint8_t>(v); + u8 bitsWasted = static_cast<u8>(oldDepth - newDepth); + u16 v = static_cast<u16>(val); + v = static_cast<u16>((v + (1 << (bitsWasted - 1))) >> bitsWasted); + v = ::std::min<u16>(::std::max<u16>(0, v), static_cast<u16>((1 << newDepth) - 1)); + return static_cast<u8>(v); } } - assert(!"We shouldn't get here."); + assert(false && "We shouldn't get here."); return 0; } @@ -759,15 +741,15 @@ public: ChannelType& B() { return color[3]; } - const ChannelType& Component(uint32_t idx) const { + const ChannelType& Component(u32 idx) const { return color[idx]; } - ChannelType& Component(uint32_t idx) { + ChannelType& Component(u32 idx) { return color[idx]; } - void GetBitDepth(uint8_t (&outDepth)[4]) const { - for (int i = 0; i < 
4; i++) { + void GetBitDepth(u8 (&outDepth)[4]) const { + for (s32 i = 0; i < 4; i++) { outDepth[i] = m_BitDepth[i]; } } @@ -776,12 +758,12 @@ public: // and then pack each channel into an R8G8B8A8 32-bit integer. We assume // that the architecture is little-endian, so the alpha channel will end // up in the most-significant byte. - uint32_t Pack() const { + u32 Pack() const { Pixel eightBit(*this); - const uint8_t eightBitDepth[4] = {8, 8, 8, 8}; + const u8 eightBitDepth[4] = {8, 8, 8, 8}; eightBit.ChangeBitDepth(eightBitDepth); - uint32_t r = 0; + u32 r = 0; r |= eightBit.A(); r <<= 8; r |= eightBit.B(); @@ -794,7 +776,7 @@ public: // Clamps the pixel to the range [0,255] void ClampByte() { - for (uint32_t i = 0; i < 4; i++) { + for (u32 i = 0; i < 4; i++) { color[i] = (color[i] < 0) ? 0 : ((color[i] > 255) ? 255 : color[i]); } } @@ -804,24 +786,24 @@ public: } }; -static void DecodeColorValues(uint32_t* out, uint8_t* data, const uint32_t* modes, - const uint32_t nPartitions, const uint32_t nBitsForColorData) { +static void DecodeColorValues(u32* out, u8* data, const u32* modes, const u32 nPartitions, + const u32 nBitsForColorData) { // First figure out how many color values we have - uint32_t nValues = 0; - for (uint32_t i = 0; i < nPartitions; i++) { + u32 nValues = 0; + for (u32 i = 0; i < nPartitions; i++) { nValues += ((modes[i] >> 2) + 1) << 1; } // Then based on the number of values and the remaining number of bits, // figure out the max value for each of them... 
- uint32_t range = 256; + u32 range = 256; while (--range > 0) { - IntegerEncodedValue val = IntegerEncodedValue::CreateEncoding(range); - uint32_t bitLength = val.GetBitLength(nValues); + IntegerEncodedValue val = EncodingsValues[range]; + u32 bitLength = val.GetBitLength(nValues); if (bitLength <= nBitsForColorData) { // Find the smallest possible range that matches the given encoding while (--range > 0) { - IntegerEncodedValue newval = IntegerEncodedValue::CreateEncoding(range); + IntegerEncodedValue newval = EncodingsValues[range]; if (!newval.MatchesEncoding(val)) { break; } @@ -835,12 +817,14 @@ static void DecodeColorValues(uint32_t* out, uint8_t* data, const uint32_t* mode // We now have enough to decode our integer sequence. std::vector<IntegerEncodedValue> decodedColorValues; + decodedColorValues.reserve(32); + InputBitStream colorStream(data); - IntegerEncodedValue::DecodeIntegerSequence(decodedColorValues, colorStream, range, nValues); + DecodeIntegerSequence(decodedColorValues, colorStream, range, nValues); // Once we have the decoded values, we need to dequantize them to the 0-255 range // This procedure is outlined in ASTC spec C.2.13 - uint32_t outIdx = 0; + u32 outIdx = 0; for (auto itr = decodedColorValues.begin(); itr != decodedColorValues.end(); ++itr) { // Have we already decoded all that we need? if (outIdx >= nValues) { @@ -848,25 +832,25 @@ static void DecodeColorValues(uint32_t* out, uint8_t* data, const uint32_t* mode } const IntegerEncodedValue& val = *itr; - uint32_t bitlen = val.BaseBitLength(); - uint32_t bitval = val.GetBitValue(); + u32 bitlen = val.num_bits; + u32 bitval = val.bit_value; assert(bitlen >= 1); - uint32_t A = 0, B = 0, C = 0, D = 0; + u32 A = 0, B = 0, C = 0, D = 0; // A is just the lsb replicated 9 times. 
A = Replicate(bitval & 1, 1, 9); - switch (val.GetEncoding()) { + switch (val.encoding) { // Replicate bits - case eIntegerEncoding_JustBits: + case IntegerEncoding::JustBits: out[outIdx++] = Replicate(bitval, bitlen, 8); break; // Use algorithm in C.2.13 - case eIntegerEncoding_Trit: { + case IntegerEncoding::Trit: { - D = val.GetTritValue(); + D = val.trit_value; switch (bitlen) { case 1: { @@ -876,48 +860,48 @@ static void DecodeColorValues(uint32_t* out, uint8_t* data, const uint32_t* mode case 2: { C = 93; // B = b000b0bb0 - uint32_t b = (bitval >> 1) & 1; + u32 b = (bitval >> 1) & 1; B = (b << 8) | (b << 4) | (b << 2) | (b << 1); } break; case 3: { C = 44; // B = cb000cbcb - uint32_t cb = (bitval >> 1) & 3; + u32 cb = (bitval >> 1) & 3; B = (cb << 7) | (cb << 2) | cb; } break; case 4: { C = 22; // B = dcb000dcb - uint32_t dcb = (bitval >> 1) & 7; + u32 dcb = (bitval >> 1) & 7; B = (dcb << 6) | dcb; } break; case 5: { C = 11; // B = edcb000ed - uint32_t edcb = (bitval >> 1) & 0xF; + u32 edcb = (bitval >> 1) & 0xF; B = (edcb << 5) | (edcb >> 2); } break; case 6: { C = 5; // B = fedcb000f - uint32_t fedcb = (bitval >> 1) & 0x1F; + u32 fedcb = (bitval >> 1) & 0x1F; B = (fedcb << 4) | (fedcb >> 4); } break; default: - assert(!"Unsupported trit encoding for color values!"); + assert(false && "Unsupported trit encoding for color values!"); break; } // switch(bitlen) - } // case eIntegerEncoding_Trit + } // case IntegerEncoding::Trit break; - case eIntegerEncoding_Quint: { + case IntegerEncoding::Qus32: { - D = val.GetQuintValue(); + D = val.qus32_value; switch (bitlen) { case 1: { @@ -927,41 +911,41 @@ static void DecodeColorValues(uint32_t* out, uint8_t* data, const uint32_t* mode case 2: { C = 54; // B = b0000bb00 - uint32_t b = (bitval >> 1) & 1; + u32 b = (bitval >> 1) & 1; B = (b << 8) | (b << 3) | (b << 2); } break; case 3: { C = 26; // B = cb0000cbc - uint32_t cb = (bitval >> 1) & 3; + u32 cb = (bitval >> 1) & 3; B = (cb << 7) | (cb << 1) | (cb >> 1); } 
break; case 4: { C = 13; // B = dcb0000dc - uint32_t dcb = (bitval >> 1) & 7; + u32 dcb = (bitval >> 1) & 7; B = (dcb << 6) | (dcb >> 1); } break; case 5: { C = 6; // B = edcb0000e - uint32_t edcb = (bitval >> 1) & 0xF; + u32 edcb = (bitval >> 1) & 0xF; B = (edcb << 5) | (edcb >> 3); } break; default: - assert(!"Unsupported quint encoding for color values!"); + assert(false && "Unsupported quint encoding for color values!"); break; } // switch(bitlen) - } // case eIntegerEncoding_Quint + } // case IntegerEncoding::Qus32 break; - } // switch(val.GetEncoding()) + } // switch(val.encoding) - if (val.GetEncoding() != eIntegerEncoding_JustBits) { - uint32_t T = D * C + B; + if (val.encoding != IntegerEncoding::JustBits) { + u32 T = D * C + B; T ^= A; T = (A & 0x80) | (T >> 2); out[outIdx++] = T; @@ -969,31 +953,31 @@ static void DecodeColorValues(uint32_t* out, uint8_t* data, const uint32_t* mode } // Make sure that each of our values is in the proper range... - for (uint32_t i = 0; i < nValues; i++) { + for (u32 i = 0; i < nValues; i++) { assert(out[i] <= 255); } } -static uint32_t UnquantizeTexelWeight(const IntegerEncodedValue& val) { - uint32_t bitval = val.GetBitValue(); - uint32_t bitlen = val.BaseBitLength(); +static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) { + u32 bitval = val.bit_value; + u32 bitlen = val.num_bits; - uint32_t A = Replicate(bitval & 1, 1, 7); - uint32_t B = 0, C = 0, D = 0; + u32 A = Replicate(bitval & 1, 1, 7); + u32 B = 0, C = 0, D = 0; - uint32_t result = 0; - switch (val.GetEncoding()) { - case eIntegerEncoding_JustBits: + u32 result = 0; + switch (val.encoding) { + case IntegerEncoding::JustBits: result = Replicate(bitval, bitlen, 6); break; - case eIntegerEncoding_Trit: { - D = val.GetTritValue(); + case IntegerEncoding::Trit: { + D = val.trit_value; assert(D < 3); switch (bitlen) { case 0: { - uint32_t results[3] = {0, 32, 63}; + u32 results[3] = {0, 32, 63}; result = results[D]; } break; @@ -1003,29 +987,29 @@ static 
uint32_t UnquantizeTexelWeight(const IntegerEncodedValue& val) { case 2: { C = 23; - uint32_t b = (bitval >> 1) & 1; + u32 b = (bitval >> 1) & 1; B = (b << 6) | (b << 2) | b; } break; case 3: { C = 11; - uint32_t cb = (bitval >> 1) & 3; + u32 cb = (bitval >> 1) & 3; B = (cb << 5) | cb; } break; default: - assert(!"Invalid trit encoding for texel weight"); + assert(false && "Invalid trit encoding for texel weight"); break; } } break; - case eIntegerEncoding_Quint: { - D = val.GetQuintValue(); + case IntegerEncoding::Qus32: { + D = val.qus32_value; assert(D < 5); switch (bitlen) { case 0: { - uint32_t results[5] = {0, 16, 32, 47, 63}; + u32 results[5] = {0, 16, 32, 47, 63}; result = results[D]; } break; @@ -1035,18 +1019,18 @@ static uint32_t UnquantizeTexelWeight(const IntegerEncodedValue& val) { case 2: { C = 13; - uint32_t b = (bitval >> 1) & 1; + u32 b = (bitval >> 1) & 1; B = (b << 6) | (b << 1); } break; default: - assert(!"Invalid quint encoding for texel weight"); + assert(false && "Invalid quint encoding for texel weight"); break; } } break; } - if (val.GetEncoding() != eIntegerEncoding_JustBits && bitlen > 0) { + if (val.encoding != IntegerEncoding::JustBits && bitlen > 0) { // Decode the value... 
result = D * C + B; result ^= A; @@ -1063,12 +1047,11 @@ static uint32_t UnquantizeTexelWeight(const IntegerEncodedValue& val) { return result; } -static void UnquantizeTexelWeights(uint32_t out[2][144], - const std::vector<IntegerEncodedValue>& weights, - const TexelWeightParams& params, const uint32_t blockWidth, - const uint32_t blockHeight) { - uint32_t weightIdx = 0; - uint32_t unquantized[2][144]; +static void UnquantizeTexelWeights(u32 out[2][144], const std::vector<IntegerEncodedValue>& weights, + const TexelWeightParams& params, const u32 blockWidth, + const u32 blockHeight) { + u32 weightIdx = 0; + u32 unquantized[2][144]; for (auto itr = weights.begin(); itr != weights.end(); ++itr) { unquantized[0][weightIdx] = UnquantizeTexelWeight(*itr); @@ -1086,34 +1069,34 @@ static void UnquantizeTexelWeights(uint32_t out[2][144], } // Do infill if necessary (Section C.2.18) ... - uint32_t Ds = (1024 + (blockWidth / 2)) / (blockWidth - 1); - uint32_t Dt = (1024 + (blockHeight / 2)) / (blockHeight - 1); + u32 Ds = (1024 + (blockWidth / 2)) / (blockWidth - 1); + u32 Dt = (1024 + (blockHeight / 2)) / (blockHeight - 1); - const uint32_t kPlaneScale = params.m_bDualPlane ? 2U : 1U; - for (uint32_t plane = 0; plane < kPlaneScale; plane++) - for (uint32_t t = 0; t < blockHeight; t++) - for (uint32_t s = 0; s < blockWidth; s++) { - uint32_t cs = Ds * s; - uint32_t ct = Dt * t; + const u32 kPlaneScale = params.m_bDualPlane ? 
2U : 1U; + for (u32 plane = 0; plane < kPlaneScale; plane++) + for (u32 t = 0; t < blockHeight; t++) + for (u32 s = 0; s < blockWidth; s++) { + u32 cs = Ds * s; + u32 ct = Dt * t; - uint32_t gs = (cs * (params.m_Width - 1) + 32) >> 6; - uint32_t gt = (ct * (params.m_Height - 1) + 32) >> 6; + u32 gs = (cs * (params.m_Width - 1) + 32) >> 6; + u32 gt = (ct * (params.m_Height - 1) + 32) >> 6; - uint32_t js = gs >> 4; - uint32_t fs = gs & 0xF; + u32 js = gs >> 4; + u32 fs = gs & 0xF; - uint32_t jt = gt >> 4; - uint32_t ft = gt & 0x0F; + u32 jt = gt >> 4; + u32 ft = gt & 0x0F; - uint32_t w11 = (fs * ft + 8) >> 4; - uint32_t w10 = ft - w11; - uint32_t w01 = fs - w11; - uint32_t w00 = 16 - fs - ft + w11; + u32 w11 = (fs * ft + 8) >> 4; + u32 w10 = ft - w11; + u32 w01 = fs - w11; + u32 w00 = 16 - fs - ft + w11; - uint32_t v0 = js + jt * params.m_Width; + u32 v0 = js + jt * params.m_Width; #define FIND_TEXEL(tidx, bidx) \ - uint32_t p##bidx = 0; \ + u32 p##bidx = 0; \ do { \ if ((tidx) < (params.m_Width * params.m_Height)) { \ p##bidx = unquantized[plane][(tidx)]; \ @@ -1133,7 +1116,7 @@ static void UnquantizeTexelWeights(uint32_t out[2][144], } // Transfers a bit as described in C.2.14 -static inline void BitTransferSigned(int32_t& a, int32_t& b) { +static inline void BitTransferSigned(s32& a, s32& b) { b >>= 1; b |= a & 0x80; a >>= 1; @@ -1144,14 +1127,14 @@ static inline void BitTransferSigned(int32_t& a, int32_t& b) { // Adds more precision to the blue channel as described // in C.2.14 -static inline Pixel BlueContract(int32_t a, int32_t r, int32_t g, int32_t b) { - return Pixel(static_cast<int16_t>(a), static_cast<int16_t>((r + b) >> 1), - static_cast<int16_t>((g + b) >> 1), static_cast<int16_t>(b)); +static inline Pixel BlueContract(s32 a, s32 r, s32 g, s32 b) { + return Pixel(static_cast<s16>(a), static_cast<s16>((r + b) >> 1), + static_cast<s16>((g + b) >> 1), static_cast<s16>(b)); } // Partition selection functions as specified in // C.2.21 -static inline uint32_t 
hash52(uint32_t p) { +static inline u32 hash52(u32 p) { p ^= p >> 15; p -= p << 17; p += p << 7; @@ -1165,8 +1148,7 @@ static inline uint32_t hash52(uint32_t p) { return p; } -static uint32_t SelectPartition(int32_t seed, int32_t x, int32_t y, int32_t z, - int32_t partitionCount, int32_t smallBlock) { +static u32 SelectPartition(s32 seed, s32 x, s32 y, s32 z, s32 partitionCount, s32 smallBlock) { if (1 == partitionCount) return 0; @@ -1178,34 +1160,34 @@ static uint32_t SelectPartition(int32_t seed, int32_t x, int32_t y, int32_t z, seed += (partitionCount - 1) * 1024; - uint32_t rnum = hash52(static_cast<uint32_t>(seed)); - uint8_t seed1 = static_cast<uint8_t>(rnum & 0xF); - uint8_t seed2 = static_cast<uint8_t>((rnum >> 4) & 0xF); - uint8_t seed3 = static_cast<uint8_t>((rnum >> 8) & 0xF); - uint8_t seed4 = static_cast<uint8_t>((rnum >> 12) & 0xF); - uint8_t seed5 = static_cast<uint8_t>((rnum >> 16) & 0xF); - uint8_t seed6 = static_cast<uint8_t>((rnum >> 20) & 0xF); - uint8_t seed7 = static_cast<uint8_t>((rnum >> 24) & 0xF); - uint8_t seed8 = static_cast<uint8_t>((rnum >> 28) & 0xF); - uint8_t seed9 = static_cast<uint8_t>((rnum >> 18) & 0xF); - uint8_t seed10 = static_cast<uint8_t>((rnum >> 22) & 0xF); - uint8_t seed11 = static_cast<uint8_t>((rnum >> 26) & 0xF); - uint8_t seed12 = static_cast<uint8_t>(((rnum >> 30) | (rnum << 2)) & 0xF); - - seed1 = static_cast<uint8_t>(seed1 * seed1); - seed2 = static_cast<uint8_t>(seed2 * seed2); - seed3 = static_cast<uint8_t>(seed3 * seed3); - seed4 = static_cast<uint8_t>(seed4 * seed4); - seed5 = static_cast<uint8_t>(seed5 * seed5); - seed6 = static_cast<uint8_t>(seed6 * seed6); - seed7 = static_cast<uint8_t>(seed7 * seed7); - seed8 = static_cast<uint8_t>(seed8 * seed8); - seed9 = static_cast<uint8_t>(seed9 * seed9); - seed10 = static_cast<uint8_t>(seed10 * seed10); - seed11 = static_cast<uint8_t>(seed11 * seed11); - seed12 = static_cast<uint8_t>(seed12 * seed12); - - int32_t sh1, sh2, sh3; + u32 rnum = 
hash52(static_cast<u32>(seed)); + u8 seed1 = static_cast<u8>(rnum & 0xF); + u8 seed2 = static_cast<u8>((rnum >> 4) & 0xF); + u8 seed3 = static_cast<u8>((rnum >> 8) & 0xF); + u8 seed4 = static_cast<u8>((rnum >> 12) & 0xF); + u8 seed5 = static_cast<u8>((rnum >> 16) & 0xF); + u8 seed6 = static_cast<u8>((rnum >> 20) & 0xF); + u8 seed7 = static_cast<u8>((rnum >> 24) & 0xF); + u8 seed8 = static_cast<u8>((rnum >> 28) & 0xF); + u8 seed9 = static_cast<u8>((rnum >> 18) & 0xF); + u8 seed10 = static_cast<u8>((rnum >> 22) & 0xF); + u8 seed11 = static_cast<u8>((rnum >> 26) & 0xF); + u8 seed12 = static_cast<u8>(((rnum >> 30) | (rnum << 2)) & 0xF); + + seed1 = static_cast<u8>(seed1 * seed1); + seed2 = static_cast<u8>(seed2 * seed2); + seed3 = static_cast<u8>(seed3 * seed3); + seed4 = static_cast<u8>(seed4 * seed4); + seed5 = static_cast<u8>(seed5 * seed5); + seed6 = static_cast<u8>(seed6 * seed6); + seed7 = static_cast<u8>(seed7 * seed7); + seed8 = static_cast<u8>(seed8 * seed8); + seed9 = static_cast<u8>(seed9 * seed9); + seed10 = static_cast<u8>(seed10 * seed10); + seed11 = static_cast<u8>(seed11 * seed11); + seed12 = static_cast<u8>(seed12 * seed12); + + s32 sh1, sh2, sh3; if (seed & 1) { sh1 = (seed & 2) ? 4 : 5; sh2 = (partitionCount == 3) ? 6 : 5; @@ -1215,23 +1197,23 @@ static uint32_t SelectPartition(int32_t seed, int32_t x, int32_t y, int32_t z, } sh3 = (seed & 0x10) ? 
sh1 : sh2; - seed1 = static_cast<uint8_t>(seed1 >> sh1); - seed2 = static_cast<uint8_t>(seed2 >> sh2); - seed3 = static_cast<uint8_t>(seed3 >> sh1); - seed4 = static_cast<uint8_t>(seed4 >> sh2); - seed5 = static_cast<uint8_t>(seed5 >> sh1); - seed6 = static_cast<uint8_t>(seed6 >> sh2); - seed7 = static_cast<uint8_t>(seed7 >> sh1); - seed8 = static_cast<uint8_t>(seed8 >> sh2); - seed9 = static_cast<uint8_t>(seed9 >> sh3); - seed10 = static_cast<uint8_t>(seed10 >> sh3); - seed11 = static_cast<uint8_t>(seed11 >> sh3); - seed12 = static_cast<uint8_t>(seed12 >> sh3); - - int32_t a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14); - int32_t b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10); - int32_t c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6); - int32_t d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2); + seed1 = static_cast<u8>(seed1 >> sh1); + seed2 = static_cast<u8>(seed2 >> sh2); + seed3 = static_cast<u8>(seed3 >> sh1); + seed4 = static_cast<u8>(seed4 >> sh2); + seed5 = static_cast<u8>(seed5 >> sh1); + seed6 = static_cast<u8>(seed6 >> sh2); + seed7 = static_cast<u8>(seed7 >> sh1); + seed8 = static_cast<u8>(seed8 >> sh2); + seed9 = static_cast<u8>(seed9 >> sh3); + seed10 = static_cast<u8>(seed10 >> sh3); + seed11 = static_cast<u8>(seed11 >> sh3); + seed12 = static_cast<u8>(seed12 >> sh3); + + s32 a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14); + s32 b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10); + s32 c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6); + s32 d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2); a &= 0x3F; b &= 0x3F; @@ -1252,27 +1234,26 @@ static uint32_t SelectPartition(int32_t seed, int32_t x, int32_t y, int32_t z, return 3; } -static inline uint32_t Select2DPartition(int32_t seed, int32_t x, int32_t y, int32_t partitionCount, - int32_t smallBlock) { +static inline u32 Select2DPartition(s32 seed, s32 x, s32 y, s32 partitionCount, s32 smallBlock) { return SelectPartition(seed, x, y, 0, partitionCount, smallBlock); } 
// Section C.2.14 -static void ComputeEndpoints(Pixel& ep1, Pixel& ep2, const uint32_t*& colorValues, - uint32_t colorEndpointMode) { +static void ComputeEndpos32s(Pixel& ep1, Pixel& ep2, const u32*& colorValues, + u32 colorEndpos32Mode) { #define READ_UINT_VALUES(N) \ - uint32_t v[N]; \ - for (uint32_t i = 0; i < N; i++) { \ + u32 v[N]; \ + for (u32 i = 0; i < N; i++) { \ v[i] = *(colorValues++); \ } #define READ_INT_VALUES(N) \ - int32_t v[N]; \ - for (uint32_t i = 0; i < N; i++) { \ - v[i] = static_cast<int32_t>(*(colorValues++)); \ + s32 v[N]; \ + for (u32 i = 0; i < N; i++) { \ + v[i] = static_cast<s32>(*(colorValues++)); \ } - switch (colorEndpointMode) { + switch (colorEndpos32Mode) { case 0: { READ_UINT_VALUES(2) ep1 = Pixel(0xFF, v[0], v[0], v[0]); @@ -1281,8 +1262,8 @@ static void ComputeEndpoints(Pixel& ep1, Pixel& ep2, const uint32_t*& colorValue case 1: { READ_UINT_VALUES(2) - uint32_t L0 = (v[0] >> 2) | (v[1] & 0xC0); - uint32_t L1 = std::max(L0 + (v[1] & 0x3F), 0xFFU); + u32 L0 = (v[0] >> 2) | (v[1] & 0xC0); + u32 L1 = std::max(L0 + (v[1] & 0x3F), 0xFFU); ep1 = Pixel(0xFF, L0, L0, L0); ep2 = Pixel(0xFF, L1, L1, L1); } break; @@ -1371,7 +1352,7 @@ static void ComputeEndpoints(Pixel& ep1, Pixel& ep2, const uint32_t*& colorValue } break; default: - assert(!"Unsupported color endpoint mode (is it HDR?)"); + assert(false && "Unsupported color endpoint mode (is it HDR?)"); break; } @@ -1379,14 +1360,14 @@ static void ComputeEndpoints(Pixel& ep1, Pixel& ep2, const uint32_t*& colorValue #undef READ_INT_VALUES } -static void DecompressBlock(const uint8_t inBuf[16], const uint32_t blockWidth, - const uint32_t blockHeight, uint32_t* outBuf) { +static void DecompressBlock(const u8 inBuf[16], const u32 blockWidth, const u32 blockHeight, + u32* outBuf) { InputBitStream strm(inBuf); TexelWeightParams weightParams = DecodeBlockInfo(strm); // Was there an error? 
if (weightParams.m_bError) { - assert(!"Invalid block mode"); + assert(false && "Invalid block mode"); FillError(outBuf, blockWidth, blockHeight); return; } @@ -1397,63 +1378,63 @@ static void DecompressBlock(const uint8_t inBuf[16], const uint32_t blockWidth, } if (weightParams.m_bVoidExtentHDR) { - assert(!"HDR void extent blocks are unsupported!"); + assert(false && "HDR void extent blocks are unsupported!"); FillError(outBuf, blockWidth, blockHeight); return; } if (weightParams.m_Width > blockWidth) { - assert(!"Texel weight grid width should be smaller than block width"); + assert(false && "Texel weight grid width should be smaller than block width"); FillError(outBuf, blockWidth, blockHeight); return; } if (weightParams.m_Height > blockHeight) { - assert(!"Texel weight grid height should be smaller than block height"); + assert(false && "Texel weight grid height should be smaller than block height"); FillError(outBuf, blockWidth, blockHeight); return; } // Read num partitions - uint32_t nPartitions = strm.ReadBits(2) + 1; + u32 nPartitions = strm.ReadBits<2>() + 1; assert(nPartitions <= 4); if (nPartitions == 4 && weightParams.m_bDualPlane) { - assert(!"Dual plane mode is incompatible with four partition blocks"); + assert(false && "Dual plane mode is incompatible with four partition blocks"); FillError(outBuf, blockWidth, blockHeight); return; } - // Based on the number of partitions, read the color endpoint mode for + // Based on the number of partitions, read the color endpos32 mode for // each partition. - // Determine partitions, partition index, and color endpoint modes - int32_t planeIdx = -1; - uint32_t partitionIndex; - uint32_t colorEndpointMode[4] = {0, 0, 0, 0}; + // Determine partitions, partition index, and color endpos32 modes + s32 planeIdx = -1; + u32 partitionIndex; + u32 colorEndpos32Mode[4] = {0, 0, 0, 0}; // Define color data. 
- uint8_t colorEndpointData[16]; - memset(colorEndpointData, 0, sizeof(colorEndpointData)); - OutputBitStream colorEndpointStream(colorEndpointData, 16 * 8, 0); + u8 colorEndpos32Data[16]; + memset(colorEndpos32Data, 0, sizeof(colorEndpos32Data)); + OutputBitStream colorEndpos32Stream(colorEndpos32Data, 16 * 8, 0); // Read extra config data... - uint32_t baseCEM = 0; + u32 baseCEM = 0; if (nPartitions == 1) { - colorEndpointMode[0] = strm.ReadBits(4); + colorEndpos32Mode[0] = strm.ReadBits<4>(); partitionIndex = 0; } else { - partitionIndex = strm.ReadBits(10); - baseCEM = strm.ReadBits(6); + partitionIndex = strm.ReadBits<10>(); + baseCEM = strm.ReadBits<6>(); } - uint32_t baseMode = (baseCEM & 3); + u32 baseMode = (baseCEM & 3); - // Remaining bits are color endpoint data... - uint32_t nWeightBits = weightParams.GetPackedBitSize(); - int32_t remainingBits = 128 - nWeightBits - strm.GetBitsRead(); + // Remaining bits are color endpos32 data... + u32 nWeightBits = weightParams.GetPackedBitSize(); + s32 remainingBits = 128 - nWeightBits - static_cast<s32>(strm.GetBitsRead()); // Consider extra bits prior to texel data... - uint32_t extraCEMbits = 0; + u32 extraCEMbits = 0; if (baseMode) { switch (nPartitions) { case 2: @@ -1473,18 +1454,18 @@ static void DecompressBlock(const uint8_t inBuf[16], const uint32_t blockWidth, remainingBits -= extraCEMbits; // Do we have a dual plane situation? - uint32_t planeSelectorBits = 0; + u32 planeSelectorBits = 0; if (weightParams.m_bDualPlane) { planeSelectorBits = 2; } remainingBits -= planeSelectorBits; // Read color data... 
- uint32_t colorDataBits = remainingBits; + u32 colorDataBits = remainingBits; while (remainingBits > 0) { - uint32_t nb = std::min(remainingBits, 8); - uint32_t b = strm.ReadBits(nb); - colorEndpointStream.WriteBits(b, nb); + u32 nb = std::min(remainingBits, 8); + u32 b = strm.ReadBits(nb); + colorEndpos32Stream.WriteBits(b, nb); remainingBits -= 8; } @@ -1493,64 +1474,64 @@ static void DecompressBlock(const uint8_t inBuf[16], const uint32_t blockWidth, // Read the rest of the CEM if (baseMode) { - uint32_t extraCEM = strm.ReadBits(extraCEMbits); - uint32_t CEM = (extraCEM << 6) | baseCEM; + u32 extraCEM = strm.ReadBits(extraCEMbits); + u32 CEM = (extraCEM << 6) | baseCEM; CEM >>= 2; bool C[4] = {0}; - for (uint32_t i = 0; i < nPartitions; i++) { + for (u32 i = 0; i < nPartitions; i++) { C[i] = CEM & 1; CEM >>= 1; } - uint8_t M[4] = {0}; - for (uint32_t i = 0; i < nPartitions; i++) { + u8 M[4] = {0}; + for (u32 i = 0; i < nPartitions; i++) { M[i] = CEM & 3; CEM >>= 2; assert(M[i] <= 3); } - for (uint32_t i = 0; i < nPartitions; i++) { - colorEndpointMode[i] = baseMode; + for (u32 i = 0; i < nPartitions; i++) { + colorEndpos32Mode[i] = baseMode; if (!(C[i])) - colorEndpointMode[i] -= 1; - colorEndpointMode[i] <<= 2; - colorEndpointMode[i] |= M[i]; + colorEndpos32Mode[i] -= 1; + colorEndpos32Mode[i] <<= 2; + colorEndpos32Mode[i] |= M[i]; } } else if (nPartitions > 1) { - uint32_t CEM = baseCEM >> 2; - for (uint32_t i = 0; i < nPartitions; i++) { - colorEndpointMode[i] = CEM; + u32 CEM = baseCEM >> 2; + for (u32 i = 0; i < nPartitions; i++) { + colorEndpos32Mode[i] = CEM; } } // Make sure everything up till here is sane. 
- for (uint32_t i = 0; i < nPartitions; i++) { - assert(colorEndpointMode[i] < 16); + for (u32 i = 0; i < nPartitions; i++) { + assert(colorEndpos32Mode[i] < 16); } assert(strm.GetBitsRead() + weightParams.GetPackedBitSize() == 128); // Decode both color data and texel weight data - uint32_t colorValues[32]; // Four values, two endpoints, four maximum paritions - DecodeColorValues(colorValues, colorEndpointData, colorEndpointMode, nPartitions, + u32 colorValues[32]; // Four values, two endpos32s, four maximum paritions + DecodeColorValues(colorValues, colorEndpos32Data, colorEndpos32Mode, nPartitions, colorDataBits); - Pixel endpoints[4][2]; - const uint32_t* colorValuesPtr = colorValues; - for (uint32_t i = 0; i < nPartitions; i++) { - ComputeEndpoints(endpoints[i][0], endpoints[i][1], colorValuesPtr, colorEndpointMode[i]); + Pixel endpos32s[4][2]; + const u32* colorValuesPtr = colorValues; + for (u32 i = 0; i < nPartitions; i++) { + ComputeEndpos32s(endpos32s[i][0], endpos32s[i][1], colorValuesPtr, colorEndpos32Mode[i]); } // Read the texel weight data.. 
- uint8_t texelWeightData[16]; + u8 texelWeightData[16]; memcpy(texelWeightData, inBuf, sizeof(texelWeightData)); // Reverse everything - for (uint32_t i = 0; i < 8; i++) { + for (u32 i = 0; i < 8; i++) { // Taken from http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits #define REVERSE_BYTE(b) (((b)*0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32 - unsigned char a = static_cast<unsigned char>(REVERSE_BYTE(texelWeightData[i])); - unsigned char b = static_cast<unsigned char>(REVERSE_BYTE(texelWeightData[15 - i])); + u8 a = static_cast<u8>(REVERSE_BYTE(texelWeightData[i])); + u8 b = static_cast<u8>(REVERSE_BYTE(texelWeightData[15 - i])); #undef REVERSE_BYTE texelWeightData[i] = b; @@ -1558,50 +1539,51 @@ static void DecompressBlock(const uint8_t inBuf[16], const uint32_t blockWidth, } // Make sure that higher non-texel bits are set to zero - const uint32_t clearByteStart = (weightParams.GetPackedBitSize() >> 3) + 1; + const u32 clearByteStart = (weightParams.GetPackedBitSize() >> 3) + 1; texelWeightData[clearByteStart - 1] = texelWeightData[clearByteStart - 1] & - static_cast<uint8_t>((1 << (weightParams.GetPackedBitSize() % 8)) - 1); + static_cast<u8>((1 << (weightParams.GetPackedBitSize() % 8)) - 1); memset(texelWeightData + clearByteStart, 0, 16 - clearByteStart); std::vector<IntegerEncodedValue> texelWeightValues; + texelWeightValues.reserve(64); + InputBitStream weightStream(texelWeightData); - IntegerEncodedValue::DecodeIntegerSequence(texelWeightValues, weightStream, - weightParams.m_MaxWeight, - weightParams.GetNumWeightValues()); + DecodeIntegerSequence(texelWeightValues, weightStream, weightParams.m_MaxWeight, + weightParams.GetNumWeightValues()); // Blocks can be at most 12x12, so we can have as many as 144 weights - uint32_t weights[2][144]; + u32 weights[2][144]; UnquantizeTexelWeights(weights, texelWeightValues, weightParams, blockWidth, blockHeight); - // Now that we have endpoints and weights, we can interpolate and 
generate + // Now that we have endpos32s and weights, we can s32erpolate and generate // the proper decoding... - for (uint32_t j = 0; j < blockHeight; j++) - for (uint32_t i = 0; i < blockWidth; i++) { - uint32_t partition = Select2DPartition(partitionIndex, i, j, nPartitions, - (blockHeight * blockWidth) < 32); + for (u32 j = 0; j < blockHeight; j++) + for (u32 i = 0; i < blockWidth; i++) { + u32 partition = Select2DPartition(partitionIndex, i, j, nPartitions, + (blockHeight * blockWidth) < 32); assert(partition < nPartitions); Pixel p; - for (uint32_t c = 0; c < 4; c++) { - uint32_t C0 = endpoints[partition][0].Component(c); + for (u32 c = 0; c < 4; c++) { + u32 C0 = endpos32s[partition][0].Component(c); C0 = Replicate(C0, 8, 16); - uint32_t C1 = endpoints[partition][1].Component(c); + u32 C1 = endpos32s[partition][1].Component(c); C1 = Replicate(C1, 8, 16); - uint32_t plane = 0; + u32 plane = 0; if (weightParams.m_bDualPlane && (((planeIdx + 1) & 3) == c)) { plane = 1; } - uint32_t weight = weights[plane][j * blockWidth + i]; - uint32_t C = (C0 * (64 - weight) + C1 * weight + 32) / 64; + u32 weight = weights[plane][j * blockWidth + i]; + u32 C = (C0 * (64 - weight) + C1 * weight + 32) / 64; if (C == 65535) { p.Component(c) = 255; } else { double Cf = static_cast<double>(C); - p.Component(c) = static_cast<uint16_t>(255.0 * (Cf / 65536.0) + 0.5); + p.Component(c) = static_cast<u16>(255.0 * (Cf / 65536.0) + 0.5); } } @@ -1613,26 +1595,26 @@ static void DecompressBlock(const uint8_t inBuf[16], const uint32_t blockWidth, namespace Tegra::Texture::ASTC { -std::vector<uint8_t> Decompress(const uint8_t* data, uint32_t width, uint32_t height, - uint32_t depth, uint32_t block_width, uint32_t block_height) { - uint32_t blockIdx = 0; +std::vector<u8> Decompress(const u8* data, u32 width, u32 height, u32 depth, u32 block_width, + u32 block_height) { + u32 blockIdx = 0; std::size_t depth_offset = 0; - std::vector<uint8_t> outData(height * width * depth * 4); - for (uint32_t 
k = 0; k < depth; k++) { - for (uint32_t j = 0; j < height; j += block_height) { - for (uint32_t i = 0; i < width; i += block_width) { + std::vector<u8> outData(height * width * depth * 4); + for (u32 k = 0; k < depth; k++) { + for (u32 j = 0; j < height; j += block_height) { + for (u32 i = 0; i < width; i += block_width) { - const uint8_t* blockPtr = data + blockIdx * 16; + const u8* blockPtr = data + blockIdx * 16; // Blocks can be at most 12x12 - uint32_t uncompData[144]; + u32 uncompData[144]; ASTCC::DecompressBlock(blockPtr, block_width, block_height, uncompData); - uint32_t decompWidth = std::min(block_width, width - i); - uint32_t decompHeight = std::min(block_height, height - j); + u32 decompWidth = std::min(block_width, width - i); + u32 decompHeight = std::min(block_height, height - j); - uint8_t* outRow = depth_offset + outData.data() + (j * width + i) * 4; - for (uint32_t jj = 0; jj < decompHeight; jj++) { + u8* outRow = depth_offset + outData.data() + (j * width + i) * 4; + for (u32 jj = 0; jj < decompHeight; jj++) { memcpy(outRow + jj * width * 4, uncompData + jj * block_width, decompWidth * 4); } diff --git a/src/video_core/textures/texture.h b/src/video_core/textures/texture.h index 8e82c6748..7edc4abe1 100644 --- a/src/video_core/textures/texture.h +++ b/src/video_core/textures/texture.h @@ -8,6 +8,7 @@ #include "common/assert.h" #include "common/bit_field.h" #include "common/common_types.h" +#include "core/settings.h" namespace Tegra::Texture { @@ -294,6 +295,14 @@ enum class TextureMipmapFilter : u32 { Linear = 3, }; +enum class Anisotropy { + Default, + Filter2x, + Filter4x, + Filter8x, + Filter16x, +}; + struct TSCEntry { union { struct { @@ -328,7 +337,22 @@ struct TSCEntry { }; float GetMaxAnisotropy() const { - return static_cast<float>(1U << max_anisotropy); + const u32 min_value = [] { + switch (static_cast<Anisotropy>(Settings::values.max_anisotropy)) { + default: + case Anisotropy::Default: + return 1U; + case Anisotropy::Filter2x: + 
return 2U; + case Anisotropy::Filter4x: + return 4U; + case Anisotropy::Filter8x: + return 8U; + case Anisotropy::Filter16x: + return 16U; + } + }(); + return static_cast<float>(std::max(1U << max_anisotropy, min_value)); } float GetMinLod() const { diff --git a/src/yuzu/CMakeLists.txt b/src/yuzu/CMakeLists.txt index b841e63fa..d34b47b3f 100644 --- a/src/yuzu/CMakeLists.txt +++ b/src/yuzu/CMakeLists.txt @@ -42,6 +42,9 @@ add_executable(yuzu configuration/configure_graphics.cpp configuration/configure_graphics.h configuration/configure_graphics.ui + configuration/configure_graphics_advanced.cpp + configuration/configure_graphics_advanced.h + configuration/configure_graphics_advanced.ui configuration/configure_hotkeys.cpp configuration/configure_hotkeys.h configuration/configure_hotkeys.ui diff --git a/src/yuzu/bootmanager.cpp b/src/yuzu/bootmanager.cpp index 55a37fffa..c3dbb1a88 100644 --- a/src/yuzu/bootmanager.cpp +++ b/src/yuzu/bootmanager.cpp @@ -9,6 +9,9 @@ #include <QKeyEvent> #include <QMessageBox> #include <QOffscreenSurface> +#include <QOpenGLContext> +#include <QOpenGLFunctions> +#include <QOpenGLFunctions_4_3_Core> #include <QOpenGLWindow> #include <QPainter> #include <QScreen> @@ -23,9 +26,10 @@ #include "common/assert.h" #include "common/microprofile.h" #include "common/scm_rev.h" +#include "common/scope_exit.h" #include "core/core.h" #include "core/frontend/framebuffer_layout.h" -#include "core/frontend/scope_acquire_window_context.h" +#include "core/frontend/scope_acquire_context.h" #include "core/settings.h" #include "input_common/keyboard.h" #include "input_common/main.h" @@ -35,15 +39,27 @@ #include "yuzu/bootmanager.h" #include "yuzu/main.h" -EmuThread::EmuThread(GRenderWindow* render_window) : render_window(render_window) {} +EmuThread::EmuThread(GRenderWindow& window) + : shared_context{window.CreateSharedContext()}, + context{(Settings::values.use_asynchronous_gpu_emulation && shared_context) ? 
*shared_context + : window} {} EmuThread::~EmuThread() = default; -void EmuThread::run() { - render_window->MakeCurrent(); +static GMainWindow* GetMainWindow() { + for (QWidget* w : qApp->topLevelWidgets()) { + if (GMainWindow* main = qobject_cast<GMainWindow*>(w)) { + return main; + } + } + return nullptr; +} +void EmuThread::run() { MicroProfileOnThreadCreate("EmuThread"); + Core::Frontend::ScopeAcquireContext acquire_context{context}; + emit LoadProgress(VideoCore::LoadCallbackStage::Prepare, 0, 0); Core::System::GetInstance().Renderer().Rasterizer().LoadDiskResources( @@ -53,11 +69,6 @@ void EmuThread::run() { emit LoadProgress(VideoCore::LoadCallbackStage::Complete, 0, 0); - if (Settings::values.use_asynchronous_gpu_emulation) { - // Release OpenGL context for the GPU thread - render_window->DoneCurrent(); - } - // Holds whether the cpu was running during the last iteration, // so that the DebugModeLeft signal can be emitted before the // next execution step @@ -98,190 +109,202 @@ void EmuThread::run() { #if MICROPROFILE_ENABLED MicroProfileOnThreadExit(); #endif - - render_window->moveContext(); } class GGLContext : public Core::Frontend::GraphicsContext { public: - explicit GGLContext(QOpenGLContext* shared_context) : shared_context{shared_context} { - context.setFormat(shared_context->format()); - context.setShareContext(shared_context); - context.create(); + explicit GGLContext(QOpenGLContext* shared_context) + : context(new QOpenGLContext(shared_context->parent())), + surface(new QOffscreenSurface(nullptr)) { + + // disable vsync for any shared contexts + auto format = shared_context->format(); + format.setSwapInterval(0); + + context->setShareContext(shared_context); + context->setFormat(format); + context->create(); + surface->setParent(shared_context->parent()); + surface->setFormat(format); + surface->create(); } void MakeCurrent() override { - context.makeCurrent(shared_context->surface()); + context->makeCurrent(surface); } void DoneCurrent() 
override { - context.doneCurrent(); + context->doneCurrent(); } - void SwapBuffers() override {} - private: - QOpenGLContext* shared_context; - QOpenGLContext context; + QOpenGLContext* context; + QOffscreenSurface* surface; }; -class GWidgetInternal : public QWindow { +class ChildRenderWindow : public QWindow { public: - GWidgetInternal(GRenderWindow* parent) : parent(parent) {} - virtual ~GWidgetInternal() = default; + ChildRenderWindow(QWindow* parent, QWidget* event_handler) + : QWindow{parent}, event_handler{event_handler} {} - void resizeEvent(QResizeEvent* ev) override { - parent->OnClientAreaResized(ev->size().width(), ev->size().height()); - parent->OnFramebufferSizeChanged(); - } + virtual ~ChildRenderWindow() = default; - void keyPressEvent(QKeyEvent* event) override { - InputCommon::GetKeyboard()->PressKey(event->key()); - } + virtual void Present() = 0; - void keyReleaseEvent(QKeyEvent* event) override { - InputCommon::GetKeyboard()->ReleaseKey(event->key()); +protected: + bool event(QEvent* event) override { + switch (event->type()) { + case QEvent::UpdateRequest: + Present(); + return true; + case QEvent::MouseButtonPress: + case QEvent::MouseButtonRelease: + case QEvent::MouseButtonDblClick: + case QEvent::MouseMove: + case QEvent::KeyPress: + case QEvent::KeyRelease: + case QEvent::FocusIn: + case QEvent::FocusOut: + case QEvent::FocusAboutToChange: + case QEvent::Enter: + case QEvent::Leave: + case QEvent::Wheel: + case QEvent::TabletMove: + case QEvent::TabletPress: + case QEvent::TabletRelease: + case QEvent::TabletEnterProximity: + case QEvent::TabletLeaveProximity: + case QEvent::TouchBegin: + case QEvent::TouchUpdate: + case QEvent::TouchEnd: + case QEvent::InputMethodQuery: + case QEvent::TouchCancel: + return QCoreApplication::sendEvent(event_handler, event); + case QEvent::Drop: + GetMainWindow()->DropAction(static_cast<QDropEvent*>(event)); + return true; + case QEvent::DragResponse: + case QEvent::DragEnter: + case QEvent::DragLeave: + 
case QEvent::DragMove: + GetMainWindow()->AcceptDropEvent(static_cast<QDropEvent*>(event)); + return true; + default: + return QWindow::event(event); + } } - void mousePressEvent(QMouseEvent* event) override { - if (event->source() == Qt::MouseEventSynthesizedBySystem) - return; // touch input is handled in TouchBeginEvent - - const auto pos{event->pos()}; - if (event->button() == Qt::LeftButton) { - const auto [x, y] = parent->ScaleTouch(pos); - parent->TouchPressed(x, y); - } else if (event->button() == Qt::RightButton) { - InputCommon::GetMotionEmu()->BeginTilt(pos.x(), pos.y()); - } + void exposeEvent(QExposeEvent* event) override { + QWindow::requestUpdate(); + QWindow::exposeEvent(event); } - void mouseMoveEvent(QMouseEvent* event) override { - if (event->source() == Qt::MouseEventSynthesizedBySystem) - return; // touch input is handled in TouchUpdateEvent +private: + QWidget* event_handler{}; +}; - const auto pos{event->pos()}; - const auto [x, y] = parent->ScaleTouch(pos); - parent->TouchMoved(x, y); - InputCommon::GetMotionEmu()->Tilt(pos.x(), pos.y()); - } +class OpenGLWindow final : public ChildRenderWindow { +public: + OpenGLWindow(QWindow* parent, QWidget* event_handler, QOpenGLContext* shared_context) + : ChildRenderWindow{parent, event_handler}, + context(new QOpenGLContext(shared_context->parent())) { - void mouseReleaseEvent(QMouseEvent* event) override { - if (event->source() == Qt::MouseEventSynthesizedBySystem) - return; // touch input is handled in TouchEndEvent + // disable vsync for any shared contexts + auto format = shared_context->format(); + format.setSwapInterval(Settings::values.use_vsync ? 
1 : 0); + this->setFormat(format); - if (event->button() == Qt::LeftButton) - parent->TouchReleased(); - else if (event->button() == Qt::RightButton) - InputCommon::GetMotionEmu()->EndTilt(); - } + context->setShareContext(shared_context); + context->setScreen(this->screen()); + context->setFormat(format); + context->create(); - void DisablePainting() { - do_painting = false; - } + setSurfaceType(QWindow::OpenGLSurface); - void EnablePainting() { - do_painting = true; + // TODO: One of these flags might be interesting: WA_OpaquePaintEvent, WA_NoBackground, + // WA_DontShowOnScreen, WA_DeleteOnClose } - std::pair<unsigned, unsigned> GetSize() const { - return std::make_pair(width(), height()); + ~OpenGLWindow() override { + context->doneCurrent(); } -protected: - bool IsPaintingEnabled() const { - return do_painting; + void Present() override { + if (!isExposed()) { + return; + } + + context->makeCurrent(this); + Core::System::GetInstance().Renderer().TryPresent(100); + context->swapBuffers(this); + auto f = context->versionFunctions<QOpenGLFunctions_4_3_Core>(); + f->glFinish(); + QWindow::requestUpdate(); } private: - GRenderWindow* parent; - bool do_painting = false; -}; - -// This class overrides paintEvent and resizeEvent to prevent the GUI thread from stealing GL -// context. 
-// The corresponding functionality is handled in EmuThread instead -class GGLWidgetInternal final : public GWidgetInternal, public QOpenGLWindow { -public: - GGLWidgetInternal(GRenderWindow* parent, QOpenGLContext* shared_context) - : GWidgetInternal(parent), QOpenGLWindow(shared_context) {} - ~GGLWidgetInternal() override = default; - - void paintEvent(QPaintEvent* ev) override { - if (IsPaintingEnabled()) { - QPainter painter(this); - } - } + QOpenGLContext* context{}; }; #ifdef HAS_VULKAN -class GVKWidgetInternal final : public GWidgetInternal { +class VulkanWindow final : public ChildRenderWindow { public: - GVKWidgetInternal(GRenderWindow* parent, QVulkanInstance* instance) : GWidgetInternal(parent) { + VulkanWindow(QWindow* parent, QWidget* event_handler, QVulkanInstance* instance) + : ChildRenderWindow{parent, event_handler} { setSurfaceType(QSurface::SurfaceType::VulkanSurface); setVulkanInstance(instance); } - ~GVKWidgetInternal() override = default; + + ~VulkanWindow() override = default; + + void Present() override { + // TODO(bunnei): ImplementMe + } + +private: + QWidget* event_handler{}; }; #endif -GRenderWindow::GRenderWindow(GMainWindow* parent, EmuThread* emu_thread) - : QWidget(parent), emu_thread(emu_thread) { +GRenderWindow::GRenderWindow(QWidget* parent_, EmuThread* emu_thread) + : QWidget(parent_), emu_thread(emu_thread) { setWindowTitle(QStringLiteral("yuzu %1 | %2-%3") .arg(QString::fromUtf8(Common::g_build_name), QString::fromUtf8(Common::g_scm_branch), QString::fromUtf8(Common::g_scm_desc))); setAttribute(Qt::WA_AcceptTouchEvents); - + auto layout = new QHBoxLayout(this); + layout->setMargin(0); + setLayout(layout); InputCommon::Init(); + + GMainWindow* parent = GetMainWindow(); connect(this, &GRenderWindow::FirstFrameDisplayed, parent, &GMainWindow::OnLoadComplete); } GRenderWindow::~GRenderWindow() { InputCommon::Shutdown(); - - // Avoid an unordered destruction that generates a segfault - delete child; } -void 
GRenderWindow::moveContext() { - if (!context) { - return; +void GRenderWindow::MakeCurrent() { + if (core_context) { + core_context->MakeCurrent(); } - DoneCurrent(); - - // If the thread started running, move the GL Context to the new thread. Otherwise, move it - // back. - auto thread = (QThread::currentThread() == qApp->thread() && emu_thread != nullptr) - ? emu_thread - : qApp->thread(); - context->moveToThread(thread); } -void GRenderWindow::SwapBuffers() { - if (context) { - context->swapBuffers(child); +void GRenderWindow::DoneCurrent() { + if (core_context) { + core_context->DoneCurrent(); } +} + +void GRenderWindow::PollEvents() { if (!first_frame) { first_frame = true; emit FirstFrameDisplayed(); } } -void GRenderWindow::MakeCurrent() { - if (context) { - context->makeCurrent(child); - } -} - -void GRenderWindow::DoneCurrent() { - if (context) { - context->doneCurrent(); - } -} - -void GRenderWindow::PollEvents() {} - bool GRenderWindow::IsShown() const { return !isMinimized(); } @@ -291,7 +314,7 @@ void GRenderWindow::RetrieveVulkanHandlers(void* get_instance_proc_addr, void* i #ifdef HAS_VULKAN const auto instance_proc_addr = vk_instance->getInstanceProcAddr("vkGetInstanceProcAddr"); const VkInstance instance_copy = vk_instance->vkInstance(); - const VkSurfaceKHR surface_copy = vk_instance->surfaceForWindow(child); + const VkSurfaceKHR surface_copy = vk_instance->surfaceForWindow(child_window); std::memcpy(get_instance_proc_addr, &instance_proc_addr, sizeof(instance_proc_addr)); std::memcpy(instance, &instance_copy, sizeof(instance_copy)); @@ -309,21 +332,10 @@ void GRenderWindow::RetrieveVulkanHandlers(void* get_instance_proc_addr, void* i void GRenderWindow::OnFramebufferSizeChanged() { // Screen changes potentially incur a change in screen DPI, hence we should update the // framebuffer size - const qreal pixelRatio{GetWindowPixelRatio()}; - const auto size{child->GetSize()}; - UpdateCurrentFramebufferLayout(size.first * pixelRatio, size.second * 
pixelRatio); -} - -void GRenderWindow::ForwardKeyPressEvent(QKeyEvent* event) { - if (child) { - child->keyPressEvent(event); - } -} - -void GRenderWindow::ForwardKeyReleaseEvent(QKeyEvent* event) { - if (child) { - child->keyReleaseEvent(event); - } + const qreal pixel_ratio = windowPixelRatio(); + const u32 width = this->width() * pixel_ratio; + const u32 height = this->height() * pixel_ratio; + UpdateCurrentFramebufferLayout(width, height); } void GRenderWindow::BackupGeometry() { @@ -351,13 +363,12 @@ QByteArray GRenderWindow::saveGeometry() { return geometry; } -qreal GRenderWindow::GetWindowPixelRatio() const { - // windowHandle() might not be accessible until the window is displayed to screen. - return windowHandle() ? windowHandle()->screen()->devicePixelRatio() : 1.0f; +qreal GRenderWindow::windowPixelRatio() const { + return devicePixelRatio(); } std::pair<u32, u32> GRenderWindow::ScaleTouch(const QPointF pos) const { - const qreal pixel_ratio{GetWindowPixelRatio()}; + const qreal pixel_ratio = windowPixelRatio(); return {static_cast<u32>(std::max(std::round(pos.x() * pixel_ratio), qreal{0.0})), static_cast<u32>(std::max(std::round(pos.y() * pixel_ratio), qreal{0.0}))}; } @@ -367,6 +378,47 @@ void GRenderWindow::closeEvent(QCloseEvent* event) { QWidget::closeEvent(event); } +void GRenderWindow::keyPressEvent(QKeyEvent* event) { + InputCommon::GetKeyboard()->PressKey(event->key()); +} + +void GRenderWindow::keyReleaseEvent(QKeyEvent* event) { + InputCommon::GetKeyboard()->ReleaseKey(event->key()); +} + +void GRenderWindow::mousePressEvent(QMouseEvent* event) { + if (event->source() == Qt::MouseEventSynthesizedBySystem) + return; // touch input is handled in TouchBeginEvent + + auto pos = event->pos(); + if (event->button() == Qt::LeftButton) { + const auto [x, y] = ScaleTouch(pos); + this->TouchPressed(x, y); + } else if (event->button() == Qt::RightButton) { + InputCommon::GetMotionEmu()->BeginTilt(pos.x(), pos.y()); + } +} + +void 
GRenderWindow::mouseMoveEvent(QMouseEvent* event) { + if (event->source() == Qt::MouseEventSynthesizedBySystem) + return; // touch input is handled in TouchUpdateEvent + + auto pos = event->pos(); + const auto [x, y] = ScaleTouch(pos); + this->TouchMoved(x, y); + InputCommon::GetMotionEmu()->Tilt(pos.x(), pos.y()); +} + +void GRenderWindow::mouseReleaseEvent(QMouseEvent* event) { + if (event->source() == Qt::MouseEventSynthesizedBySystem) + return; // touch input is handled in TouchEndEvent + + if (event->button() == Qt::LeftButton) + this->TouchReleased(); + else if (event->button() == Qt::RightButton) + InputCommon::GetMotionEmu()->EndTilt(); +} + void GRenderWindow::TouchBeginEvent(const QTouchEvent* event) { // TouchBegin always has exactly one touch point, so take the .first() const auto [x, y] = ScaleTouch(event->touchPoints().first().pos()); @@ -415,26 +467,20 @@ void GRenderWindow::focusOutEvent(QFocusEvent* event) { InputCommon::GetKeyboard()->ReleaseAllKeys(); } -void GRenderWindow::OnClientAreaResized(u32 width, u32 height) { - NotifyClientAreaSizeChanged(std::make_pair(width, height)); +void GRenderWindow::resizeEvent(QResizeEvent* event) { + QWidget::resizeEvent(event); + OnFramebufferSizeChanged(); } std::unique_ptr<Core::Frontend::GraphicsContext> GRenderWindow::CreateSharedContext() const { - return std::make_unique<GGLContext>(context.get()); + if (Settings::values.renderer_backend == Settings::RendererBackend::OpenGL) { + return std::make_unique<GGLContext>(QOpenGLContext::globalShareContext()); + } + return {}; } bool GRenderWindow::InitRenderTarget() { - shared_context.reset(); - context.reset(); - if (child) { - delete child; - } - if (container) { - delete container; - } - if (layout()) { - delete layout(); - } + ReleaseRenderTarget(); first_frame = false; @@ -451,13 +497,6 @@ bool GRenderWindow::InitRenderTarget() { break; } - container = QWidget::createWindowContainer(child, this); - QBoxLayout* layout = new QHBoxLayout(this); - - 
layout->addWidget(container); - layout->setMargin(0); - setLayout(layout); - // Reset minimum required size to avoid resizing issues on the main window after restarting. setMinimumSize(1, 1); @@ -467,14 +506,9 @@ bool GRenderWindow::InitRenderTarget() { hide(); resize(Layout::ScreenUndocked::Width, Layout::ScreenUndocked::Height); - child->resize(Layout::ScreenUndocked::Width, Layout::ScreenUndocked::Height); - container->resize(Layout::ScreenUndocked::Width, Layout::ScreenUndocked::Height); OnMinimalClientAreaChangeRequest(GetActiveConfig().min_client_area_size); - OnFramebufferSizeChanged(); - NotifyClientAreaSizeChanged(child->GetSize()); - BackupGeometry(); if (Settings::values.renderer_backend == Settings::RendererBackend::OpenGL) { @@ -486,6 +520,14 @@ bool GRenderWindow::InitRenderTarget() { return true; } +void GRenderWindow::ReleaseRenderTarget() { + if (child_widget) { + layout()->removeWidget(child_widget); + delete child_widget; + child_widget = nullptr; + } +} + void GRenderWindow::CaptureScreenshot(u32 res_scale, const QString& screenshot_path) { auto& renderer = Core::System::GetInstance().Renderer(); @@ -521,16 +563,19 @@ bool GRenderWindow::InitializeOpenGL() { fmt.setOption(QSurfaceFormat::FormatOption::DeprecatedFunctions); // TODO: expose a setting for buffer value (ie default/single/double/triple) fmt.setSwapBehavior(QSurfaceFormat::DefaultSwapBehavior); - shared_context = std::make_unique<QOpenGLContext>(); - shared_context->setFormat(fmt); - shared_context->create(); - context = std::make_unique<QOpenGLContext>(); - context->setShareContext(shared_context.get()); - context->setFormat(fmt); - context->create(); - fmt.setSwapInterval(false); - - child = new GGLWidgetInternal(this, shared_context.get()); + fmt.setSwapInterval(0); + QSurfaceFormat::setDefaultFormat(fmt); + + GMainWindow* parent = GetMainWindow(); + QWindow* parent_win_handle = parent ? 
parent->windowHandle() : nullptr; + child_window = new OpenGLWindow(parent_win_handle, this, QOpenGLContext::globalShareContext()); + child_window->create(); + child_widget = createWindowContainer(child_window, this); + child_widget->resize(Layout::ScreenUndocked::Width, Layout::ScreenUndocked::Height); + layout()->addWidget(child_widget); + + core_context = CreateSharedContext(); + return true; } @@ -559,7 +604,14 @@ bool GRenderWindow::InitializeVulkan() { return false; } - child = new GVKWidgetInternal(this, vk_instance.get()); + GMainWindow* parent = GetMainWindow(); + QWindow* parent_win_handle = parent ? parent->windowHandle() : nullptr; + child_window = new VulkanWindow(parent_win_handle, this, vk_instance.get()); + child_window->create(); + child_widget = createWindowContainer(child_window, this); + child_widget->resize(Layout::ScreenUndocked::Width, Layout::ScreenUndocked::Height); + layout()->addWidget(child_widget); + return true; #else QMessageBox::critical(this, tr("Vulkan not available!"), @@ -569,7 +621,7 @@ bool GRenderWindow::InitializeVulkan() { } bool GRenderWindow::LoadOpenGL() { - Core::Frontend::ScopeAcquireWindowContext acquire_context{*this}; + Core::Frontend::ScopeAcquireContext acquire_context{*this}; if (!gladLoadGL()) { QMessageBox::critical(this, tr("Error while initializing OpenGL 4.3!"), tr("Your GPU may not support OpenGL 4.3, or you do not have the " @@ -621,12 +673,10 @@ QStringList GRenderWindow::GetUnsupportedGLExtensions() const { void GRenderWindow::OnEmulationStarting(EmuThread* emu_thread) { this->emu_thread = emu_thread; - child->DisablePainting(); } void GRenderWindow::OnEmulationStopping() { emu_thread = nullptr; - child->EnablePainting(); } void GRenderWindow::showEvent(QShowEvent* event) { diff --git a/src/yuzu/bootmanager.h b/src/yuzu/bootmanager.h index 71a2fa321..79b030304 100644 --- a/src/yuzu/bootmanager.h +++ b/src/yuzu/bootmanager.h @@ -11,11 +11,13 @@ #include <QImage> #include <QThread> #include <QWidget> 
+#include <QWindow> #include "common/thread.h" #include "core/core.h" #include "core/frontend/emu_window.h" +class GRenderWindow; class QKeyEvent; class QScreen; class QTouchEvent; @@ -26,14 +28,6 @@ class QOpenGLContext; class QVulkanInstance; #endif -class GWidgetInternal; -class GGLWidgetInternal; -class GVKWidgetInternal; -class GMainWindow; -class GRenderWindow; -class QSurface; -class QOpenGLContext; - namespace VideoCore { enum class LoadCallbackStage; } @@ -42,7 +36,7 @@ class EmuThread final : public QThread { Q_OBJECT public: - explicit EmuThread(GRenderWindow* render_window); + explicit EmuThread(GRenderWindow& window); ~EmuThread() override; /** @@ -96,7 +90,11 @@ private: std::mutex running_mutex; std::condition_variable running_cv; - GRenderWindow* render_window; + /// Only used in asynchronous GPU mode + std::unique_ptr<Core::Frontend::GraphicsContext> shared_context; + + /// This is shared_context in asynchronous GPU mode, core_context in synchronous GPU mode + Core::Frontend::GraphicsContext& context; signals: /** @@ -126,11 +124,10 @@ class GRenderWindow : public QWidget, public Core::Frontend::EmuWindow { Q_OBJECT public: - GRenderWindow(GMainWindow* parent, EmuThread* emu_thread); + GRenderWindow(QWidget* parent, EmuThread* emu_thread); ~GRenderWindow() override; - // EmuWindow implementation - void SwapBuffers() override; + // EmuWindow implementation. 
void MakeCurrent() override; void DoneCurrent() override; void PollEvents() override; @@ -139,30 +136,36 @@ public: void* surface) const override; std::unique_ptr<Core::Frontend::GraphicsContext> CreateSharedContext() const override; - void ForwardKeyPressEvent(QKeyEvent* event); - void ForwardKeyReleaseEvent(QKeyEvent* event); - void BackupGeometry(); void RestoreGeometry(); void restoreGeometry(const QByteArray& geometry); // overridden QByteArray saveGeometry(); // overridden - qreal GetWindowPixelRatio() const; - std::pair<u32, u32> ScaleTouch(QPointF pos) const; + qreal windowPixelRatio() const; void closeEvent(QCloseEvent* event) override; + + void resizeEvent(QResizeEvent* event) override; + + void keyPressEvent(QKeyEvent* event) override; + void keyReleaseEvent(QKeyEvent* event) override; + + void mousePressEvent(QMouseEvent* event) override; + void mouseMoveEvent(QMouseEvent* event) override; + void mouseReleaseEvent(QMouseEvent* event) override; + bool event(QEvent* event) override; - void focusOutEvent(QFocusEvent* event) override; - void OnClientAreaResized(u32 width, u32 height); + void focusOutEvent(QFocusEvent* event) override; bool InitRenderTarget(); + /// Destroy the previous run's child_widget which should also destroy the child_window + void ReleaseRenderTarget(); + void CaptureScreenshot(u32 res_scale, const QString& screenshot_path); public slots: - void moveContext(); // overridden - void OnEmulationStarting(EmuThread* emu_thread); void OnEmulationStopping(); void OnFramebufferSizeChanged(); @@ -173,6 +176,7 @@ signals: void FirstFrameDisplayed(); private: + std::pair<u32, u32> ScaleTouch(QPointF pos) const; void TouchBeginEvent(const QTouchEvent* event); void TouchUpdateEvent(const QTouchEvent* event); void TouchEndEvent(); @@ -184,15 +188,9 @@ private: bool LoadOpenGL(); QStringList GetUnsupportedGLExtensions() const; - QWidget* container = nullptr; - GWidgetInternal* child = nullptr; - EmuThread* emu_thread; - // Context that backs the 
GGLWidgetInternal (and will be used by core to render) - std::unique_ptr<QOpenGLContext> context; - // Context that will be shared between all newly created contexts. This should never be made - // current - std::unique_ptr<QOpenGLContext> shared_context; + + std::unique_ptr<GraphicsContext> core_context; #ifdef HAS_VULKAN std::unique_ptr<QVulkanInstance> vk_instance; @@ -202,6 +200,15 @@ private: QImage screenshot_image; QByteArray geometry; + + /// Native window handle that backs this presentation widget + QWindow* child_window = nullptr; + + /// In order to embed the window into GRenderWindow, you need to use createWindowContainer to + /// put the child_window into a widget then add it to the layout. This child_widget can be + /// parented to GRenderWindow and use Qt's lifetime system + QWidget* child_widget = nullptr; + bool first_frame = false; protected: diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp index 6209fff75..3b9ab38dd 100644 --- a/src/yuzu/configuration/config.cpp +++ b/src/yuzu/configuration/config.cpp @@ -539,7 +539,7 @@ void Config::ReadDebuggingValues() { void Config::ReadServiceValues() { qt_config->beginGroup(QStringLiteral("Services")); Settings::values.bcat_backend = - ReadSetting(QStringLiteral("bcat_backend"), QStringLiteral("boxcat")) + ReadSetting(QStringLiteral("bcat_backend"), QStringLiteral("null")) .toString() .toStdString(); Settings::values.bcat_boxcat_local = @@ -631,6 +631,7 @@ void Config::ReadRendererValues() { Settings::values.resolution_factor = ReadSetting(QStringLiteral("resolution_factor"), 1.0).toFloat(); Settings::values.aspect_ratio = ReadSetting(QStringLiteral("aspect_ratio"), 0).toInt(); + Settings::values.max_anisotropy = ReadSetting(QStringLiteral("max_anisotropy"), 0).toInt(); Settings::values.use_frame_limit = ReadSetting(QStringLiteral("use_frame_limit"), true).toBool(); Settings::values.frame_limit = ReadSetting(QStringLiteral("frame_limit"), 100).toInt(); @@ -640,6 +641,7 @@ 
void Config::ReadRendererValues() { ReadSetting(QStringLiteral("use_accurate_gpu_emulation"), false).toBool(); Settings::values.use_asynchronous_gpu_emulation = ReadSetting(QStringLiteral("use_asynchronous_gpu_emulation"), false).toBool(); + Settings::values.use_vsync = ReadSetting(QStringLiteral("use_vsync"), true).toBool(); Settings::values.force_30fps_mode = ReadSetting(QStringLiteral("force_30fps_mode"), false).toBool(); @@ -680,6 +682,8 @@ void Config::ReadSystemValues() { Settings::values.language_index = ReadSetting(QStringLiteral("language_index"), 1).toInt(); + Settings::values.region_index = ReadSetting(QStringLiteral("region_index"), 1).toInt(); + const auto rng_seed_enabled = ReadSetting(QStringLiteral("rng_seed_enabled"), false).toBool(); if (rng_seed_enabled) { Settings::values.rng_seed = ReadSetting(QStringLiteral("rng_seed"), 0).toULongLong(); @@ -696,6 +700,8 @@ void Config::ReadSystemValues() { Settings::values.custom_rtc = std::nullopt; } + Settings::values.sound_index = ReadSetting(QStringLiteral("sound_index"), 1).toInt(); + qt_config->endGroup(); } @@ -1066,6 +1072,7 @@ void Config::SaveRendererValues() { WriteSetting(QStringLiteral("resolution_factor"), static_cast<double>(Settings::values.resolution_factor), 1.0); WriteSetting(QStringLiteral("aspect_ratio"), Settings::values.aspect_ratio, 0); + WriteSetting(QStringLiteral("max_anisotropy"), Settings::values.max_anisotropy, 0); WriteSetting(QStringLiteral("use_frame_limit"), Settings::values.use_frame_limit, true); WriteSetting(QStringLiteral("frame_limit"), Settings::values.frame_limit, 100); WriteSetting(QStringLiteral("use_disk_shader_cache"), Settings::values.use_disk_shader_cache, @@ -1074,6 +1081,7 @@ void Config::SaveRendererValues() { Settings::values.use_accurate_gpu_emulation, false); WriteSetting(QStringLiteral("use_asynchronous_gpu_emulation"), Settings::values.use_asynchronous_gpu_emulation, false); + WriteSetting(QStringLiteral("use_vsync"), Settings::values.use_vsync, true); 
WriteSetting(QStringLiteral("force_30fps_mode"), Settings::values.force_30fps_mode, false); // Cast to double because Qt's written float values are not human-readable @@ -1110,6 +1118,7 @@ void Config::SaveSystemValues() { WriteSetting(QStringLiteral("use_docked_mode"), Settings::values.use_docked_mode, false); WriteSetting(QStringLiteral("current_user"), Settings::values.current_user, 0); WriteSetting(QStringLiteral("language_index"), Settings::values.language_index, 1); + WriteSetting(QStringLiteral("region_index"), Settings::values.region_index, 1); WriteSetting(QStringLiteral("rng_seed_enabled"), Settings::values.rng_seed.has_value(), false); WriteSetting(QStringLiteral("rng_seed"), Settings::values.rng_seed.value_or(0), 0); @@ -1121,6 +1130,8 @@ void Config::SaveSystemValues() { Settings::values.custom_rtc.value_or(std::chrono::seconds{}).count()), 0); + WriteSetting(QStringLiteral("sound_index"), Settings::values.sound_index, 1); + qt_config->endGroup(); } diff --git a/src/yuzu/configuration/configure.ui b/src/yuzu/configuration/configure.ui index 67b990f1a..9aec1bd09 100644 --- a/src/yuzu/configuration/configure.ui +++ b/src/yuzu/configuration/configure.ui @@ -83,6 +83,11 @@ <string>Graphics</string> </attribute> </widget> + <widget class="ConfigureGraphicsAdvanced" name="graphicsAdvancedTab"> + <attribute name="title"> + <string>GraphicsAdvanced</string> + </attribute> + </widget> <widget class="ConfigureAudio" name="audioTab"> <attribute name="title"> <string>Audio</string> @@ -160,6 +165,12 @@ <container>1</container> </customwidget> <customwidget> + <class>ConfigureGraphicsAdvanced</class> + <extends>QWidget</extends> + <header>configuration/configure_graphics_advanced.h</header> + <container>1</container> + </customwidget> + <customwidget> <class>ConfigureWeb</class> <extends>QWidget</extends> <header>configuration/configure_web.h</header> diff --git a/src/yuzu/configuration/configure_dialog.cpp b/src/yuzu/configuration/configure_dialog.cpp index 
db3b19352..df4473b46 100644 --- a/src/yuzu/configuration/configure_dialog.cpp +++ b/src/yuzu/configuration/configure_dialog.cpp @@ -41,6 +41,7 @@ void ConfigureDialog::ApplyConfiguration() { ui->inputTab->ApplyConfiguration(); ui->hotkeysTab->ApplyConfiguration(registry); ui->graphicsTab->ApplyConfiguration(); + ui->graphicsAdvancedTab->ApplyConfiguration(); ui->audioTab->ApplyConfiguration(); ui->debugTab->ApplyConfiguration(); ui->webTab->ApplyConfiguration(); @@ -76,7 +77,7 @@ void ConfigureDialog::PopulateSelectionList() { const std::array<std::pair<QString, QList<QWidget*>>, 5> items{ {{tr("General"), {ui->generalTab, ui->webTab, ui->debugTab, ui->uiTab}}, {tr("System"), {ui->systemTab, ui->profileManagerTab, ui->serviceTab, ui->filesystemTab}}, - {tr("Graphics"), {ui->graphicsTab}}, + {tr("Graphics"), {ui->graphicsTab, ui->graphicsAdvancedTab}}, {tr("Audio"), {ui->audioTab}}, {tr("Controls"), {ui->inputTab, ui->hotkeysTab}}}, }; @@ -105,6 +106,7 @@ void ConfigureDialog::UpdateVisibleTabs() { {ui->inputTab, tr("Input")}, {ui->hotkeysTab, tr("Hotkeys")}, {ui->graphicsTab, tr("Graphics")}, + {ui->graphicsAdvancedTab, tr("Advanced")}, {ui->audioTab, tr("Audio")}, {ui->debugTab, tr("Debug")}, {ui->webTab, tr("Web")}, diff --git a/src/yuzu/configuration/configure_graphics.cpp b/src/yuzu/configuration/configure_graphics.cpp index ea899c080..a821c7b3c 100644 --- a/src/yuzu/configuration/configure_graphics.cpp +++ b/src/yuzu/configuration/configure_graphics.cpp @@ -100,11 +100,8 @@ void ConfigureGraphics::SetConfiguration() { ui->aspect_ratio_combobox->setCurrentIndex(Settings::values.aspect_ratio); ui->use_disk_shader_cache->setEnabled(runtime_lock); ui->use_disk_shader_cache->setChecked(Settings::values.use_disk_shader_cache); - ui->use_accurate_gpu_emulation->setChecked(Settings::values.use_accurate_gpu_emulation); ui->use_asynchronous_gpu_emulation->setEnabled(runtime_lock); 
ui->use_asynchronous_gpu_emulation->setChecked(Settings::values.use_asynchronous_gpu_emulation); - ui->force_30fps_mode->setEnabled(runtime_lock); - ui->force_30fps_mode->setChecked(Settings::values.force_30fps_mode); UpdateBackgroundColorButton(QColor::fromRgbF(Settings::values.bg_red, Settings::values.bg_green, Settings::values.bg_blue)); UpdateDeviceComboBox(); @@ -117,10 +114,8 @@ void ConfigureGraphics::ApplyConfiguration() { ToResolutionFactor(static_cast<Resolution>(ui->resolution_factor_combobox->currentIndex())); Settings::values.aspect_ratio = ui->aspect_ratio_combobox->currentIndex(); Settings::values.use_disk_shader_cache = ui->use_disk_shader_cache->isChecked(); - Settings::values.use_accurate_gpu_emulation = ui->use_accurate_gpu_emulation->isChecked(); Settings::values.use_asynchronous_gpu_emulation = ui->use_asynchronous_gpu_emulation->isChecked(); - Settings::values.force_30fps_mode = ui->force_30fps_mode->isChecked(); Settings::values.bg_red = static_cast<float>(bg_color.redF()); Settings::values.bg_green = static_cast<float>(bg_color.greenF()); Settings::values.bg_blue = static_cast<float>(bg_color.blueF()); diff --git a/src/yuzu/configuration/configure_graphics.ui b/src/yuzu/configuration/configure_graphics.ui index db60426ab..c816d6108 100644 --- a/src/yuzu/configuration/configure_graphics.ui +++ b/src/yuzu/configuration/configure_graphics.ui @@ -85,20 +85,6 @@ </widget> </item> <item> - <widget class="QCheckBox" name="use_accurate_gpu_emulation"> - <property name="text"> - <string>Use accurate GPU emulation (slow)</string> - </property> - </widget> - </item> - <item> - <widget class="QCheckBox" name="force_30fps_mode"> - <property name="text"> - <string>Force 30 FPS mode</string> - </property> - </widget> - </item> - <item> <layout class="QHBoxLayout" name="horizontalLayout_2"> <item> <widget class="QLabel" name="label"> diff --git a/src/yuzu/configuration/configure_graphics_advanced.cpp b/src/yuzu/configuration/configure_graphics_advanced.cpp 
new file mode 100644 index 000000000..b9f429f84 --- /dev/null +++ b/src/yuzu/configuration/configure_graphics_advanced.cpp @@ -0,0 +1,48 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "core/core.h" +#include "core/settings.h" +#include "ui_configure_graphics_advanced.h" +#include "yuzu/configuration/configure_graphics_advanced.h" + +ConfigureGraphicsAdvanced::ConfigureGraphicsAdvanced(QWidget* parent) + : QWidget(parent), ui(new Ui::ConfigureGraphicsAdvanced) { + + ui->setupUi(this); + + SetConfiguration(); +} + +ConfigureGraphicsAdvanced::~ConfigureGraphicsAdvanced() = default; + +void ConfigureGraphicsAdvanced::SetConfiguration() { + const bool runtime_lock = !Core::System::GetInstance().IsPoweredOn(); + ui->use_accurate_gpu_emulation->setChecked(Settings::values.use_accurate_gpu_emulation); + ui->use_vsync->setEnabled(runtime_lock); + ui->use_vsync->setChecked(Settings::values.use_vsync); + ui->force_30fps_mode->setEnabled(runtime_lock); + ui->force_30fps_mode->setChecked(Settings::values.force_30fps_mode); + ui->anisotropic_filtering_combobox->setEnabled(runtime_lock); + ui->anisotropic_filtering_combobox->setCurrentIndex(Settings::values.max_anisotropy); +} + +void ConfigureGraphicsAdvanced::ApplyConfiguration() { + Settings::values.use_accurate_gpu_emulation = ui->use_accurate_gpu_emulation->isChecked(); + Settings::values.use_vsync = ui->use_vsync->isChecked(); + Settings::values.force_30fps_mode = ui->force_30fps_mode->isChecked(); + Settings::values.max_anisotropy = ui->anisotropic_filtering_combobox->currentIndex(); +} + +void ConfigureGraphicsAdvanced::changeEvent(QEvent* event) { + if (event->type() == QEvent::LanguageChange) { + RetranslateUI(); + } + + QWidget::changeEvent(event); +} + +void ConfigureGraphicsAdvanced::RetranslateUI() { + ui->retranslateUi(this); +} diff --git a/src/yuzu/configuration/configure_graphics_advanced.h 
b/src/yuzu/configuration/configure_graphics_advanced.h new file mode 100644 index 000000000..bbc9d4355 --- /dev/null +++ b/src/yuzu/configuration/configure_graphics_advanced.h @@ -0,0 +1,30 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <memory> +#include <QWidget> + +namespace Ui { +class ConfigureGraphicsAdvanced; +} + +class ConfigureGraphicsAdvanced : public QWidget { + Q_OBJECT + +public: + explicit ConfigureGraphicsAdvanced(QWidget* parent = nullptr); + ~ConfigureGraphicsAdvanced() override; + + void ApplyConfiguration(); + +private: + void changeEvent(QEvent* event) override; + void RetranslateUI(); + + void SetConfiguration(); + + std::unique_ptr<Ui::ConfigureGraphicsAdvanced> ui; +}; diff --git a/src/yuzu/configuration/configure_graphics_advanced.ui b/src/yuzu/configuration/configure_graphics_advanced.ui new file mode 100644 index 000000000..42eec278e --- /dev/null +++ b/src/yuzu/configuration/configure_graphics_advanced.ui @@ -0,0 +1,111 @@ +<?xml version="1.0" encoding="UTF-8"?> +<ui version="4.0"> + <class>ConfigureGraphicsAdvanced</class> + <widget class="QWidget" name="ConfigureGraphicsAdvanced"> + <property name="geometry"> + <rect> + <x>0</x> + <y>0</y> + <width>400</width> + <height>321</height> + </rect> + </property> + <property name="windowTitle"> + <string>Form</string> + </property> + <layout class="QVBoxLayout" name="verticalLayout_1"> + <item> + <layout class="QVBoxLayout" name="verticalLayout_2"> + <item> + <widget class="QGroupBox" name="groupBox_1"> + <property name="title"> + <string>Advanced Graphics Settings</string> + </property> + <layout class="QVBoxLayout" name="verticalLayout_3"> + <item> + <widget class="QCheckBox" name="use_accurate_gpu_emulation"> + <property name="text"> + <string>Use accurate GPU emulation (slow)</string> + </property> + </widget> + </item> + <item> + <widget class="QCheckBox" name="use_vsync"> + 
<property name="toolTip"> + <string>VSync prevents the screen from tearing, but some graphics cards have lower performance with VSync enabled. Keep it enabled if you don't notice a performance difference.</string> + </property> + <property name="text"> + <string>Use VSync (OpenGL only)</string> + </property> + </widget> + </item> + <item> + <widget class="QCheckBox" name="force_30fps_mode"> + <property name="text"> + <string>Force 30 FPS mode</string> + </property> + </widget> + </item> + <item> + <layout class="QHBoxLayout" name="horizontalLayout_1"> + <item> + <widget class="QLabel" name="af_label"> + <property name="text"> + <string>Anisotropic Filtering:</string> + </property> + </widget> + </item> + <item> + <widget class="QComboBox" name="anisotropic_filtering_combobox"> + <item> + <property name="text"> + <string>Default</string> + </property> + </item> + <item> + <property name="text"> + <string>2x</string> + </property> + </item> + <item> + <property name="text"> + <string>4x</string> + </property> + </item> + <item> + <property name="text"> + <string>8x</string> + </property> + </item> + <item> + <property name="text"> + <string>16x</string> + </property> + </item> + </widget> + </item> + </layout> + </item> + </layout> + </widget> + </item> + </layout> + </item> + <item> + <spacer name="verticalSpacer"> + <property name="orientation"> + <enum>Qt::Vertical</enum> + </property> + <property name="sizeHint" stdset="0"> + <size> + <width>20</width> + <height>40</height> + </size> + </property> + </spacer> + </item> + </layout> + </widget> + <resources/> + <connections/> +</ui> diff --git a/src/yuzu/configuration/configure_system.cpp b/src/yuzu/configuration/configure_system.cpp index e1b52f8d9..f49cd4c8f 100644 --- a/src/yuzu/configuration/configure_system.cpp +++ b/src/yuzu/configuration/configure_system.cpp @@ -56,6 +56,8 @@ void ConfigureSystem::SetConfiguration() { enabled = !Core::System::GetInstance().IsPoweredOn(); 
ui->combo_language->setCurrentIndex(Settings::values.language_index); + ui->combo_region->setCurrentIndex(Settings::values.region_index); + ui->combo_sound->setCurrentIndex(Settings::values.sound_index); ui->rng_seed_checkbox->setChecked(Settings::values.rng_seed.has_value()); ui->rng_seed_edit->setEnabled(Settings::values.rng_seed.has_value()); @@ -81,6 +83,8 @@ void ConfigureSystem::ApplyConfiguration() { } Settings::values.language_index = ui->combo_language->currentIndex(); + Settings::values.region_index = ui->combo_region->currentIndex(); + Settings::values.sound_index = ui->combo_sound->currentIndex(); if (ui->rng_seed_checkbox->isChecked()) { Settings::values.rng_seed = ui->rng_seed_edit->text().toULongLong(nullptr, 16); diff --git a/src/yuzu/configuration/configure_system.h b/src/yuzu/configuration/configure_system.h index 1eab3781d..d8fa2d2cc 100644 --- a/src/yuzu/configuration/configure_system.h +++ b/src/yuzu/configuration/configure_system.h @@ -36,5 +36,6 @@ private: bool enabled = false; int language_index = 0; + int region_index = 0; int sound_index = 0; }; diff --git a/src/yuzu/configuration/configure_system.ui b/src/yuzu/configuration/configure_system.ui index 65745a2f8..4e2c7e76e 100644 --- a/src/yuzu/configuration/configure_system.ui +++ b/src/yuzu/configuration/configure_system.ui @@ -22,14 +22,14 @@ <string>System Settings</string> </property> <layout class="QGridLayout" name="gridLayout"> - <item row="1" column="0"> + <item row="2" column="0"> <widget class="QLabel" name="label_sound"> <property name="text"> <string>Sound output mode</string> </property> </widget> </item> - <item row="2" column="0"> + <item row="3" column="0"> <widget class="QLabel" name="label_console_id"> <property name="text"> <string>Console ID:</string> @@ -128,14 +128,60 @@ </item> </widget> </item> - <item row="4" column="0"> + <item row="1" column="0"> + <widget class="QLabel" name="label_region"> + <property name="text"> + <string>Region:</string> + </property> + 
</widget> + </item> + <item row="1" column="1"> + <widget class="QComboBox" name="combo_region"> + <item> + <property name="text"> + <string>Japan</string> + </property> + </item> + <item> + <property name="text"> + <string>USA</string> + </property> + </item> + <item> + <property name="text"> + <string>Europe</string> + </property> + </item> + <item> + <property name="text"> + <string>Australia</string> + </property> + </item> + <item> + <property name="text"> + <string>China</string> + </property> + </item> + <item> + <property name="text"> + <string>Korea</string> + </property> + </item> + <item> + <property name="text"> + <string>Taiwan</string> + </property> + </item> + </widget> + </item> + <item row="5" column="0"> <widget class="QCheckBox" name="rng_seed_checkbox"> <property name="text"> <string>RNG Seed</string> </property> </widget> </item> - <item row="1" column="1"> + <item row="2" column="1"> <widget class="QComboBox" name="combo_sound"> <item> <property name="text"> @@ -161,7 +207,7 @@ </property> </widget> </item> - <item row="2" column="1"> + <item row="3" column="1"> <widget class="QPushButton" name="button_regenerate_console_id"> <property name="sizePolicy"> <sizepolicy hsizetype="Fixed" vsizetype="Fixed"> @@ -177,14 +223,14 @@ </property> </widget> </item> - <item row="3" column="0"> + <item row="4" column="0"> <widget class="QCheckBox" name="custom_rtc_checkbox"> <property name="text"> <string>Custom RTC</string> </property> </widget> </item> - <item row="3" column="1"> + <item row="4" column="1"> <widget class="QDateTimeEdit" name="custom_rtc_edit"> <property name="minimumDate"> <date> @@ -198,7 +244,7 @@ </property> </widget> </item> - <item row="4" column="1"> + <item row="5" column="1"> <widget class="QLineEdit" name="rng_seed_edit"> <property name="sizePolicy"> <sizepolicy hsizetype="Minimum" vsizetype="Fixed"> diff --git a/src/yuzu/debugger/wait_tree.cpp b/src/yuzu/debugger/wait_tree.cpp index 3f1a94627..c1ea25fb8 100644 --- 
a/src/yuzu/debugger/wait_tree.cpp +++ b/src/yuzu/debugger/wait_tree.cpp @@ -116,7 +116,7 @@ std::vector<std::unique_ptr<WaitTreeItem>> WaitTreeCallstack::GetChildren() cons constexpr std::size_t BaseRegister = 29; auto& memory = Core::System::GetInstance().Memory(); - u64 base_pointer = thread.GetContext().cpu_registers[BaseRegister]; + u64 base_pointer = thread.GetContext64().cpu_registers[BaseRegister]; while (base_pointer != 0) { const u64 lr = memory.Read64(base_pointer + sizeof(u64)); @@ -240,7 +240,7 @@ QString WaitTreeThread::GetText() const { break; } - const auto& context = thread.GetContext(); + const auto& context = thread.GetContext64(); const QString pc_info = tr(" PC = 0x%1 LR = 0x%2") .arg(context.pc, 8, 16, QLatin1Char{'0'}) .arg(context.cpu_registers[30], 8, 16, QLatin1Char{'0'}); diff --git a/src/yuzu/loading_screen.cpp b/src/yuzu/loading_screen.cpp index 4f2bfab48..2a6483370 100644 --- a/src/yuzu/loading_screen.cpp +++ b/src/yuzu/loading_screen.cpp @@ -34,18 +34,6 @@ constexpr char PROGRESSBAR_STYLE_PREPARE[] = R"( QProgressBar {} QProgressBar::chunk {})"; -constexpr char PROGRESSBAR_STYLE_DECOMPILE[] = R"( -QProgressBar { - background-color: black; - border: 2px solid white; - border-radius: 4px; - padding: 2px; -} -QProgressBar::chunk { - background-color: #0ab9e6; - width: 1px; -})"; - constexpr char PROGRESSBAR_STYLE_BUILD[] = R"( QProgressBar { background-color: black; @@ -100,13 +88,11 @@ LoadingScreen::LoadingScreen(QWidget* parent) stage_translations = { {VideoCore::LoadCallbackStage::Prepare, tr("Loading...")}, - {VideoCore::LoadCallbackStage::Decompile, tr("Preparing Shaders %1 / %2")}, {VideoCore::LoadCallbackStage::Build, tr("Loading Shaders %1 / %2")}, {VideoCore::LoadCallbackStage::Complete, tr("Launching...")}, }; progressbar_style = { {VideoCore::LoadCallbackStage::Prepare, PROGRESSBAR_STYLE_PREPARE}, - {VideoCore::LoadCallbackStage::Decompile, PROGRESSBAR_STYLE_DECOMPILE}, {VideoCore::LoadCallbackStage::Build, 
PROGRESSBAR_STYLE_BUILD}, {VideoCore::LoadCallbackStage::Complete, PROGRESSBAR_STYLE_COMPLETE}, }; @@ -192,8 +178,7 @@ void LoadingScreen::OnLoadProgress(VideoCore::LoadCallbackStage stage, std::size } // update labels and progress bar - if (stage == VideoCore::LoadCallbackStage::Decompile || - stage == VideoCore::LoadCallbackStage::Build) { + if (stage == VideoCore::LoadCallbackStage::Build) { ui->stage->setText(stage_translations[stage].arg(value).arg(total)); } else { ui->stage->setText(stage_translations[stage]); diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp index 1be61bd48..4769a612e 100644 --- a/src/yuzu/main.cpp +++ b/src/yuzu/main.cpp @@ -20,7 +20,6 @@ #include "core/file_sys/vfs.h" #include "core/file_sys/vfs_real.h" #include "core/frontend/applets/general_frontend.h" -#include "core/frontend/scope_acquire_window_context.h" #include "core/hle/service/acc/profile_manager.h" #include "core/hle/service/am/applet_ae.h" #include "core/hle/service/am/applet_oe.h" @@ -985,11 +984,8 @@ void GMainWindow::BootGame(const QString& filename) { return; // Create and start the emulation thread - emu_thread = std::make_unique<EmuThread>(render_window); + emu_thread = std::make_unique<EmuThread>(*render_window); emit EmulationStarting(emu_thread.get()); - if (Settings::values.renderer_backend == Settings::RendererBackend::OpenGL) { - render_window->moveContext(); - } emu_thread->start(); connect(render_window, &GRenderWindow::Closed, this, &GMainWindow::OnStopGame); @@ -1087,6 +1083,9 @@ void GMainWindow::ShutdownGame() { emulation_running = false; game_path.clear(); + + // When closing the game, destroy the GLWindow to clear the context after the game is closed + render_window->ReleaseRenderTarget(); } void GMainWindow::StoreRecentFile(const QString& filename) { @@ -2210,48 +2209,47 @@ void GMainWindow::closeEvent(QCloseEvent* event) { QWidget::closeEvent(event); } -void GMainWindow::keyPressEvent(QKeyEvent* event) { - if (render_window) { - 
render_window->ForwardKeyPressEvent(event); - } +static bool IsSingleFileDropEvent(const QMimeData* mime) { + return mime->hasUrls() && mime->urls().length() == 1; } -void GMainWindow::keyReleaseEvent(QKeyEvent* event) { - if (render_window) { - render_window->ForwardKeyReleaseEvent(event); +void GMainWindow::AcceptDropEvent(QDropEvent* event) { + if (IsSingleFileDropEvent(event->mimeData())) { + event->setDropAction(Qt::DropAction::LinkAction); + event->accept(); } } -static bool IsSingleFileDropEvent(QDropEvent* event) { - const QMimeData* mimeData = event->mimeData(); - return mimeData->hasUrls() && mimeData->urls().length() == 1; -} - -void GMainWindow::dropEvent(QDropEvent* event) { - if (!IsSingleFileDropEvent(event)) { - return; +bool GMainWindow::DropAction(QDropEvent* event) { + if (!IsSingleFileDropEvent(event->mimeData())) { + return false; } const QMimeData* mime_data = event->mimeData(); - const QString filename = mime_data->urls().at(0).toLocalFile(); + const QString& filename = mime_data->urls().at(0).toLocalFile(); if (emulation_running && QFileInfo(filename).suffix() == QStringLiteral("bin")) { + // Amiibo LoadAmiibo(filename); } else { + // Game if (ConfirmChangeGame()) { BootGame(filename); } } + return true; +} + +void GMainWindow::dropEvent(QDropEvent* event) { + DropAction(event); } void GMainWindow::dragEnterEvent(QDragEnterEvent* event) { - if (IsSingleFileDropEvent(event)) { - event->acceptProposedAction(); - } + AcceptDropEvent(event); } void GMainWindow::dragMoveEvent(QDragMoveEvent* event) { - event->acceptProposedAction(); + AcceptDropEvent(event); } bool GMainWindow::ConfirmChangeGame() { @@ -2372,6 +2370,7 @@ int main(int argc, char* argv[]) { // Enables the core to make the qt created contexts current on std::threads QCoreApplication::setAttribute(Qt::AA_DontCheckOpenGLContextThreadAffinity); + QCoreApplication::setAttribute(Qt::AA_ShareOpenGLContexts); QApplication app(argc, argv); // Qt changes the locale and causes issues in float 
conversion using std::to_string() when diff --git a/src/yuzu/main.h b/src/yuzu/main.h index 8eba2172c..a67125567 100644 --- a/src/yuzu/main.h +++ b/src/yuzu/main.h @@ -78,6 +78,9 @@ public: std::unique_ptr<DiscordRPC::DiscordInterface> discord_rpc; + bool DropAction(QDropEvent* event); + void AcceptDropEvent(QDropEvent* event); + signals: /** @@ -264,8 +267,4 @@ protected: void dropEvent(QDropEvent* event) override; void dragEnterEvent(QDragEnterEvent* event) override; void dragMoveEvent(QDragMoveEvent* event) override; - - // Overrides used to forward signals to the render window when the focus moves out. - void keyPressEvent(QKeyEvent* event) override; - void keyReleaseEvent(QKeyEvent* event) override; }; diff --git a/src/yuzu_cmd/config.cpp b/src/yuzu_cmd/config.cpp index 96f1ce3af..f4cd905c9 100644 --- a/src/yuzu_cmd/config.cpp +++ b/src/yuzu_cmd/config.cpp @@ -381,6 +381,8 @@ void Config::ReadValues() { static_cast<float>(sdl2_config->GetReal("Renderer", "resolution_factor", 1.0)); Settings::values.aspect_ratio = static_cast<int>(sdl2_config->GetInteger("Renderer", "aspect_ratio", 0)); + Settings::values.max_anisotropy = + static_cast<int>(sdl2_config->GetInteger("Renderer", "max_anisotropy", 0)); Settings::values.use_frame_limit = sdl2_config->GetBoolean("Renderer", "use_frame_limit", true); Settings::values.frame_limit = static_cast<u16>(sdl2_config->GetInteger("Renderer", "frame_limit", 100)); @@ -390,6 +392,8 @@ void Config::ReadValues() { sdl2_config->GetBoolean("Renderer", "use_accurate_gpu_emulation", false); Settings::values.use_asynchronous_gpu_emulation = sdl2_config->GetBoolean("Renderer", "use_asynchronous_gpu_emulation", false); + Settings::values.use_vsync = + static_cast<u16>(sdl2_config->GetInteger("Renderer", "use_vsync", 1)); Settings::values.bg_red = static_cast<float>(sdl2_config->GetReal("Renderer", "bg_red", 0.0)); Settings::values.bg_green = @@ -448,7 +452,7 @@ void Config::ReadValues() { Settings::values.yuzu_token = 
sdl2_config->Get("WebService", "yuzu_token", ""); // Services - Settings::values.bcat_backend = sdl2_config->Get("Services", "bcat_backend", "boxcat"); + Settings::values.bcat_backend = sdl2_config->Get("Services", "bcat_backend", "null"); Settings::values.bcat_boxcat_local = sdl2_config->GetBoolean("Services", "bcat_boxcat_local", false); } diff --git a/src/yuzu_cmd/default_ini.h b/src/yuzu_cmd/default_ini.h index 8a2b658cd..d63d7a58e 100644 --- a/src/yuzu_cmd/default_ini.h +++ b/src/yuzu_cmd/default_ini.h @@ -84,7 +84,7 @@ touch_device= # from any cemuhook compatible motion program. # IPv4 address of the udp input server (Default "127.0.0.1") -udp_input_address= +udp_input_address=127.0.0.1 # Port of the udp input server. (Default 26760) udp_input_port= @@ -126,6 +126,10 @@ resolution_factor = # 0: Default (16:9), 1: Force 4:3, 2: Force 21:9, 3: Stretch to Window aspect_ratio = +# Anisotropic filtering +# 0: Default, 1: 2x, 2: 4x, 3: 8x, 4: 16x +max_anisotropy = + # Whether to enable V-Sync (caps the framerate at 60FPS) or not. # 0 (default): Off, 1: On use_vsync = @@ -150,6 +154,11 @@ use_accurate_gpu_emulation = # 0 : Off (slow), 1 (default): On (fast) use_asynchronous_gpu_emulation = +# Forces VSync on the display thread. Usually doesn't impact performance, but on some drivers it can +# so only turn this off if you notice a speed difference. +# 0: Off, 1 (default): On +use_vsync = + # The clear color for the renderer. What shows up on the sides of the bottom screen. # Must be in range of 0.0-1.0. Defaults to 1.0 for all. 
bg_red = diff --git a/src/yuzu_cmd/emu_window/emu_window_sdl2.cpp b/src/yuzu_cmd/emu_window/emu_window_sdl2.cpp index e96139885..19584360c 100644 --- a/src/yuzu_cmd/emu_window/emu_window_sdl2.cpp +++ b/src/yuzu_cmd/emu_window/emu_window_sdl2.cpp @@ -13,7 +13,7 @@ #include "input_common/sdl/sdl.h" #include "yuzu_cmd/emu_window/emu_window_sdl2.h" -EmuWindow_SDL2::EmuWindow_SDL2(bool fullscreen) { +EmuWindow_SDL2::EmuWindow_SDL2(Core::System& system, bool fullscreen) : system{system} { if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_JOYSTICK) < 0) { LOG_CRITICAL(Frontend, "Failed to initialize SDL2! Exiting..."); exit(1); diff --git a/src/yuzu_cmd/emu_window/emu_window_sdl2.h b/src/yuzu_cmd/emu_window/emu_window_sdl2.h index b38f56661..fffac4252 100644 --- a/src/yuzu_cmd/emu_window/emu_window_sdl2.h +++ b/src/yuzu_cmd/emu_window/emu_window_sdl2.h @@ -10,9 +10,13 @@ struct SDL_Window; +namespace Core { +class System; +} + class EmuWindow_SDL2 : public Core::Frontend::EmuWindow { public: - explicit EmuWindow_SDL2(bool fullscreen); + explicit EmuWindow_SDL2(Core::System& system, bool fullscreen); ~EmuWindow_SDL2(); /// Polls window events @@ -24,6 +28,9 @@ public: /// Returns if window is shown (not minimized) bool IsShown() const override; + /// Presents the next frame + virtual void Present() = 0; + protected: /// Called by PollEvents when a key is pressed or released. void OnKeyEvent(int key, u8 state); @@ -55,6 +62,9 @@ protected: /// Called when a configuration change affects the minimal size of the window void OnMinimalClientAreaChangeRequest(std::pair<unsigned, unsigned> minimal_size) override; + /// Instance of the system, used to access renderer for the presentation thread + Core::System& system; + /// Is the window still open? 
bool is_open = true; @@ -62,7 +72,7 @@ protected: bool is_shown = true; /// Internal SDL2 render window - SDL_Window* render_window; + SDL_Window* render_window{}; /// Keeps track of how often to update the title bar during gameplay u32 last_time = 0; diff --git a/src/yuzu_cmd/emu_window/emu_window_sdl2_gl.cpp b/src/yuzu_cmd/emu_window/emu_window_sdl2_gl.cpp index 7ffa0ac09..c0d373477 100644 --- a/src/yuzu_cmd/emu_window/emu_window_sdl2_gl.cpp +++ b/src/yuzu_cmd/emu_window/emu_window_sdl2_gl.cpp @@ -13,24 +13,25 @@ #include "common/logging/log.h" #include "common/scm_rev.h" #include "common/string_util.h" +#include "core/core.h" #include "core/settings.h" #include "input_common/keyboard.h" #include "input_common/main.h" #include "input_common/motion_emu.h" +#include "video_core/renderer_base.h" #include "yuzu_cmd/emu_window/emu_window_sdl2_gl.h" class SDLGLContext : public Core::Frontend::GraphicsContext { public: explicit SDLGLContext() { // create a hidden window to make the shared context against - window = SDL_CreateWindow("", SDL_WINDOWPOS_UNDEFINED, // x position - SDL_WINDOWPOS_UNDEFINED, // y position - Layout::ScreenUndocked::Width, Layout::ScreenUndocked::Height, - SDL_WINDOW_OPENGL | SDL_WINDOW_HIDDEN); + window = SDL_CreateWindow(NULL, SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED, 0, 0, + SDL_WINDOW_HIDDEN | SDL_WINDOW_OPENGL); context = SDL_GL_CreateContext(window); } ~SDLGLContext() { + DoneCurrent(); SDL_GL_DeleteContext(context); SDL_DestroyWindow(window); } @@ -43,8 +44,6 @@ public: SDL_GL_MakeCurrent(window, nullptr); } - void SwapBuffers() override {} - private: SDL_Window* window; SDL_GLContext context; @@ -80,7 +79,8 @@ bool EmuWindow_SDL2_GL::SupportsRequiredGLExtensions() { return unsupported_ext.empty(); } -EmuWindow_SDL2_GL::EmuWindow_SDL2_GL(bool fullscreen) : EmuWindow_SDL2(fullscreen) { +EmuWindow_SDL2_GL::EmuWindow_SDL2_GL(Core::System& system, bool fullscreen) + : EmuWindow_SDL2{system, fullscreen} { 
SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, 4); SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, 3); SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_COMPATIBILITY); @@ -90,6 +90,7 @@ EmuWindow_SDL2_GL::EmuWindow_SDL2_GL(bool fullscreen) : EmuWindow_SDL2(fullscree SDL_GL_SetAttribute(SDL_GL_BLUE_SIZE, 8); SDL_GL_SetAttribute(SDL_GL_ALPHA_SIZE, 0); SDL_GL_SetAttribute(SDL_GL_SHARE_WITH_CURRENT_CONTEXT, 1); + SDL_GL_SetSwapInterval(0); std::string window_title = fmt::format("yuzu {} | {}-{}", Common::g_build_fullname, Common::g_scm_branch, Common::g_scm_desc); @@ -105,13 +106,22 @@ EmuWindow_SDL2_GL::EmuWindow_SDL2_GL(bool fullscreen) : EmuWindow_SDL2(fullscree exit(1); } + dummy_window = SDL_CreateWindow(NULL, SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED, 0, 0, + SDL_WINDOW_HIDDEN | SDL_WINDOW_OPENGL); + if (fullscreen) { Fullscreen(); } - gl_context = SDL_GL_CreateContext(render_window); - if (gl_context == nullptr) { - LOG_CRITICAL(Frontend, "Failed to create SDL2 GL context! 
{}", SDL_GetError()); + window_context = SDL_GL_CreateContext(render_window); + core_context = CreateSharedContext(); + + if (window_context == nullptr) { + LOG_CRITICAL(Frontend, "Failed to create SDL2 GL context: {}", SDL_GetError()); + exit(1); + } + if (core_context == nullptr) { + LOG_CRITICAL(Frontend, "Failed to create shared SDL2 GL context: {}", SDL_GetError()); exit(1); } @@ -128,28 +138,22 @@ EmuWindow_SDL2_GL::EmuWindow_SDL2_GL(bool fullscreen) : EmuWindow_SDL2(fullscree OnResize(); OnMinimalClientAreaChangeRequest(GetActiveConfig().min_client_area_size); SDL_PumpEvents(); - SDL_GL_SetSwapInterval(false); LOG_INFO(Frontend, "yuzu Version: {} | {}-{}", Common::g_build_fullname, Common::g_scm_branch, Common::g_scm_desc); Settings::LogSettings(); - - DoneCurrent(); } EmuWindow_SDL2_GL::~EmuWindow_SDL2_GL() { - SDL_GL_DeleteContext(gl_context); -} - -void EmuWindow_SDL2_GL::SwapBuffers() { - SDL_GL_SwapWindow(render_window); + core_context.reset(); + SDL_GL_DeleteContext(window_context); } void EmuWindow_SDL2_GL::MakeCurrent() { - SDL_GL_MakeCurrent(render_window, gl_context); + core_context->MakeCurrent(); } void EmuWindow_SDL2_GL::DoneCurrent() { - SDL_GL_MakeCurrent(render_window, nullptr); + core_context->DoneCurrent(); } void EmuWindow_SDL2_GL::RetrieveVulkanHandlers(void* get_instance_proc_addr, void* instance, @@ -161,3 +165,13 @@ void EmuWindow_SDL2_GL::RetrieveVulkanHandlers(void* get_instance_proc_addr, voi std::unique_ptr<Core::Frontend::GraphicsContext> EmuWindow_SDL2_GL::CreateSharedContext() const { return std::make_unique<SDLGLContext>(); } + +void EmuWindow_SDL2_GL::Present() { + SDL_GL_MakeCurrent(render_window, window_context); + SDL_GL_SetSwapInterval(Settings::values.use_vsync ? 
1 : 0); + while (IsOpen()) { + system.Renderer().TryPresent(100); + SDL_GL_SwapWindow(render_window); + } + SDL_GL_MakeCurrent(render_window, nullptr); +} diff --git a/src/yuzu_cmd/emu_window/emu_window_sdl2_gl.h b/src/yuzu_cmd/emu_window/emu_window_sdl2_gl.h index c753085a8..b80669ff0 100644 --- a/src/yuzu_cmd/emu_window/emu_window_sdl2_gl.h +++ b/src/yuzu_cmd/emu_window/emu_window_sdl2_gl.h @@ -10,17 +10,12 @@ class EmuWindow_SDL2_GL final : public EmuWindow_SDL2 { public: - explicit EmuWindow_SDL2_GL(bool fullscreen); + explicit EmuWindow_SDL2_GL(Core::System& system, bool fullscreen); ~EmuWindow_SDL2_GL(); - /// Swap buffers to display the next frame - void SwapBuffers() override; - - /// Makes the graphics context current for the caller thread void MakeCurrent() override; - - /// Releases the GL context from the caller thread void DoneCurrent() override; + void Present() override; /// Ignored in OpenGL void RetrieveVulkanHandlers(void* get_instance_proc_addr, void* instance, @@ -29,10 +24,17 @@ public: std::unique_ptr<Core::Frontend::GraphicsContext> CreateSharedContext() const override; private: + /// Fake hidden window for the core context + SDL_Window* dummy_window{}; + /// Whether the GPU and driver supports the OpenGL extension required bool SupportsRequiredGLExtensions(); using SDL_GLContext = void*; + /// The OpenGL context associated with the window - SDL_GLContext gl_context; + SDL_GLContext window_context; + + /// The OpenGL context associated with the core + std::unique_ptr<Core::Frontend::GraphicsContext> core_context; }; diff --git a/src/yuzu_cmd/emu_window/emu_window_sdl2_vk.cpp b/src/yuzu_cmd/emu_window/emu_window_sdl2_vk.cpp index a203f0da9..abcc58165 100644 --- a/src/yuzu_cmd/emu_window/emu_window_sdl2_vk.cpp +++ b/src/yuzu_cmd/emu_window/emu_window_sdl2_vk.cpp @@ -15,7 +15,8 @@ #include "core/settings.h" #include "yuzu_cmd/emu_window/emu_window_sdl2_vk.h" -EmuWindow_SDL2_VK::EmuWindow_SDL2_VK(bool fullscreen) : EmuWindow_SDL2(fullscreen) { 
+EmuWindow_SDL2_VK::EmuWindow_SDL2_VK(Core::System& system, bool fullscreen) + : EmuWindow_SDL2{system, fullscreen} { if (SDL_Vulkan_LoadLibrary(nullptr) != 0) { LOG_CRITICAL(Frontend, "SDL failed to load the Vulkan library: {}", SDL_GetError()); exit(EXIT_FAILURE); @@ -110,8 +111,6 @@ EmuWindow_SDL2_VK::~EmuWindow_SDL2_VK() { vkDestroyInstance(vk_instance, nullptr); } -void EmuWindow_SDL2_VK::SwapBuffers() {} - void EmuWindow_SDL2_VK::MakeCurrent() { // Unused on Vulkan } @@ -160,3 +159,7 @@ bool EmuWindow_SDL2_VK::UseStandardLayers(PFN_vkGetInstanceProcAddr vkGetInstanc return layer.layerName == std::string("VK_LAYER_LUNARG_standard_validation"); }) != layers.end(); } + +void EmuWindow_SDL2_VK::Present() { + // TODO (bunnei): ImplementMe +} diff --git a/src/yuzu_cmd/emu_window/emu_window_sdl2_vk.h b/src/yuzu_cmd/emu_window/emu_window_sdl2_vk.h index 2a7c06a24..1eb8c0868 100644 --- a/src/yuzu_cmd/emu_window/emu_window_sdl2_vk.h +++ b/src/yuzu_cmd/emu_window/emu_window_sdl2_vk.h @@ -10,19 +10,12 @@ class EmuWindow_SDL2_VK final : public EmuWindow_SDL2 { public: - explicit EmuWindow_SDL2_VK(bool fullscreen); + explicit EmuWindow_SDL2_VK(Core::System& system, bool fullscreen); ~EmuWindow_SDL2_VK(); - /// Swap buffers to display the next frame - void SwapBuffers() override; - - /// Makes the graphics context current for the caller thread void MakeCurrent() override; - - /// Releases the GL context from the caller thread void DoneCurrent() override; - - /// Retrieves Vulkan specific handlers from the window + void Present() override; void RetrieveVulkanHandlers(void* get_instance_proc_addr, void* instance, void* surface) const override; diff --git a/src/yuzu_cmd/yuzu.cpp b/src/yuzu_cmd/yuzu.cpp index 325795321..babf4c3a4 100644 --- a/src/yuzu_cmd/yuzu.cpp +++ b/src/yuzu_cmd/yuzu.cpp @@ -177,14 +177,16 @@ int main(int argc, char** argv) { Settings::values.use_gdbstub = use_gdbstub; Settings::Apply(); + Core::System& system{Core::System::GetInstance()}; + 
std::unique_ptr<EmuWindow_SDL2> emu_window; switch (Settings::values.renderer_backend) { case Settings::RendererBackend::OpenGL: - emu_window = std::make_unique<EmuWindow_SDL2_GL>(fullscreen); + emu_window = std::make_unique<EmuWindow_SDL2_GL>(system, fullscreen); break; case Settings::RendererBackend::Vulkan: #ifdef HAS_VULKAN - emu_window = std::make_unique<EmuWindow_SDL2_VK>(fullscreen); + emu_window = std::make_unique<EmuWindow_SDL2_VK>(system, fullscreen); break; #else LOG_CRITICAL(Frontend, "Vulkan backend has not been compiled!"); @@ -192,12 +194,6 @@ int main(int argc, char** argv) { #endif } - if (!Settings::values.use_multi_core) { - // Single core mode must acquire OpenGL context for entire emulation session - emu_window->MakeCurrent(); - } - - Core::System& system{Core::System::GetInstance()}; system.SetContentProvider(std::make_unique<FileSys::ContentProviderUnion>()); system.SetFilesystem(std::make_shared<FileSys::RealVfsFilesystem>()); system.GetFileSystemController().CreateFactories(*system.GetFilesystem()); @@ -234,12 +230,23 @@ int main(int argc, char** argv) { system.TelemetrySession().AddField(Telemetry::FieldType::App, "Frontend", "SDL"); - emu_window->MakeCurrent(); system.Renderer().Rasterizer().LoadDiskResources(); + // Acquire render context for duration of the thread if this is the rendering thread + if (!Settings::values.use_asynchronous_gpu_emulation) { + emu_window->MakeCurrent(); + } + SCOPE_EXIT({ + if (!Settings::values.use_asynchronous_gpu_emulation) { + emu_window->DoneCurrent(); + } + }); + + std::thread render_thread([&emu_window] { emu_window->Present(); }); while (emu_window->IsOpen()) { system.RunLoop(); } + render_thread.join(); system.Shutdown(); diff --git a/src/yuzu_tester/config.cpp b/src/yuzu_tester/config.cpp index 0ac93b62a..ee2591c8f 100644 --- a/src/yuzu_tester/config.cpp +++ b/src/yuzu_tester/config.cpp @@ -120,6 +120,8 @@ void Config::ReadValues() { static_cast<float>(sdl2_config->GetReal("Renderer", 
"resolution_factor", 1.0)); Settings::values.aspect_ratio = static_cast<int>(sdl2_config->GetInteger("Renderer", "aspect_ratio", 0)); + Settings::values.max_anisotropy = + static_cast<int>(sdl2_config->GetInteger("Renderer", "max_anisotropy", 0)); Settings::values.use_frame_limit = false; Settings::values.frame_limit = 100; Settings::values.use_disk_shader_cache = diff --git a/src/yuzu_tester/default_ini.h b/src/yuzu_tester/default_ini.h index 8d93f7b88..ca203b64d 100644 --- a/src/yuzu_tester/default_ini.h +++ b/src/yuzu_tester/default_ini.h @@ -30,6 +30,10 @@ resolution_factor = # 0: Default (16:9), 1: Force 4:3, 2: Force 21:9, 3: Stretch to Window aspect_ratio = +# Anisotropic filtering +# 0: Default, 1: 2x, 2: 4x, 3: 8x, 4: 16x +max_anisotropy = + # Whether to enable V-Sync (caps the framerate at 60FPS) or not. # 0 (default): Off, 1: On use_vsync = diff --git a/src/yuzu_tester/emu_window/emu_window_sdl2_hide.cpp b/src/yuzu_tester/emu_window/emu_window_sdl2_hide.cpp index f2cc4a797..a1bdb1a12 100644 --- a/src/yuzu_tester/emu_window/emu_window_sdl2_hide.cpp +++ b/src/yuzu_tester/emu_window/emu_window_sdl2_hide.cpp @@ -112,10 +112,6 @@ EmuWindow_SDL2_Hide::~EmuWindow_SDL2_Hide() { SDL_Quit(); } -void EmuWindow_SDL2_Hide::SwapBuffers() { - SDL_GL_SwapWindow(render_window); -} - void EmuWindow_SDL2_Hide::PollEvents() {} void EmuWindow_SDL2_Hide::MakeCurrent() { diff --git a/src/yuzu_tester/emu_window/emu_window_sdl2_hide.h b/src/yuzu_tester/emu_window/emu_window_sdl2_hide.h index c7fccc002..b13e15309 100644 --- a/src/yuzu_tester/emu_window/emu_window_sdl2_hide.h +++ b/src/yuzu_tester/emu_window/emu_window_sdl2_hide.h @@ -13,9 +13,6 @@ public: explicit EmuWindow_SDL2_Hide(); ~EmuWindow_SDL2_Hide(); - /// Swap buffers to display the next frame - void SwapBuffers() override; - /// Polls window events void PollEvents() override; |