diff options
| author | Ameer <aj662@drexel.edu> | 2020-07-04 00:59:40 -0400 | 
|---|---|---|
| committer | Ameer <aj662@drexel.edu> | 2020-07-04 00:59:40 -0400 | 
| commit | f829932ed191ad469df01342191bf2725e8a20bb (patch) | |
| tree | 0ae185ce3ef43ef9b085aae7b9ad5abb04e3d239 /src/video_core | |
| parent | d00972fce1fe5f2eb13c7e5d7e4e56036cb6bc91 (diff) | |
| parent | 3096adb3471af1b094d670751e476c337007d299 (diff) | |
Fix merge conflicts?
Diffstat (limited to 'src/video_core')
51 files changed, 1164 insertions, 479 deletions
| diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 099bb446e..21c46a567 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -3,6 +3,8 @@ add_library(video_core STATIC      buffer_cache/buffer_cache.h      buffer_cache/map_interval.cpp      buffer_cache/map_interval.h +    compatible_formats.cpp +    compatible_formats.h      dirty_flags.cpp      dirty_flags.h      dma_pusher.cpp @@ -27,6 +29,8 @@ add_library(video_core STATIC      engines/shader_type.h      macro/macro.cpp      macro/macro.h +    macro/macro_hle.cpp +    macro/macro_hle.h      macro/macro_interpreter.cpp      macro/macro_interpreter.h      macro/macro_jit_x64.cpp diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 308d8b55f..c6479af9f 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -41,16 +41,20 @@ class BufferCache {      static constexpr u64 BLOCK_PAGE_SIZE = 1ULL << BLOCK_PAGE_BITS;  public: -    using BufferInfo = std::pair<BufferType, u64>; +    struct BufferInfo { +        BufferType handle; +        u64 offset; +        u64 address; +    };      BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,                              bool is_written = false, bool use_fast_cbuf = false) {          std::lock_guard lock{mutex}; -        const auto& memory_manager = system.GPU().MemoryManager(); +        auto& memory_manager = system.GPU().MemoryManager();          const std::optional<VAddr> cpu_addr_opt = memory_manager.GpuToCpuAddress(gpu_addr);          if (!cpu_addr_opt) { -            return {GetEmptyBuffer(size), 0}; +            return GetEmptyBuffer(size);          }          const VAddr cpu_addr = *cpu_addr_opt; @@ -59,7 +63,6 @@ public:          constexpr std::size_t max_stream_size = 0x800;          if (use_fast_cbuf || size < max_stream_size) {              if (!is_written && !IsRegionWritten(cpu_addr, cpu_addr + size - 1)) { -                auto& memory_manager = system.GPU().MemoryManager();                  const bool is_granular = memory_manager.IsGranularRange(gpu_addr, size);                  if (use_fast_cbuf) {                      u8* dest; @@ -89,7 +92,7 @@ public:          Buffer* const block = GetBlock(cpu_addr, size);          MapInterval* const map = MapAddress(block, gpu_addr, cpu_addr, size);          if (!map) { -            return {GetEmptyBuffer(size), 0}; +            return GetEmptyBuffer(size);          }          if (is_written) {              map->MarkAsModified(true, GetModifiedTicks()); @@ -102,7 +105,7 @@ public:              }          } -        return {block->Handle(), static_cast<u64>(block->Offset(cpu_addr))}; +        return BufferInfo{block->Handle(), block->Offset(cpu_addr), block->Address()};      }      /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset. @@ -255,27 +258,17 @@ public:          committed_flushes.pop_front();      } -    virtual BufferType GetEmptyBuffer(std::size_t size) = 0; +    virtual BufferInfo GetEmptyBuffer(std::size_t size) = 0;  protected:      explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system, -                         std::unique_ptr<StreamBuffer> stream_buffer_) -        : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer_)}, -          stream_buffer_handle{stream_buffer->Handle()} {} +                         std::unique_ptr<StreamBuffer> stream_buffer) +        : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer)} {}      ~BufferCache() = default;      virtual std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) = 0; -    virtual void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, -                                 const u8* data) = 0; - -    virtual void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, -                                   u8* data) = 0; - -    virtual void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset, -                           std::size_t dst_offset, std::size_t size) = 0; -      virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) {          return {};      } @@ -329,19 +322,18 @@ protected:      }  private: -    MapInterval* MapAddress(const Buffer* block, GPUVAddr gpu_addr, VAddr cpu_addr, -                            std::size_t size) { +    MapInterval* MapAddress(Buffer* block, GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size) {          const VectorMapInterval overlaps = GetMapsInRange(cpu_addr, size);          if (overlaps.empty()) {              auto& memory_manager = system.GPU().MemoryManager();              const VAddr cpu_addr_end = cpu_addr + size;              if (memory_manager.IsGranularRange(gpu_addr, size)) {                  u8* host_ptr = memory_manager.GetPointer(gpu_addr); -                UploadBlockData(*block, block->Offset(cpu_addr), size, host_ptr); +                block->Upload(block->Offset(cpu_addr), size, host_ptr);              } else {                  staging_buffer.resize(size);                  memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size); -                UploadBlockData(*block, block->Offset(cpu_addr), size, staging_buffer.data()); +                block->Upload(block->Offset(cpu_addr), size, staging_buffer.data());              }              return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr));          } @@ -384,8 +376,7 @@ private:          return map;      } -    void UpdateBlock(const Buffer* block, VAddr start, VAddr end, -                     const VectorMapInterval& overlaps) { +    void UpdateBlock(Buffer* block, VAddr start, VAddr end, const VectorMapInterval& overlaps) {          const IntervalType base_interval{start, end};          IntervalSet interval_set{};          interval_set.add(base_interval); @@ -400,7 +391,7 @@ private:              }              staging_buffer.resize(size);              system.Memory().ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size); -            UploadBlockData(*block, block->Offset(interval.lower()), size, staging_buffer.data()); +            block->Upload(block->Offset(interval.lower()), size, staging_buffer.data());          }      } @@ -437,7 +428,7 @@ private:          const std::size_t size = map->end - map->start;          staging_buffer.resize(size); -        DownloadBlockData(*block, block->Offset(map->start), size, staging_buffer.data()); +        block->Download(block->Offset(map->start), size, staging_buffer.data());          system.Memory().WriteBlockUnsafe(map->start, staging_buffer.data(), size);          map->MarkAsModified(false, 0);      } @@ -450,7 +441,7 @@ private:          buffer_ptr += size;          buffer_offset += size; -        return {stream_buffer_handle, uploaded_offset}; +        return BufferInfo{stream_buffer->Handle(), uploaded_offset, stream_buffer->Address()};      }      void AlignBuffer(std::size_t alignment) { @@ -465,7 +456,7 @@ private:          const std::size_t new_size = old_size + BLOCK_PAGE_SIZE;          const VAddr cpu_addr = buffer->CpuAddr();          std::shared_ptr<Buffer> new_buffer = CreateBlock(cpu_addr, new_size); -        CopyBlock(*buffer, *new_buffer, 0, 0, old_size); +        new_buffer->CopyFrom(*buffer, 0, 0, old_size);          QueueDestruction(std::move(buffer));          const VAddr cpu_addr_end = cpu_addr + new_size - 1; @@ -487,8 +478,8 @@ private:          const std::size_t new_size = size_1 + size_2;          std::shared_ptr<Buffer> new_buffer = CreateBlock(new_addr, new_size); -        CopyBlock(*first, *new_buffer, 0, new_buffer->Offset(first_addr), size_1); -        CopyBlock(*second, *new_buffer, 0, new_buffer->Offset(second_addr), size_2); +        new_buffer->CopyFrom(*first, 0, new_buffer->Offset(first_addr), size_1); +        new_buffer->CopyFrom(*second, 0, new_buffer->Offset(second_addr), size_2);          QueueDestruction(std::move(first));          QueueDestruction(std::move(second)); diff --git a/src/video_core/compatible_formats.cpp b/src/video_core/compatible_formats.cpp new file mode 100644 index 000000000..6c426b035 --- /dev/null +++ b/src/video_core/compatible_formats.cpp @@ -0,0 +1,162 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <array> +#include <bitset> +#include <cstddef> + +#include "video_core/compatible_formats.h" +#include "video_core/surface.h" + +namespace VideoCore::Surface { + +namespace { + +// Compatibility table taken from Table 3.X.2 in: +// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_view.txt + +constexpr std::array VIEW_CLASS_128_BITS = { +    PixelFormat::RGBA32F, +    PixelFormat::RGBA32UI, +}; +// Missing formats: +// PixelFormat::RGBA32I + +constexpr std::array VIEW_CLASS_96_BITS = { +    PixelFormat::RGB32F, +}; +// Missing formats: +// PixelFormat::RGB32UI, +// PixelFormat::RGB32I, + +constexpr std::array VIEW_CLASS_64_BITS = { +    PixelFormat::RGBA16F, PixelFormat::RG32F,   PixelFormat::RGBA16UI, PixelFormat::RG32UI, +    PixelFormat::RGBA16U, PixelFormat::RGBA16F, PixelFormat::RGBA16S, +}; +// Missing formats: +// PixelFormat::RGBA16I +// PixelFormat::RG32I + +// TODO: How should we handle 48 bits? + +constexpr std::array VIEW_CLASS_32_BITS = { +    PixelFormat::RG16F,        PixelFormat::R11FG11FB10F, PixelFormat::R32F, +    PixelFormat::A2B10G10R10U, PixelFormat::RG16UI,       PixelFormat::R32UI, +    PixelFormat::RG16I,        PixelFormat::R32I,         PixelFormat::ABGR8U, +    PixelFormat::RG16,         PixelFormat::ABGR8S,       PixelFormat::RG16S, +    PixelFormat::RGBA8_SRGB,   PixelFormat::E5B9G9R9F,    PixelFormat::BGRA8, +    PixelFormat::BGRA8_SRGB, +}; +// Missing formats: +// PixelFormat::RGBA8UI +// PixelFormat::RGBA8I +// PixelFormat::RGB10_A2_UI + +// TODO: How should we handle 24 bits? + +constexpr std::array VIEW_CLASS_16_BITS = { +    PixelFormat::R16F, PixelFormat::RG8UI, PixelFormat::R16UI, PixelFormat::R16I, +    PixelFormat::RG8U, PixelFormat::R16U,  PixelFormat::RG8S,  PixelFormat::R16S, +}; +// Missing formats: +// PixelFormat::RG8I + +constexpr std::array VIEW_CLASS_8_BITS = { +    PixelFormat::R8UI, +    PixelFormat::R8U, +}; +// Missing formats: +// PixelFormat::R8I +// PixelFormat::R8S + +constexpr std::array VIEW_CLASS_RGTC1_RED = { +    PixelFormat::DXN1, +}; +// Missing formats: +// COMPRESSED_SIGNED_RED_RGTC1 + +constexpr std::array VIEW_CLASS_RGTC2_RG = { +    PixelFormat::DXN2UNORM, +    PixelFormat::DXN2SNORM, +}; + +constexpr std::array VIEW_CLASS_BPTC_UNORM = { +    PixelFormat::BC7U, +    PixelFormat::BC7U_SRGB, +}; + +constexpr std::array VIEW_CLASS_BPTC_FLOAT = { +    PixelFormat::BC6H_SF16, +    PixelFormat::BC6H_UF16, +}; + +// Compatibility table taken from Table 4.X.1 in: +// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_copy_image.txt + +constexpr std::array COPY_CLASS_128_BITS = { +    PixelFormat::RGBA32UI,   PixelFormat::RGBA32F,   PixelFormat::DXT23, +    PixelFormat::DXT23_SRGB, PixelFormat::DXT45,     PixelFormat::DXT45_SRGB, +    PixelFormat::DXN2SNORM,  PixelFormat::BC7U,      PixelFormat::BC7U_SRGB, +    PixelFormat::BC6H_SF16,  PixelFormat::BC6H_UF16, +}; +// Missing formats: +// PixelFormat::RGBA32I +// COMPRESSED_RG_RGTC2 + +constexpr std::array COPY_CLASS_64_BITS = { +    PixelFormat::RGBA16F, PixelFormat::RG32F,   PixelFormat::RGBA16UI,  PixelFormat::RG32UI, +    PixelFormat::RGBA16U, PixelFormat::RGBA16S, PixelFormat::DXT1_SRGB, PixelFormat::DXT1, + +}; +// Missing formats: +// PixelFormat::RGBA16I +// PixelFormat::RG32I, +// COMPRESSED_RGB_S3TC_DXT1_EXT +// COMPRESSED_SRGB_S3TC_DXT1_EXT +// COMPRESSED_RGBA_S3TC_DXT1_EXT +// COMPRESSED_SIGNED_RED_RGTC1 + +void Enable(FormatCompatibility::Table& compatiblity, size_t format_a, size_t format_b) { +    compatiblity[format_a][format_b] = true; +    compatiblity[format_b][format_a] = true; +} + +void Enable(FormatCompatibility::Table& compatibility, PixelFormat format_a, PixelFormat format_b) { +    Enable(compatibility, static_cast<size_t>(format_a), static_cast<size_t>(format_b)); +} + +template <typename Range> +void EnableRange(FormatCompatibility::Table& compatibility, const Range& range) { +    for (auto it_a = range.begin(); it_a != range.end(); ++it_a) { +        for (auto it_b = it_a; it_b != range.end(); ++it_b) { +            Enable(compatibility, *it_a, *it_b); +        } +    } +} + +} // Anonymous namespace + +FormatCompatibility::FormatCompatibility() { +    for (size_t i = 0; i < MaxPixelFormat; ++i) { +        // Identity is allowed +        Enable(view, i, i); +    } + +    EnableRange(view, VIEW_CLASS_128_BITS); +    EnableRange(view, VIEW_CLASS_96_BITS); +    EnableRange(view, VIEW_CLASS_64_BITS); +    EnableRange(view, VIEW_CLASS_32_BITS); +    EnableRange(view, VIEW_CLASS_16_BITS); +    EnableRange(view, VIEW_CLASS_8_BITS); +    EnableRange(view, VIEW_CLASS_RGTC1_RED); +    EnableRange(view, VIEW_CLASS_RGTC2_RG); +    EnableRange(view, VIEW_CLASS_BPTC_UNORM); +    EnableRange(view, VIEW_CLASS_BPTC_FLOAT); + +    copy = view; +    EnableRange(copy, COPY_CLASS_128_BITS); +    EnableRange(copy, COPY_CLASS_64_BITS); +} + +} // namespace VideoCore::Surface diff --git a/src/video_core/compatible_formats.h b/src/video_core/compatible_formats.h new file mode 100644 index 000000000..d1082566d --- /dev/null +++ b/src/video_core/compatible_formats.h @@ -0,0 +1,32 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <array> +#include <bitset> +#include <cstddef> + +#include "video_core/surface.h" + +namespace VideoCore::Surface { + +class FormatCompatibility { +public: +    using Table = std::array<std::bitset<MaxPixelFormat>, MaxPixelFormat>; + +    explicit FormatCompatibility(); + +    bool TestView(PixelFormat format_a, PixelFormat format_b) const noexcept { +        return view[static_cast<size_t>(format_a)][static_cast<size_t>(format_b)]; +    } + +    bool TestCopy(PixelFormat format_a, PixelFormat format_b) const noexcept { +        return copy[static_cast<size_t>(format_a)][static_cast<size_t>(format_b)]; +    } + +private: +    Table view; +    Table copy; +}; + +} // namespace VideoCore::Surface diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index ea3c8a963..c01436295 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -128,7 +128,7 @@ void Maxwell3D::CallMacroMethod(u32 method, const std::vector<u32>& parameters)          ((method - MacroRegistersStart) >> 1) % static_cast<u32>(macro_positions.size());      // Execute the current macro. -    macro_engine->Execute(macro_positions[entry], parameters); +    macro_engine->Execute(*this, macro_positions[entry], parameters);      if (mme_draw.current_mode != MMEDrawMode::Undefined) {          FlushMMEInlineDraw();      } diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index d5fe25065..ef1618990 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -1418,6 +1418,14 @@ public:          return execute_on;      } +    VideoCore::RasterizerInterface& GetRasterizer() { +        return rasterizer; +    } + +    const VideoCore::RasterizerInterface& GetRasterizer() const { +        return rasterizer; +    } +      /// Notify a memory write has happened.      void OnMemoryWrite() {          dirty.flags |= dirty.on_write_stores; diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index e7cb87589..d374b73cf 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -661,6 +661,10 @@ union Instruction {      constexpr Instruction(u64 value) : value{value} {}      constexpr Instruction(const Instruction& instr) : value(instr.value) {} +    constexpr bool Bit(u64 offset) const { +        return ((value >> offset) & 1) != 0; +    } +      BitField<0, 8, Register> gpr0;      BitField<8, 8, Register> gpr8;      union { @@ -1874,7 +1878,9 @@ public:          HSETP2_C,          HSETP2_R,          HSETP2_IMM, +        HSET2_C,          HSET2_R, +        HSET2_IMM,          POPC_C,          POPC_R,          POPC_IMM, @@ -2194,7 +2200,9 @@ private:              INST("0111111-1-------", Id::HSETP2_C, Type::HalfSetPredicate, "HSETP2_C"),              INST("0101110100100---", Id::HSETP2_R, Type::HalfSetPredicate, "HSETP2_R"),              INST("0111111-0-------", Id::HSETP2_IMM, Type::HalfSetPredicate, "HSETP2_IMM"), +            INST("0111110-1-------", Id::HSET2_C, Type::HalfSet, "HSET2_C"),              INST("0101110100011---", Id::HSET2_R, Type::HalfSet, "HSET2_R"), +            INST("0111110-0-------", Id::HSET2_IMM, Type::HalfSet, "HSET2_IMM"),              INST("010110111010----", Id::FCMP_RR, Type::Arithmetic, "FCMP_RR"),              INST("010010111010----", Id::FCMP_RC, Type::Arithmetic, "FCMP_RC"),              INST("0101000010000---", Id::MUFU, Type::Arithmetic, "MUFU"), diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 8eb017f65..482e49711 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -2,6 +2,8 @@  // Licensed under GPLv2 or any later version  // Refer to the license.txt file included. +#include <chrono> +  #include "common/assert.h"  #include "common/microprofile.h"  #include "core/core.h" @@ -154,8 +156,7 @@ u64 GPU::GetTicks() const {      constexpr u64 gpu_ticks_num = 384;      constexpr u64 gpu_ticks_den = 625; -    const u64 cpu_ticks = system.CoreTiming().GetTicks(); -    u64 nanoseconds = Core::Timing::CyclesToNs(cpu_ticks).count(); +    u64 nanoseconds = system.CoreTiming().GetGlobalTimeNs().count();      if (Settings::values.use_fast_gpu_time) {          nanoseconds /= 256;      } diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index a1b4c305c..2c42483bd 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -284,6 +284,12 @@ public:      /// core timing events.      virtual void Start() = 0; +    /// Obtain the CPU Context +    virtual void ObtainContext() = 0; + +    /// Release the CPU Context +    virtual void ReleaseContext() = 0; +      /// Push GPU command entries to be processed      virtual void PushGPUEntries(Tegra::CommandList&& entries) = 0; diff --git a/src/video_core/gpu_asynch.cpp b/src/video_core/gpu_asynch.cpp index 53305ab43..7b855f63e 100644 --- a/src/video_core/gpu_asynch.cpp +++ b/src/video_core/gpu_asynch.cpp @@ -19,10 +19,17 @@ GPUAsynch::GPUAsynch(Core::System& system, std::unique_ptr<VideoCore::RendererBa  GPUAsynch::~GPUAsynch() = default;  void GPUAsynch::Start() { -    cpu_context->MakeCurrent();      gpu_thread.StartThread(*renderer, *gpu_context, *dma_pusher);  } +void GPUAsynch::ObtainContext() { +    cpu_context->MakeCurrent(); +} + +void GPUAsynch::ReleaseContext() { +    cpu_context->DoneCurrent(); +} +  void GPUAsynch::PushGPUEntries(Tegra::CommandList&& entries) {      gpu_thread.SubmitList(std::move(entries));  } diff --git a/src/video_core/gpu_asynch.h b/src/video_core/gpu_asynch.h index 517658612..15e9f1d38 100644 --- a/src/video_core/gpu_asynch.h +++ b/src/video_core/gpu_asynch.h @@ -25,6 +25,8 @@ public:      ~GPUAsynch() override;      void Start() override; +    void ObtainContext() override; +    void ReleaseContext() override;      void PushGPUEntries(Tegra::CommandList&& entries) override;      void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;      void FlushRegion(VAddr addr, u64 size) override; diff --git a/src/video_core/gpu_synch.cpp b/src/video_core/gpu_synch.cpp index 6f38a672a..aaeb9811d 100644 --- a/src/video_core/gpu_synch.cpp +++ b/src/video_core/gpu_synch.cpp @@ -13,10 +13,16 @@ GPUSynch::GPUSynch(Core::System& system, std::unique_ptr<VideoCore::RendererBase  GPUSynch::~GPUSynch() = default; -void GPUSynch::Start() { +void GPUSynch::Start() {} + +void GPUSynch::ObtainContext() {      context->MakeCurrent();  } +void GPUSynch::ReleaseContext() { +    context->DoneCurrent(); +} +  void GPUSynch::PushGPUEntries(Tegra::CommandList&& entries) {      dma_pusher->Push(std::move(entries));      dma_pusher->DispatchCalls(); diff --git a/src/video_core/gpu_synch.h b/src/video_core/gpu_synch.h index 4a6e9a01d..762c20aa5 100644 --- a/src/video_core/gpu_synch.h +++ b/src/video_core/gpu_synch.h @@ -24,6 +24,8 @@ public:      ~GPUSynch() override;      void Start() override; +    void ObtainContext() override; +    void ReleaseContext() override;      void PushGPUEntries(Tegra::CommandList&& entries) override;      void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;      void FlushRegion(VAddr addr, u64 size) override; diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp index c3bb4fe06..738c6f0c1 100644 --- a/src/video_core/gpu_thread.cpp +++ b/src/video_core/gpu_thread.cpp @@ -4,6 +4,7 @@  #include "common/assert.h"  #include "common/microprofile.h" +#include "common/thread.h"  #include "core/core.h"  #include "core/frontend/emu_window.h"  #include "core/settings.h" @@ -18,7 +19,11 @@ namespace VideoCommon::GPUThread {  static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,                        Core::Frontend::GraphicsContext& context, Tegra::DmaPusher& dma_pusher,                        SynchState& state) { -    MicroProfileOnThreadCreate("GpuThread"); +    std::string name = "yuzu:GPU"; +    MicroProfileOnThreadCreate(name.c_str()); +    Common::SetCurrentThreadName(name.c_str()); +    Common::SetCurrentThreadPriority(Common::ThreadPriority::High); +    system.RegisterHostThread();      // Wait for first GPU command before acquiring the window context      while (state.queue.Empty()) diff --git a/src/video_core/macro/macro.cpp b/src/video_core/macro/macro.cpp index 89077a2d8..a50e7b4e0 100644 --- a/src/video_core/macro/macro.cpp +++ b/src/video_core/macro/macro.cpp @@ -2,32 +2,78 @@  // Licensed under GPLv2 or any later version  // Refer to the license.txt file included. +#include <optional> +#include <boost/container_hash/hash.hpp>  #include "common/assert.h"  #include "common/logging/log.h"  #include "core/settings.h" +#include "video_core/engines/maxwell_3d.h"  #include "video_core/macro/macro.h" +#include "video_core/macro/macro_hle.h"  #include "video_core/macro/macro_interpreter.h"  #include "video_core/macro/macro_jit_x64.h"  namespace Tegra { +MacroEngine::MacroEngine(Engines::Maxwell3D& maxwell3d) +    : hle_macros{std::make_unique<Tegra::HLEMacro>(maxwell3d)} {} + +MacroEngine::~MacroEngine() = default; +  void MacroEngine::AddCode(u32 method, u32 data) {      uploaded_macro_code[method].push_back(data);  } -void MacroEngine::Execute(u32 method, const std::vector<u32>& parameters) { +void MacroEngine::Execute(Engines::Maxwell3D& maxwell3d, u32 method, +                          const std::vector<u32>& parameters) {      auto compiled_macro = macro_cache.find(method);      if (compiled_macro != macro_cache.end()) { -        compiled_macro->second->Execute(parameters, method); +        const auto& cache_info = compiled_macro->second; +        if (cache_info.has_hle_program) { +            cache_info.hle_program->Execute(parameters, method); +        } else { +            cache_info.lle_program->Execute(parameters, method); +        }      } else {          // Macro not compiled, check if it's uploaded and if so, compile it -        auto macro_code = uploaded_macro_code.find(method); +        std::optional<u32> mid_method = std::nullopt; +        const auto macro_code = uploaded_macro_code.find(method);          if (macro_code == uploaded_macro_code.end()) { -            UNREACHABLE_MSG("Macro 0x{0:x} was not uploaded", method); -            return; +            for (const auto& [method_base, code] : uploaded_macro_code) { +                if (method >= method_base && (method - method_base) < code.size()) { +                    mid_method = method_base; +                    break; +                } +            } +            if (!mid_method.has_value()) { +                UNREACHABLE_MSG("Macro 0x{0:x} was not uploaded", method); +                return; +            } +        } +        auto& cache_info = macro_cache[method]; + +        if (!mid_method.has_value()) { +            cache_info.lle_program = Compile(macro_code->second); +            cache_info.hash = boost::hash_value(macro_code->second); +        } else { +            const auto& macro_cached = uploaded_macro_code[mid_method.value()]; +            const auto rebased_method = method - mid_method.value(); +            auto& code = uploaded_macro_code[method]; +            code.resize(macro_cached.size() - rebased_method); +            std::memcpy(code.data(), macro_cached.data() + rebased_method, +                        code.size() * sizeof(u32)); +            cache_info.hash = boost::hash_value(code); +            cache_info.lle_program = Compile(code); +        } + +        auto hle_program = hle_macros->GetHLEProgram(cache_info.hash); +        if (hle_program.has_value()) { +            cache_info.has_hle_program = true; +            cache_info.hle_program = std::move(hle_program.value()); +            cache_info.hle_program->Execute(parameters, method); +        } else { +            cache_info.lle_program->Execute(parameters, method);          } -        macro_cache[method] = Compile(macro_code->second); -        macro_cache[method]->Execute(parameters, method);      }  } diff --git a/src/video_core/macro/macro.h b/src/video_core/macro/macro.h index b76ed891f..4d00b84b0 100644 --- a/src/video_core/macro/macro.h +++ b/src/video_core/macro/macro.h @@ -11,9 +11,11 @@  #include "common/common_types.h"  namespace Tegra { +  namespace Engines {  class Maxwell3D;  } +  namespace Macro {  constexpr std::size_t NUM_MACRO_REGISTERS = 8;  enum class Operation : u32 { @@ -94,6 +96,8 @@ union MethodAddress {  } // namespace Macro +class HLEMacro; +  class CachedMacro {  public:      virtual ~CachedMacro() = default; @@ -107,20 +111,29 @@ public:  class MacroEngine {  public: -    virtual ~MacroEngine() = default; +    explicit MacroEngine(Engines::Maxwell3D& maxwell3d); +    virtual ~MacroEngine();      // Store the uploaded macro code to compile them when they're called.      void AddCode(u32 method, u32 data);      // Compiles the macro if its not in the cache, and executes the compiled macro -    void Execute(u32 method, const std::vector<u32>& parameters); +    void Execute(Engines::Maxwell3D& maxwell3d, u32 method, const std::vector<u32>& parameters);  protected:      virtual std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) = 0;  private: -    std::unordered_map<u32, std::unique_ptr<CachedMacro>> macro_cache; +    struct CacheInfo { +        std::unique_ptr<CachedMacro> lle_program{}; +        std::unique_ptr<CachedMacro> hle_program{}; +        u64 hash{}; +        bool has_hle_program{}; +    }; + +    std::unordered_map<u32, CacheInfo> macro_cache;      std::unordered_map<u32, std::vector<u32>> uploaded_macro_code; +    std::unique_ptr<HLEMacro> hle_macros;  };  std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d); diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp new file mode 100644 index 000000000..410f99018 --- /dev/null +++ b/src/video_core/macro/macro_hle.cpp @@ -0,0 +1,113 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <array> +#include <vector> +#include "video_core/engines/maxwell_3d.h" +#include "video_core/macro/macro_hle.h" +#include "video_core/rasterizer_interface.h" + +namespace Tegra { + +namespace { +// HLE'd functions +static void HLE_771BB18C62444DA0(Engines::Maxwell3D& maxwell3d, +                                 const std::vector<u32>& parameters) { +    const u32 instance_count = parameters[2] & maxwell3d.GetRegisterValue(0xD1B); + +    maxwell3d.regs.draw.topology.Assign( +        static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0] & +                                                                        ~(0x3ffffff << 26))); +    maxwell3d.regs.vb_base_instance = parameters[5]; +    maxwell3d.mme_draw.instance_count = instance_count; +    maxwell3d.regs.vb_element_base = parameters[3]; +    maxwell3d.regs.index_array.count = parameters[1]; +    maxwell3d.regs.index_array.first = parameters[4]; + +    if (maxwell3d.ShouldExecute()) { +        maxwell3d.GetRasterizer().Draw(true, true); +    } +    maxwell3d.regs.index_array.count = 0; +    maxwell3d.mme_draw.instance_count = 0; +    maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined; +} + +static void HLE_0D61FC9FAAC9FCAD(Engines::Maxwell3D& maxwell3d, +                                 const std::vector<u32>& parameters) { +    const u32 count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]); + +    maxwell3d.regs.vertex_buffer.first = parameters[3]; +    maxwell3d.regs.vertex_buffer.count = parameters[1]; +    maxwell3d.regs.vb_base_instance = parameters[4]; +    maxwell3d.regs.draw.topology.Assign( +        static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0])); +    maxwell3d.mme_draw.instance_count = count; + +    if (maxwell3d.ShouldExecute()) { +        maxwell3d.GetRasterizer().Draw(false, true); +    } +    maxwell3d.regs.vertex_buffer.count = 0; +    maxwell3d.mme_draw.instance_count = 0; +    maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined; +} + +static void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d, +                                 const std::vector<u32>& parameters) { +    const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]); +    const u32 element_base = parameters[4]; +    const u32 base_instance = parameters[5]; +    maxwell3d.regs.index_array.first = parameters[3]; +    maxwell3d.regs.reg_array[0x446] = element_base; // vertex id base? +    maxwell3d.regs.index_array.count = parameters[1]; +    maxwell3d.regs.vb_element_base = element_base; +    maxwell3d.regs.vb_base_instance = base_instance; +    maxwell3d.mme_draw.instance_count = instance_count; +    maxwell3d.CallMethodFromMME(0x8e3, 0x640); +    maxwell3d.CallMethodFromMME(0x8e4, element_base); +    maxwell3d.CallMethodFromMME(0x8e5, base_instance); +    maxwell3d.regs.draw.topology.Assign( +        static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0])); +    if (maxwell3d.ShouldExecute()) { +        maxwell3d.GetRasterizer().Draw(true, true); +    } +    maxwell3d.regs.reg_array[0x446] = 0x0; // vertex id base? +    maxwell3d.regs.index_array.count = 0; +    maxwell3d.regs.vb_element_base = 0x0; +    maxwell3d.regs.vb_base_instance = 0x0; +    maxwell3d.mme_draw.instance_count = 0; +    maxwell3d.CallMethodFromMME(0x8e3, 0x640); +    maxwell3d.CallMethodFromMME(0x8e4, 0x0); +    maxwell3d.CallMethodFromMME(0x8e5, 0x0); +    maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined; +} +} // namespace + +constexpr std::array<std::pair<u64, HLEFunction>, 3> hle_funcs{{ +    std::make_pair<u64, HLEFunction>(0x771BB18C62444DA0, &HLE_771BB18C62444DA0), +    std::make_pair<u64, HLEFunction>(0x0D61FC9FAAC9FCAD, &HLE_0D61FC9FAAC9FCAD), +    std::make_pair<u64, HLEFunction>(0x0217920100488FF7, &HLE_0217920100488FF7), +}}; + +HLEMacro::HLEMacro(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} +HLEMacro::~HLEMacro() = default; + +std::optional<std::unique_ptr<CachedMacro>> HLEMacro::GetHLEProgram(u64 hash) const { +    const auto it = std::find_if(hle_funcs.cbegin(), hle_funcs.cend(), +                                 [hash](const auto& pair) { return pair.first == hash; }); +    if (it == hle_funcs.end()) { +        return std::nullopt; +    } +    return std::make_unique<HLEMacroImpl>(maxwell3d, it->second); +} + +HLEMacroImpl::~HLEMacroImpl() = default; + +HLEMacroImpl::HLEMacroImpl(Engines::Maxwell3D& maxwell3d, HLEFunction func) +    : maxwell3d(maxwell3d), func(func) {} + +void HLEMacroImpl::Execute(const std::vector<u32>& parameters, u32 method) { +    func(maxwell3d, parameters); +} + +} // namespace Tegra diff --git a/src/video_core/macro/macro_hle.h b/src/video_core/macro/macro_hle.h new file mode 100644 index 000000000..37af875a0 --- /dev/null +++ b/src/video_core/macro/macro_hle.h @@ -0,0 +1,44 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <memory> +#include <optional> +#include <vector> +#include "common/common_types.h" +#include "video_core/macro/macro.h" + +namespace Tegra { + +namespace Engines { +class Maxwell3D; +} + +using HLEFunction = void (*)(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters); + +class HLEMacro { +public: +    explicit HLEMacro(Engines::Maxwell3D& maxwell3d); +    ~HLEMacro(); + +    std::optional<std::unique_ptr<CachedMacro>> GetHLEProgram(u64 hash) const; + +private: +    Engines::Maxwell3D& maxwell3d; +}; + +class HLEMacroImpl : public CachedMacro { +public: +    explicit HLEMacroImpl(Engines::Maxwell3D& maxwell3d, HLEFunction func); +    ~HLEMacroImpl(); + +    void Execute(const std::vector<u32>& parameters, u32 method) override; + +private: +    Engines::Maxwell3D& maxwell3d; +    HLEFunction func; +}; + +} // namespace Tegra diff --git a/src/video_core/macro/macro_interpreter.cpp b/src/video_core/macro/macro_interpreter.cpp index 5edff27aa..aa5256419 100644 --- a/src/video_core/macro/macro_interpreter.cpp +++ b/src/video_core/macro/macro_interpreter.cpp @@ -11,7 +11,8 @@  MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192));  namespace Tegra { -MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} +MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) +    : MacroEngine::MacroEngine(maxwell3d), maxwell3d(maxwell3d) {}  std::unique_ptr<CachedMacro> MacroInterpreter::Compile(const std::vector<u32>& code) {      return std::make_unique<MacroInterpreterImpl>(maxwell3d, code); diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp index 389b58989..07292702f 100644 --- a/src/video_core/macro/macro_jit_x64.cpp +++ b/src/video_core/macro/macro_jit_x64.cpp @@ -28,7 +28,8 @@ static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({      BRANCH_HOLDER,  }); -MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} +MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d) +    : MacroEngine::MacroEngine(maxwell3d), maxwell3d(maxwell3d) {}  std::unique_ptr<CachedMacro> MacroJITx64::Compile(const std::vector<u32>& code) {      return std::make_unique<MacroJITx64Impl>(maxwell3d, code); @@ -553,7 +554,7 @@ Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) {  }  void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u32 reg) { -    auto SetRegister = [=](u32 reg, Xbyak::Reg32 result) { +    const auto SetRegister = [this](u32 reg, const Xbyak::Reg32& result) {          // Register 0 is supposed to always return 0. NOP is implemented as a store to the zero          // register.          if (reg == 0) { @@ -561,7 +562,7 @@ void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u3          }          mov(dword[STATE + offsetof(JITState, registers) + reg * sizeof(u32)], result);      }; -    auto SetMethodAddress = [=](Xbyak::Reg32 reg) { mov(METHOD_ADDRESS, reg); }; +    const auto SetMethodAddress = [this](const Xbyak::Reg32& reg) { mov(METHOD_ADDRESS, reg); };      switch (operation) {      case Macro::ResultOperation::IgnoreAndFetch: diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index dbee9f634..ff5505d12 100644 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -210,10 +210,11 @@ bool MemoryManager::IsBlockContinuous(const GPUVAddr start, const std::size_t si      return range == inner_size;  } -void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::size_t size) const { +void MemoryManager::ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer, +                              const std::size_t size) const {      std::size_t remaining_size{size}; -    std::size_t page_index{src_addr >> page_bits}; -    std::size_t page_offset{src_addr & page_mask}; +    std::size_t page_index{gpu_src_addr >> page_bits}; +    std::size_t page_offset{gpu_src_addr & page_mask};      auto& memory = system.Memory(); @@ -234,11 +235,11 @@ void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::s      }  } -void MemoryManager::ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer, +void MemoryManager::ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer,                                      const std::size_t size) const {      std::size_t remaining_size{size}; -    std::size_t page_index{src_addr >> page_bits}; -    std::size_t page_offset{src_addr & page_mask}; +    std::size_t page_index{gpu_src_addr >> page_bits}; +    std::size_t page_offset{gpu_src_addr & page_mask};      auto& memory = system.Memory(); @@ -259,10 +260,11 @@ void MemoryManager::ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer,      }  } -void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const std::size_t size) { +void MemoryManager::WriteBlock(GPUVAddr gpu_dest_addr, const void* src_buffer, +                               const std::size_t size) {      std::size_t remaining_size{size}; -    std::size_t page_index{dest_addr >> page_bits}; -    std::size_t page_offset{dest_addr & page_mask}; +    std::size_t page_index{gpu_dest_addr >> page_bits}; +    std::size_t page_offset{gpu_dest_addr & page_mask};      auto& memory = system.Memory(); @@ -283,11 +285,11 @@ void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const      }  } -void MemoryManager::WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer, +void MemoryManager::WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer,                                       const std::size_t size) {      std::size_t remaining_size{size}; -    std::size_t page_index{dest_addr >> page_bits}; -    std::size_t page_offset{dest_addr & page_mask}; +    std::size_t page_index{gpu_dest_addr >> page_bits}; +    std::size_t page_offset{gpu_dest_addr & page_mask};      auto& memory = system.Memory(); @@ -306,16 +308,18 @@ void MemoryManager::WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer,      }  } -void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size) { +void MemoryManager::CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, +                              const std::size_t size) {      std::vector<u8> tmp_buffer(size); -    ReadBlock(src_addr, tmp_buffer.data(), size); -    WriteBlock(dest_addr, tmp_buffer.data(), size); +    ReadBlock(gpu_src_addr, tmp_buffer.data(), size); +    WriteBlock(gpu_dest_addr, tmp_buffer.data(), size);  } -void MemoryManager::CopyBlockUnsafe(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size) { +void MemoryManager::CopyBlockUnsafe(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, +                                    const std::size_t size) {      std::vector<u8> tmp_buffer(size); -    ReadBlockUnsafe(src_addr, tmp_buffer.data(), size); -    WriteBlockUnsafe(dest_addr, tmp_buffer.data(), size); +    ReadBlockUnsafe(gpu_src_addr, tmp_buffer.data(), size); +    WriteBlockUnsafe(gpu_dest_addr, tmp_buffer.data(), size);  }  bool MemoryManager::IsGranularRange(GPUVAddr gpu_addr, std::size_t size) { diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index 0ddd52d5a..87658e87a 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -79,9 +79,9 @@ public:       * in the Host Memory counterpart. Note: This functions cause Host GPU Memory       * Flushes and Invalidations, respectively to each operation.       */ -    void ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const; -    void WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std::size_t size); -    void CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size); +    void ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const; +    void WriteBlock(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size); +    void CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size);      /**       * ReadBlockUnsafe and WriteBlockUnsafe are special versions of ReadBlock and @@ -93,9 +93,9 @@ public:       * WriteBlockUnsafe instead of WriteBlock since it shouldn't invalidate the texture       * being flushed.       */ -    void ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const; -    void WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer, std::size_t size); -    void CopyBlockUnsafe(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size); +    void ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const; +    void WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size); +    void CopyBlockUnsafe(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size);      /**       * IsGranularRange checks if a gpu region can be simply read with a pointer diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp index ad0577a4f..e461e4c70 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp @@ -22,21 +22,53 @@ using Maxwell = Tegra::Engines::Maxwell3D::Regs;  MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128)); -Buffer::Buffer(VAddr cpu_addr, const std::size_t size) : VideoCommon::BufferBlock{cpu_addr, size} { +Buffer::Buffer(const Device& device, VAddr cpu_addr, std::size_t size) +    : VideoCommon::BufferBlock{cpu_addr, size} {      gl_buffer.Create();      glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW); +    if (device.HasVertexBufferUnifiedMemory()) { +        glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_WRITE); +        glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address); +    }  }  Buffer::~Buffer() = default; +void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) { +    glNamedBufferSubData(Handle(), static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size), +                         data); +} + +void Buffer::Download(std::size_t offset, std::size_t size, u8* data) { +    MICROPROFILE_SCOPE(OpenGL_Buffer_Download); +    const GLsizeiptr gl_size = static_cast<GLsizeiptr>(size); +    const GLintptr gl_offset = static_cast<GLintptr>(offset); +    if (read_buffer.handle == 0) { +        read_buffer.Create(); +        glNamedBufferData(read_buffer.handle, static_cast<GLsizeiptr>(Size()), nullptr, +                          GL_STREAM_READ); +    } +    glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT); +    glCopyNamedBufferSubData(gl_buffer.handle, read_buffer.handle, gl_offset, gl_offset, gl_size); +    glGetNamedBufferSubData(read_buffer.handle, gl_offset, gl_size, data); +} + +void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, +                      std::size_t size) { +    glCopyNamedBufferSubData(src.Handle(), Handle(), static_cast<GLintptr>(src_offset), +                             static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size)); +} +  OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, -                               const Device& device, std::size_t stream_size) -    : GenericBufferCache{rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} { +                               const Device& device_, std::size_t stream_size) +    : GenericBufferCache{rasterizer, system, +                         std::make_unique<OGLStreamBuffer>(device_, stream_size, true)}, +      device{device_} {      if (!device.HasFastBufferSubData()) {          return;      } -    static constexpr auto size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize); +    static constexpr GLsizeiptr size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize);      glCreateBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs));      for (const GLuint cbuf : cbufs) {          glNamedBufferData(cbuf, size, nullptr, GL_STREAM_DRAW); @@ -48,39 +80,20 @@ OGLBufferCache::~OGLBufferCache() {  }  std::shared_ptr<Buffer> OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { -    return std::make_shared<Buffer>(cpu_addr, size); +    return std::make_shared<Buffer>(device, cpu_addr, size);  } -GLuint OGLBufferCache::GetEmptyBuffer(std::size_t) { -    return 0; -} - -void OGLBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, -                                     const u8* data) { -    glNamedBufferSubData(buffer.Handle(), static_cast<GLintptr>(offset), -                         static_cast<GLsizeiptr>(size), data); -} - -void OGLBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, -                                       u8* data) { -    MICROPROFILE_SCOPE(OpenGL_Buffer_Download); -    glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT); -    glGetNamedBufferSubData(buffer.Handle(), static_cast<GLintptr>(offset), -                            static_cast<GLsizeiptr>(size), data); -} - -void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset, -                               std::size_t dst_offset, std::size_t size) { -    glCopyNamedBufferSubData(src.Handle(), dst.Handle(), static_cast<GLintptr>(src_offset), -                             static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size)); +OGLBufferCache::BufferInfo OGLBufferCache::GetEmptyBuffer(std::size_t) { +    return {0, 0, 0};  }  OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer,                                                               std::size_t size) {      DEBUG_ASSERT(cbuf_cursor < std::size(cbufs));      const GLuint cbuf = cbufs[cbuf_cursor++]; +      glNamedBufferSubData(cbuf, 0, static_cast<GLsizeiptr>(size), raw_pointer); -    return {cbuf, 0}; +    return {cbuf, 0, 0};  }  } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index a49aaf9c4..88fdc0536 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -25,15 +25,28 @@ class RasterizerOpenGL;  class Buffer : public VideoCommon::BufferBlock {  public: -    explicit Buffer(VAddr cpu_addr, const std::size_t size); +    explicit Buffer(const Device& device, VAddr cpu_addr, std::size_t size);      ~Buffer(); -    GLuint Handle() const { +    void Upload(std::size_t offset, std::size_t size, const u8* data); + +    void Download(std::size_t offset, std::size_t size, u8* data); + +    void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, +                  std::size_t size); + +    GLuint Handle() const noexcept {          return gl_buffer.handle;      } +    u64 Address() const noexcept { +        return gpu_address; +    } +  private:      OGLBuffer gl_buffer; +    OGLBuffer read_buffer; +    u64 gpu_address = 0;  };  using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>; @@ -43,7 +56,7 @@ public:                              const Device& device, std::size_t stream_size);      ~OGLBufferCache(); -    GLuint GetEmptyBuffer(std::size_t) override; +    BufferInfo GetEmptyBuffer(std::size_t) override;      void Acquire() noexcept {          cbuf_cursor = 0; @@ -52,22 +65,16 @@ public:  protected:      std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override; -    void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, -                         const u8* data) override; - -    void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, -                           u8* data) override; - -    void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset, -                   std::size_t dst_offset, std::size_t size) override; -      BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) override;  private: +    static constexpr std::size_t NUM_CBUFS = Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers * +                                             Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram; + +    const Device& device; +      std::size_t cbuf_cursor = 0; -    std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers * -                           Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram> -        cbufs; +    std::array<GLuint, NUM_CBUFS> cbufs{};  };  } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index b31d604e4..208fc6167 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -178,7 +178,7 @@ bool IsASTCSupported() {          for (const GLenum format : formats) {              for (const GLenum support : required_support) {                  GLint value; -                glGetInternalformativ(GL_TEXTURE_2D, format, support, 1, &value); +                glGetInternalformativ(target, format, support, 1, &value);                  if (value != GL_FULL_SUPPORT) {                      return false;                  } @@ -193,6 +193,7 @@ bool IsASTCSupported() {  Device::Device()      : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} {      const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR)); +    const std::string_view renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER));      const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION));      const std::vector extensions = GetExtensions(); @@ -216,12 +217,18 @@ Device::Device()      has_shader_ballot = GLAD_GL_ARB_shader_ballot;      has_vertex_viewport_layer = GLAD_GL_ARB_shader_viewport_layer_array;      has_image_load_formatted = HasExtension(extensions, "GL_EXT_shader_image_load_formatted"); +    has_texture_shadow_lod = HasExtension(extensions, "GL_EXT_texture_shadow_lod");      has_astc = IsASTCSupported();      has_variable_aoffi = TestVariableAoffi();      has_component_indexing_bug = is_amd;      has_precise_bug = TestPreciseBug(); -    has_fast_buffer_sub_data = is_nvidia && !disable_fast_buffer_sub_data;      has_nv_viewport_array2 = GLAD_GL_NV_viewport_array2; +    has_vertex_buffer_unified_memory = GLAD_GL_NV_vertex_buffer_unified_memory; + +    // At the moment of writing this, only Nvidia's driver optimizes BufferSubData on exclusive +    // uniform buffers as "push constants" +    has_fast_buffer_sub_data = is_nvidia && !disable_fast_buffer_sub_data; +      use_assembly_shaders = Settings::values.use_assembly_shaders && GLAD_GL_NV_gpu_program5 &&                             GLAD_GL_NV_compute_program5 && GLAD_GL_NV_transform_feedback &&                             GLAD_GL_NV_transform_feedback2; @@ -245,6 +252,7 @@ Device::Device(std::nullptr_t) {      has_shader_ballot = true;      has_vertex_viewport_layer = true;      has_image_load_formatted = true; +    has_texture_shadow_lod = true;      has_variable_aoffi = true;  } diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index 145347943..e1d811966 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h @@ -68,6 +68,14 @@ public:          return has_image_load_formatted;      } +    bool HasTextureShadowLod() const { +        return has_texture_shadow_lod; +    } + +    bool HasVertexBufferUnifiedMemory() const { +        return has_vertex_buffer_unified_memory; +    } +      bool HasASTC() const {          return has_astc;      } @@ -110,6 +118,8 @@ private:      bool has_shader_ballot{};      bool has_vertex_viewport_layer{};      bool has_image_load_formatted{}; +    bool has_texture_shadow_lod{}; +    bool has_vertex_buffer_unified_memory{};      bool has_astc{};      bool has_variable_aoffi{};      bool has_component_indexing_bug{}; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 2d6c11320..e960a0ef1 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -61,7 +61,8 @@ constexpr std::size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE =  constexpr std::size_t TOTAL_CONST_BUFFER_BYTES =      NUM_CONST_BUFFERS_BYTES_PER_STAGE * Maxwell::MaxShaderStage; -constexpr std::size_t NumSupportedVertexAttributes = 16; +constexpr std::size_t NUM_SUPPORTED_VERTEX_ATTRIBUTES = 16; +constexpr std::size_t NUM_SUPPORTED_VERTEX_BINDINGS = 16;  template <typename Engine, typename Entry>  Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry, @@ -193,7 +194,7 @@ void RasterizerOpenGL::SetupVertexFormat() {      // avoid OpenGL errors.      // TODO(Subv): Analyze the shader to identify which attributes are actually used and don't      // assume every shader uses them all. -    for (std::size_t index = 0; index < NumSupportedVertexAttributes; ++index) { +    for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_ATTRIBUTES; ++index) {          if (!flags[Dirty::VertexFormat0 + index]) {              continue;          } @@ -212,9 +213,10 @@ void RasterizerOpenGL::SetupVertexFormat() {          if (attrib.type == Maxwell::VertexAttribute::Type::SignedInt ||              attrib.type == Maxwell::VertexAttribute::Type::UnsignedInt) {              glVertexAttribIFormat(gl_index, attrib.ComponentCount(), -                                  MaxwellToGL::VertexType(attrib), attrib.offset); +                                  MaxwellToGL::VertexFormat(attrib), attrib.offset);          } else { -            glVertexAttribFormat(gl_index, attrib.ComponentCount(), MaxwellToGL::VertexType(attrib), +            glVertexAttribFormat(gl_index, attrib.ComponentCount(), +                                 MaxwellToGL::VertexFormat(attrib),                                   attrib.IsNormalized() ? GL_TRUE : GL_FALSE, attrib.offset);          }          glVertexAttribBinding(gl_index, attrib.buffer); @@ -231,9 +233,11 @@ void RasterizerOpenGL::SetupVertexBuffer() {      MICROPROFILE_SCOPE(OpenGL_VB); +    const bool use_unified_memory = device.HasVertexBufferUnifiedMemory(); +      // Upload all guest vertex arrays sequentially to our buffer      const auto& regs = gpu.regs; -    for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) { +    for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_BINDINGS; ++index) {          if (!flags[Dirty::VertexBuffer0 + index]) {              continue;          } @@ -246,16 +250,25 @@ void RasterizerOpenGL::SetupVertexBuffer() {          const GPUVAddr start = vertex_array.StartAddress();          const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress(); -          ASSERT(end >= start); + +        const GLuint gl_index = static_cast<GLuint>(index);          const u64 size = end - start;          if (size == 0) { -            glBindVertexBuffer(static_cast<GLuint>(index), 0, 0, vertex_array.stride); +            glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride); +            if (use_unified_memory) { +                glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index, 0, 0); +            }              continue;          } -        const auto [vertex_buffer, vertex_buffer_offset] = buffer_cache.UploadMemory(start, size); -        glBindVertexBuffer(static_cast<GLuint>(index), vertex_buffer, vertex_buffer_offset, -                           vertex_array.stride); +        const auto info = buffer_cache.UploadMemory(start, size); +        if (use_unified_memory) { +            glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride); +            glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index, +                                   info.address + info.offset, size); +        } else { +            glBindVertexBuffer(gl_index, info.handle, info.offset, vertex_array.stride); +        }      }  } @@ -268,7 +281,7 @@ void RasterizerOpenGL::SetupVertexInstances() {      flags[Dirty::VertexInstances] = false;      const auto& regs = gpu.regs; -    for (std::size_t index = 0; index < NumSupportedVertexAttributes; ++index) { +    for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_ATTRIBUTES; ++index) {          if (!flags[Dirty::VertexInstance0 + index]) {              continue;          } @@ -285,9 +298,9 @@ GLintptr RasterizerOpenGL::SetupIndexBuffer() {      MICROPROFILE_SCOPE(OpenGL_Index);      const auto& regs = system.GPU().Maxwell3D().regs;      const std::size_t size = CalculateIndexBufferSize(); -    const auto [buffer, offset] = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size); -    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, buffer); -    return offset; +    const auto info = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size); +    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, info.handle); +    return info.offset;  }  void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { @@ -643,9 +656,9 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {      if (!device.UseAssemblyShaders()) {          MaxwellUniformData ubo;          ubo.SetFromRegs(gpu); -        const auto [buffer, offset] = +        const auto info =              buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment()); -        glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, buffer, offset, +        glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, info.handle, info.offset,                            static_cast<GLsizeiptr>(sizeof(ubo)));      } @@ -956,8 +969,7 @@ void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,          if (device.UseAssemblyShaders()) {              glBindBufferRangeNV(stage, entry.GetIndex(), 0, 0, 0);          } else { -            glBindBufferRange(GL_UNIFORM_BUFFER, binding, -                              buffer_cache.GetEmptyBuffer(sizeof(float)), 0, sizeof(float)); +            glBindBufferRange(GL_UNIFORM_BUFFER, binding, 0, 0, sizeof(float));          }          return;      } @@ -970,24 +982,25 @@ void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,      const std::size_t alignment = use_unified ? 4 : device.GetUniformBufferAlignment();      const GPUVAddr gpu_addr = buffer.address; -    auto [cbuf, offset] = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload); +    auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload);      if (device.UseAssemblyShaders()) {          UNIMPLEMENTED_IF(use_unified); -        if (offset != 0) { +        if (info.offset != 0) {              const GLuint staging_cbuf = staging_cbufs[current_cbuf++]; -            glCopyNamedBufferSubData(cbuf, staging_cbuf, offset, 0, size); -            cbuf = staging_cbuf; -            offset = 0; +            glCopyNamedBufferSubData(info.handle, staging_cbuf, info.offset, 0, size); +            info.handle = staging_cbuf; +            info.offset = 0;          } -        glBindBufferRangeNV(stage, binding, cbuf, offset, size); +        glBindBufferRangeNV(stage, binding, info.handle, info.offset, size);          return;      }      if (use_unified) { -        glCopyNamedBufferSubData(cbuf, unified_uniform_buffer.handle, offset, unified_offset, size); +        glCopyNamedBufferSubData(info.handle, unified_uniform_buffer.handle, info.offset, +                                 unified_offset, size);      } else { -        glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size); +        glBindBufferRange(GL_UNIFORM_BUFFER, binding, info.handle, info.offset, size);      }  } @@ -1023,9 +1036,8 @@ void RasterizerOpenGL::SetupComputeGlobalMemory(Shader* kernel) {  void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry,                                           GPUVAddr gpu_addr, std::size_t size) {      const auto alignment{device.GetShaderStorageBufferAlignment()}; -    const auto [ssbo, buffer_offset] = -        buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written); -    glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, ssbo, buffer_offset, +    const auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written); +    glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, info.handle, info.offset,                        static_cast<GLsizeiptr>(size));  } @@ -1712,8 +1724,9 @@ void RasterizerOpenGL::EndTransformFeedback() {          const GLuint handle = transform_feedback_buffers[index].handle;          const GPUVAddr gpu_addr = binding.Address();          const std::size_t size = binding.buffer_size; -        const auto [dest_buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true); -        glCopyNamedBufferSubData(handle, dest_buffer, 0, offset, static_cast<GLsizeiptr>(size)); +        const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true); +        glCopyNamedBufferSubData(handle, info.handle, 0, info.offset, +                                 static_cast<GLsizeiptr>(size));      }  } diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 46e780a06..c6a3bf3a1 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -460,8 +460,9 @@ Shader* ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {          const u8* host_ptr_b = memory_manager.GetPointer(address_b);          code_b = GetShaderCode(memory_manager, address_b, host_ptr_b, false);      } +    const std::size_t code_size = code.size() * sizeof(u64); -    const auto unique_identifier = GetUniqueIdentifier( +    const u64 unique_identifier = GetUniqueIdentifier(          GetShaderType(program), program == Maxwell::ShaderProgram::VertexA, code, code_b);      const ShaderParameters params{system,    disk_cache, device, @@ -477,7 +478,7 @@ Shader* ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {      Shader* const result = shader.get();      if (cpu_addr) { -        Register(std::move(shader), *cpu_addr, code.size() * sizeof(u64)); +        Register(std::move(shader), *cpu_addr, code_size);      } else {          null_shader = std::move(shader);      } @@ -495,8 +496,9 @@ Shader* ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) {      const auto host_ptr{memory_manager.GetPointer(code_addr)};      // No kernel found, create a new one -    auto code{GetShaderCode(memory_manager, code_addr, host_ptr, true)}; -    const auto unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)}; +    ProgramCode code{GetShaderCode(memory_manager, code_addr, host_ptr, true)}; +    const std::size_t code_size{code.size() * sizeof(u64)}; +    const u64 unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)};      const ShaderParameters params{system,    disk_cache, device,                                    *cpu_addr, host_ptr,   unique_identifier}; @@ -511,7 +513,7 @@ Shader* ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) {      Shader* const result = kernel.get();      if (cpu_addr) { -        Register(std::move(kernel), *cpu_addr, code.size() * sizeof(u64)); +        Register(std::move(kernel), *cpu_addr, code_size);      } else {          null_kernel = std::move(kernel);      } diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index 6848f1388..994aaeaf2 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -37,7 +37,6 @@ namespace OpenGL {  class Device;  class RasterizerOpenGL; -struct UnspecializedShader;  using Maxwell = Tegra::Engines::Maxwell3D::Regs; diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index d6e30b321..2c49aeaac 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -37,6 +37,7 @@ using Tegra::Shader::IpaMode;  using Tegra::Shader::IpaSampleMode;  using Tegra::Shader::PixelImap;  using Tegra::Shader::Register; +using Tegra::Shader::TextureType;  using VideoCommon::Shader::BuildTransformFeedback;  using VideoCommon::Shader::Registry; @@ -526,6 +527,9 @@ private:          if (device.HasImageLoadFormatted()) {              code.AddLine("#extension GL_EXT_shader_image_load_formatted : require");          } +        if (device.HasTextureShadowLod()) { +            code.AddLine("#extension GL_EXT_texture_shadow_lod : require"); +        }          if (device.HasWarpIntrinsics()) {              code.AddLine("#extension GL_NV_gpu_shader5 : require");              code.AddLine("#extension GL_NV_shader_thread_group : require"); @@ -909,13 +913,13 @@ private:                      return "samplerBuffer";                  }                  switch (sampler.type) { -                case Tegra::Shader::TextureType::Texture1D: +                case TextureType::Texture1D:                      return "sampler1D"; -                case Tegra::Shader::TextureType::Texture2D: +                case TextureType::Texture2D:                      return "sampler2D"; -                case Tegra::Shader::TextureType::Texture3D: +                case TextureType::Texture3D:                      return "sampler3D"; -                case Tegra::Shader::TextureType::TextureCube: +                case TextureType::TextureCube:                      return "samplerCube";                  default:                      UNREACHABLE(); @@ -1380,8 +1384,19 @@ private:          const std::size_t count = operation.GetOperandsCount();          const bool has_array = meta->sampler.is_array;          const bool has_shadow = meta->sampler.is_shadow; +        const bool workaround_lod_array_shadow_as_grad = +            !device.HasTextureShadowLod() && function_suffix == "Lod" && meta->sampler.is_shadow && +            ((meta->sampler.type == TextureType::Texture2D && meta->sampler.is_array) || +             meta->sampler.type == TextureType::TextureCube); + +        std::string expr = "texture"; + +        if (workaround_lod_array_shadow_as_grad) { +            expr += "Grad"; +        } else { +            expr += function_suffix; +        } -        std::string expr = "texture" + function_suffix;          if (!meta->aoffi.empty()) {              expr += "Offset";          } else if (!meta->ptp.empty()) { @@ -1415,6 +1430,16 @@ private:              expr += ')';          } +        if (workaround_lod_array_shadow_as_grad) { +            switch (meta->sampler.type) { +            case TextureType::Texture2D: +                return expr + ", vec2(0.0), vec2(0.0))"; +            case TextureType::TextureCube: +                return expr + ", vec3(0.0), vec3(0.0))"; +            } +            UNREACHABLE(); +        } +          for (const auto& variant : extras) {              if (const auto argument = std::get_if<TextureArgument>(&variant)) {                  expr += GenerateTextureArgument(*argument); @@ -2041,8 +2066,19 @@ private:          const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());          ASSERT(meta); -        std::string expr = GenerateTexture( -            operation, "Lod", {TextureArgument{Type::Float, meta->lod}, TextureOffset{}}); +        std::string expr{}; + +        if (!device.HasTextureShadowLod() && meta->sampler.is_shadow && +            ((meta->sampler.type == TextureType::Texture2D && meta->sampler.is_array) || +             meta->sampler.type == TextureType::TextureCube)) { +            LOG_ERROR(Render_OpenGL, +                      "Device lacks GL_EXT_texture_shadow_lod, using textureGrad as a workaround"); +            expr = GenerateTexture(operation, "Lod", {}); +        } else { +            expr = GenerateTexture(operation, "Lod", +                                   {TextureArgument{Type::Float, meta->lod}, TextureOffset{}}); +        } +          if (meta->sampler.is_shadow) {              expr = "vec4(" + expr + ')';          } diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp index 932a2f69e..3655ff629 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp +++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp @@ -2,11 +2,13 @@  // Licensed under GPLv2 or any later version  // Refer to the license.txt file included. -#include <deque> +#include <tuple>  #include <vector> +  #include "common/alignment.h"  #include "common/assert.h"  #include "common/microprofile.h" +#include "video_core/renderer_opengl/gl_device.h"  #include "video_core/renderer_opengl/gl_stream_buffer.h"  MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning", @@ -14,8 +16,7 @@ MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning",  namespace OpenGL { -OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent, -                                 bool use_persistent) +OGLStreamBuffer::OGLStreamBuffer(const Device& device, GLsizeiptr size, bool vertex_data_usage)      : buffer_size(size) {      gl_buffer.Create(); @@ -29,23 +30,19 @@ OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool p          allocate_size *= 2;      } -    if (use_persistent) { -        persistent = true; -        coherent = prefer_coherent; -        const GLbitfield flags = -            GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0); -        glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags); -        mapped_ptr = static_cast<u8*>(glMapNamedBufferRange( -            gl_buffer.handle, 0, buffer_size, flags | (coherent ? 0 : GL_MAP_FLUSH_EXPLICIT_BIT))); -    } else { -        glNamedBufferData(gl_buffer.handle, allocate_size, nullptr, GL_STREAM_DRAW); +    static constexpr GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT; +    glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags); +    mapped_ptr = static_cast<u8*>( +        glMapNamedBufferRange(gl_buffer.handle, 0, buffer_size, flags | GL_MAP_FLUSH_EXPLICIT_BIT)); + +    if (device.HasVertexBufferUnifiedMemory()) { +        glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_ONLY); +        glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);      }  }  OGLStreamBuffer::~OGLStreamBuffer() { -    if (persistent) { -        glUnmapNamedBuffer(gl_buffer.handle); -    } +    glUnmapNamedBuffer(gl_buffer.handle);      gl_buffer.Release();  } @@ -60,36 +57,21 @@ std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr a      bool invalidate = false;      if (buffer_pos + size > buffer_size) { +        MICROPROFILE_SCOPE(OpenGL_StreamBuffer); +        glInvalidateBufferData(gl_buffer.handle); +          buffer_pos = 0;          invalidate = true; - -        if (persistent) { -            glUnmapNamedBuffer(gl_buffer.handle); -        }      } -    if (invalidate || !persistent) { -        MICROPROFILE_SCOPE(OpenGL_StreamBuffer); -        GLbitfield flags = GL_MAP_WRITE_BIT | (persistent ? GL_MAP_PERSISTENT_BIT : 0) | -                           (coherent ? GL_MAP_COHERENT_BIT : GL_MAP_FLUSH_EXPLICIT_BIT) | -                           (invalidate ? GL_MAP_INVALIDATE_BUFFER_BIT : GL_MAP_UNSYNCHRONIZED_BIT); -        mapped_ptr = static_cast<u8*>( -            glMapNamedBufferRange(gl_buffer.handle, buffer_pos, buffer_size - buffer_pos, flags)); -        mapped_offset = buffer_pos; -    } - -    return std::make_tuple(mapped_ptr + buffer_pos - mapped_offset, buffer_pos, invalidate); +    return std::make_tuple(mapped_ptr + buffer_pos, buffer_pos, invalidate);  }  void OGLStreamBuffer::Unmap(GLsizeiptr size) {      ASSERT(size <= mapped_size); -    if (!coherent && size > 0) { -        glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos - mapped_offset, size); -    } - -    if (!persistent) { -        glUnmapNamedBuffer(gl_buffer.handle); +    if (size > 0) { +        glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos, size);      }      buffer_pos += size; diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h index 866da3594..307a67113 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.h +++ b/src/video_core/renderer_opengl/gl_stream_buffer.h @@ -11,10 +11,11 @@  namespace OpenGL { +class Device; +  class OGLStreamBuffer : private NonCopyable {  public: -    explicit OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent = false, -                             bool use_persistent = true); +    explicit OGLStreamBuffer(const Device& device, GLsizeiptr size, bool vertex_data_usage);      ~OGLStreamBuffer();      /* @@ -33,19 +34,20 @@ public:          return gl_buffer.handle;      } -    GLsizeiptr Size() const { +    u64 Address() const { +        return gpu_address; +    } + +    GLsizeiptr Size() const noexcept {          return buffer_size;      }  private:      OGLBuffer gl_buffer; -    bool coherent = false; -    bool persistent = false; - +    GLuint64EXT gpu_address = 0;      GLintptr buffer_pos = 0;      GLsizeiptr buffer_size = 0; -    GLintptr mapped_offset = 0;      GLsizeiptr mapped_size = 0;      u8* mapped_ptr = nullptr;  }; diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h index 35e329240..fe9bd4b5a 100644 --- a/src/video_core/renderer_opengl/maxwell_to_gl.h +++ b/src/video_core/renderer_opengl/maxwell_to_gl.h @@ -24,10 +24,11 @@ namespace MaxwellToGL {  using Maxwell = Tegra::Engines::Maxwell3D::Regs; -inline GLenum VertexType(Maxwell::VertexAttribute attrib) { +inline GLenum VertexFormat(Maxwell::VertexAttribute attrib) {      switch (attrib.type) { -    case Maxwell::VertexAttribute::Type::UnsignedInt:      case Maxwell::VertexAttribute::Type::UnsignedNorm: +    case Maxwell::VertexAttribute::Type::UnsignedScaled: +    case Maxwell::VertexAttribute::Type::UnsignedInt:          switch (attrib.size) {          case Maxwell::VertexAttribute::Size::Size_8:          case Maxwell::VertexAttribute::Size::Size_8_8: @@ -48,8 +49,9 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {              return GL_UNSIGNED_INT_2_10_10_10_REV;          }          break; -    case Maxwell::VertexAttribute::Type::SignedInt:      case Maxwell::VertexAttribute::Type::SignedNorm: +    case Maxwell::VertexAttribute::Type::SignedScaled: +    case Maxwell::VertexAttribute::Type::SignedInt:          switch (attrib.size) {          case Maxwell::VertexAttribute::Size::Size_8:          case Maxwell::VertexAttribute::Size::Size_8_8: @@ -84,36 +86,8 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {              return GL_FLOAT;          }          break; -    case Maxwell::VertexAttribute::Type::UnsignedScaled: -        switch (attrib.size) { -        case Maxwell::VertexAttribute::Size::Size_8: -        case Maxwell::VertexAttribute::Size::Size_8_8: -        case Maxwell::VertexAttribute::Size::Size_8_8_8: -        case Maxwell::VertexAttribute::Size::Size_8_8_8_8: -            return GL_UNSIGNED_BYTE; -        case Maxwell::VertexAttribute::Size::Size_16: -        case Maxwell::VertexAttribute::Size::Size_16_16: -        case Maxwell::VertexAttribute::Size::Size_16_16_16: -        case Maxwell::VertexAttribute::Size::Size_16_16_16_16: -            return GL_UNSIGNED_SHORT; -        } -        break; -    case Maxwell::VertexAttribute::Type::SignedScaled: -        switch (attrib.size) { -        case Maxwell::VertexAttribute::Size::Size_8: -        case Maxwell::VertexAttribute::Size::Size_8_8: -        case Maxwell::VertexAttribute::Size::Size_8_8_8: -        case Maxwell::VertexAttribute::Size::Size_8_8_8_8: -            return GL_BYTE; -        case Maxwell::VertexAttribute::Size::Size_16: -        case Maxwell::VertexAttribute::Size::Size_16_16: -        case Maxwell::VertexAttribute::Size::Size_16_16_16: -        case Maxwell::VertexAttribute::Size::Size_16_16_16_16: -            return GL_SHORT; -        } -        break;      } -    UNIMPLEMENTED_MSG("Unimplemented vertex type={} and size={}", attrib.TypeString(), +    UNIMPLEMENTED_MSG("Unimplemented vertex format of type={} and size={}", attrib.TypeString(),                        attrib.SizeString());      return {};  } @@ -217,6 +191,12 @@ inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) {          } else {              return GL_MIRROR_CLAMP_TO_EDGE;          } +    case Tegra::Texture::WrapMode::MirrorOnceClampOGL: +        if (GL_EXT_texture_mirror_clamp) { +            return GL_MIRROR_CLAMP_EXT; +        } else { +            return GL_MIRROR_CLAMP_TO_EDGE; +        }      }      UNIMPLEMENTED_MSG("Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode));      return GL_REPEAT; diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index 6214fcbc3..c40adb6e7 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -488,6 +488,15 @@ void RendererOpenGL::InitOpenGLObjects() {      // Clear screen to black      LoadColorToActiveGLTexture(0, 0, 0, 0, screen_info.texture); + +    // Enable unified vertex attributes and query vertex buffer address when the driver supports it +    if (device.HasVertexBufferUnifiedMemory()) { +        glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV); + +        glMakeNamedBufferResidentNV(vertex_buffer.handle, GL_READ_ONLY); +        glGetNamedBufferParameterui64vNV(vertex_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, +                                         &vertex_buffer_address); +    }  }  void RendererOpenGL::AddTelemetryFields() { @@ -656,7 +665,13 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {                           offsetof(ScreenRectVertex, tex_coord));      glVertexAttribBinding(PositionLocation, 0);      glVertexAttribBinding(TexCoordLocation, 0); -    glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex)); +    if (device.HasVertexBufferUnifiedMemory()) { +        glBindVertexBuffer(0, 0, 0, sizeof(ScreenRectVertex)); +        glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, 0, vertex_buffer_address, +                               sizeof(vertices)); +    } else { +        glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex)); +    }      glBindTextureUnit(0, screen_info.display_texture);      glBindSampler(0, 0); diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h index 61bf507f4..8b18d32e6 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.h +++ b/src/video_core/renderer_opengl/renderer_opengl.h @@ -107,6 +107,9 @@ private:      OGLPipeline pipeline;      OGLFramebuffer screenshot_framebuffer; +    // GPU address of the vertex buffer +    GLuint64EXT vertex_buffer_address = 0; +      /// Display information for Switch screen      ScreenInfo screen_info; diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index 1f2b6734b..d7f1ae89f 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -294,6 +294,28 @@ VkPrimitiveTopology PrimitiveTopology([[maybe_unused]] const VKDevice& device,  VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttribute::Size size) {      switch (type) { +    case Maxwell::VertexAttribute::Type::UnsignedNorm: +        switch (size) { +        case Maxwell::VertexAttribute::Size::Size_8: +            return VK_FORMAT_R8_UNORM; +        case Maxwell::VertexAttribute::Size::Size_8_8: +            return VK_FORMAT_R8G8_UNORM; +        case Maxwell::VertexAttribute::Size::Size_8_8_8: +            return VK_FORMAT_R8G8B8_UNORM; +        case Maxwell::VertexAttribute::Size::Size_8_8_8_8: +            return VK_FORMAT_R8G8B8A8_UNORM; +        case Maxwell::VertexAttribute::Size::Size_16: +            return VK_FORMAT_R16_UNORM; +        case Maxwell::VertexAttribute::Size::Size_16_16: +            return VK_FORMAT_R16G16_UNORM; +        case Maxwell::VertexAttribute::Size::Size_16_16_16: +            return VK_FORMAT_R16G16B16_UNORM; +        case Maxwell::VertexAttribute::Size::Size_16_16_16_16: +            return VK_FORMAT_R16G16B16A16_UNORM; +        case Maxwell::VertexAttribute::Size::Size_10_10_10_2: +            return VK_FORMAT_A2B10G10R10_UNORM_PACK32; +        } +        break;      case Maxwell::VertexAttribute::Type::SignedNorm:          switch (size) {          case Maxwell::VertexAttribute::Size::Size_8: @@ -314,62 +336,50 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib              return VK_FORMAT_R16G16B16A16_SNORM;          case Maxwell::VertexAttribute::Size::Size_10_10_10_2:              return VK_FORMAT_A2B10G10R10_SNORM_PACK32; -        default: -            break;          }          break; -    case Maxwell::VertexAttribute::Type::UnsignedNorm: +    case Maxwell::VertexAttribute::Type::UnsignedScaled:          switch (size) {          case Maxwell::VertexAttribute::Size::Size_8: -            return VK_FORMAT_R8_UNORM; +            return VK_FORMAT_R8_USCALED;          case Maxwell::VertexAttribute::Size::Size_8_8: -            return VK_FORMAT_R8G8_UNORM; +            return VK_FORMAT_R8G8_USCALED;          case Maxwell::VertexAttribute::Size::Size_8_8_8: -            return VK_FORMAT_R8G8B8_UNORM; +            return VK_FORMAT_R8G8B8_USCALED;          case Maxwell::VertexAttribute::Size::Size_8_8_8_8: -            return VK_FORMAT_R8G8B8A8_UNORM; +            return VK_FORMAT_R8G8B8A8_USCALED;          case Maxwell::VertexAttribute::Size::Size_16: -            return VK_FORMAT_R16_UNORM; +            return VK_FORMAT_R16_USCALED;          case Maxwell::VertexAttribute::Size::Size_16_16: -            return VK_FORMAT_R16G16_UNORM; +            return VK_FORMAT_R16G16_USCALED;          case Maxwell::VertexAttribute::Size::Size_16_16_16: -            return VK_FORMAT_R16G16B16_UNORM; +            return VK_FORMAT_R16G16B16_USCALED;          case Maxwell::VertexAttribute::Size::Size_16_16_16_16: -            return VK_FORMAT_R16G16B16A16_UNORM; +            return VK_FORMAT_R16G16B16A16_USCALED;          case Maxwell::VertexAttribute::Size::Size_10_10_10_2: -            return VK_FORMAT_A2B10G10R10_UNORM_PACK32; -        default: -            break; +            return VK_FORMAT_A2B10G10R10_USCALED_PACK32;          }          break; -    case Maxwell::VertexAttribute::Type::SignedInt: +    case Maxwell::VertexAttribute::Type::SignedScaled:          switch (size) {          case Maxwell::VertexAttribute::Size::Size_8: -            return VK_FORMAT_R8_SINT; +            return VK_FORMAT_R8_SSCALED;          case Maxwell::VertexAttribute::Size::Size_8_8: -            return VK_FORMAT_R8G8_SINT; +            return VK_FORMAT_R8G8_SSCALED;          case Maxwell::VertexAttribute::Size::Size_8_8_8: -            return VK_FORMAT_R8G8B8_SINT; +            return VK_FORMAT_R8G8B8_SSCALED;          case Maxwell::VertexAttribute::Size::Size_8_8_8_8: -            return VK_FORMAT_R8G8B8A8_SINT; +            return VK_FORMAT_R8G8B8A8_SSCALED;          case Maxwell::VertexAttribute::Size::Size_16: -            return VK_FORMAT_R16_SINT; +            return VK_FORMAT_R16_SSCALED;          case Maxwell::VertexAttribute::Size::Size_16_16: -            return VK_FORMAT_R16G16_SINT; +            return VK_FORMAT_R16G16_SSCALED;          case Maxwell::VertexAttribute::Size::Size_16_16_16: -            return VK_FORMAT_R16G16B16_SINT; +            return VK_FORMAT_R16G16B16_SSCALED;          case Maxwell::VertexAttribute::Size::Size_16_16_16_16: -            return VK_FORMAT_R16G16B16A16_SINT; -        case Maxwell::VertexAttribute::Size::Size_32: -            return VK_FORMAT_R32_SINT; -        case Maxwell::VertexAttribute::Size::Size_32_32: -            return VK_FORMAT_R32G32_SINT; -        case Maxwell::VertexAttribute::Size::Size_32_32_32: -            return VK_FORMAT_R32G32B32_SINT; -        case Maxwell::VertexAttribute::Size::Size_32_32_32_32: -            return VK_FORMAT_R32G32B32A32_SINT; -        default: -            break; +            return VK_FORMAT_R16G16B16A16_SSCALED; +        case Maxwell::VertexAttribute::Size::Size_10_10_10_2: +            return VK_FORMAT_A2B10G10R10_SSCALED_PACK32;          }          break;      case Maxwell::VertexAttribute::Type::UnsignedInt: @@ -398,56 +408,50 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib              return VK_FORMAT_R32G32B32_UINT;          case Maxwell::VertexAttribute::Size::Size_32_32_32_32:              return VK_FORMAT_R32G32B32A32_UINT; -        default: -            break; +        case Maxwell::VertexAttribute::Size::Size_10_10_10_2: +            return VK_FORMAT_A2B10G10R10_UINT_PACK32;          }          break; -    case Maxwell::VertexAttribute::Type::UnsignedScaled: +    case Maxwell::VertexAttribute::Type::SignedInt:          switch (size) {          case Maxwell::VertexAttribute::Size::Size_8: -            return VK_FORMAT_R8_USCALED; +            return VK_FORMAT_R8_SINT;          case Maxwell::VertexAttribute::Size::Size_8_8: -            return VK_FORMAT_R8G8_USCALED; +            return VK_FORMAT_R8G8_SINT;          case Maxwell::VertexAttribute::Size::Size_8_8_8: -            return VK_FORMAT_R8G8B8_USCALED; +            return VK_FORMAT_R8G8B8_SINT;          case Maxwell::VertexAttribute::Size::Size_8_8_8_8: -            return VK_FORMAT_R8G8B8A8_USCALED; +            return VK_FORMAT_R8G8B8A8_SINT;          case Maxwell::VertexAttribute::Size::Size_16: -            return VK_FORMAT_R16_USCALED; +            return VK_FORMAT_R16_SINT;          case Maxwell::VertexAttribute::Size::Size_16_16: -            return VK_FORMAT_R16G16_USCALED; +            return VK_FORMAT_R16G16_SINT;          case Maxwell::VertexAttribute::Size::Size_16_16_16: -            return VK_FORMAT_R16G16B16_USCALED; +            return VK_FORMAT_R16G16B16_SINT;          case Maxwell::VertexAttribute::Size::Size_16_16_16_16: -            return VK_FORMAT_R16G16B16A16_USCALED; -        default: -            break; +            return VK_FORMAT_R16G16B16A16_SINT; +        case Maxwell::VertexAttribute::Size::Size_32: +            return VK_FORMAT_R32_SINT; +        case Maxwell::VertexAttribute::Size::Size_32_32: +            return VK_FORMAT_R32G32_SINT; +        case Maxwell::VertexAttribute::Size::Size_32_32_32: +            return VK_FORMAT_R32G32B32_SINT; +        case Maxwell::VertexAttribute::Size::Size_32_32_32_32: +            return VK_FORMAT_R32G32B32A32_SINT; +        case Maxwell::VertexAttribute::Size::Size_10_10_10_2: +            return VK_FORMAT_A2B10G10R10_SINT_PACK32;          }          break; -    case Maxwell::VertexAttribute::Type::SignedScaled: +    case Maxwell::VertexAttribute::Type::Float:          switch (size) { -        case Maxwell::VertexAttribute::Size::Size_8: -            return VK_FORMAT_R8_SSCALED; -        case Maxwell::VertexAttribute::Size::Size_8_8: -            return VK_FORMAT_R8G8_SSCALED; -        case Maxwell::VertexAttribute::Size::Size_8_8_8: -            return VK_FORMAT_R8G8B8_SSCALED; -        case Maxwell::VertexAttribute::Size::Size_8_8_8_8: -            return VK_FORMAT_R8G8B8A8_SSCALED;          case Maxwell::VertexAttribute::Size::Size_16: -            return VK_FORMAT_R16_SSCALED; +            return VK_FORMAT_R16_SFLOAT;          case Maxwell::VertexAttribute::Size::Size_16_16: -            return VK_FORMAT_R16G16_SSCALED; +            return VK_FORMAT_R16G16_SFLOAT;          case Maxwell::VertexAttribute::Size::Size_16_16_16: -            return VK_FORMAT_R16G16B16_SSCALED; +            return VK_FORMAT_R16G16B16_SFLOAT;          case Maxwell::VertexAttribute::Size::Size_16_16_16_16: -            return VK_FORMAT_R16G16B16A16_SSCALED; -        default: -            break; -        } -        break; -    case Maxwell::VertexAttribute::Type::Float: -        switch (size) { +            return VK_FORMAT_R16G16B16A16_SFLOAT;          case Maxwell::VertexAttribute::Size::Size_32:              return VK_FORMAT_R32_SFLOAT;          case Maxwell::VertexAttribute::Size::Size_32_32: @@ -456,16 +460,6 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib              return VK_FORMAT_R32G32B32_SFLOAT;          case Maxwell::VertexAttribute::Size::Size_32_32_32_32:              return VK_FORMAT_R32G32B32A32_SFLOAT; -        case Maxwell::VertexAttribute::Size::Size_16: -            return VK_FORMAT_R16_SFLOAT; -        case Maxwell::VertexAttribute::Size::Size_16_16: -            return VK_FORMAT_R16G16_SFLOAT; -        case Maxwell::VertexAttribute::Size::Size_16_16_16: -            return VK_FORMAT_R16G16B16_SFLOAT; -        case Maxwell::VertexAttribute::Size::Size_16_16_16_16: -            return VK_FORMAT_R16G16B16A16_SFLOAT; -        default: -            break;          }          break;      } diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index cd9673d1f..2d9b18ed9 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -155,11 +155,31 @@ vk::Instance CreateInstance(Common::DynamicLibrary& library, vk::InstanceDispatc          }      } -    static constexpr std::array layers_data{"VK_LAYER_LUNARG_standard_validation"}; -    vk::Span<const char*> layers = layers_data; -    if (!enable_layers) { -        layers = {}; +    std::vector<const char*> layers; +    layers.reserve(1); +    if (enable_layers) { +        layers.push_back("VK_LAYER_KHRONOS_validation"); +    } + +    const std::optional layer_properties = vk::EnumerateInstanceLayerProperties(dld); +    if (!layer_properties) { +        LOG_ERROR(Render_Vulkan, "Failed to query layer properties, disabling layers"); +        layers.clear(); +    } + +    for (auto layer_it = layers.begin(); layer_it != layers.end();) { +        const char* const layer = *layer_it; +        const auto it = std::find_if( +            layer_properties->begin(), layer_properties->end(), +            [layer](const VkLayerProperties& prop) { return !std::strcmp(layer, prop.layerName); }); +        if (it == layer_properties->end()) { +            LOG_ERROR(Render_Vulkan, "Layer {} not available, removing it", layer); +            layer_it = layers.erase(layer_it); +        } else { +            ++layer_it; +        }      } +      vk::Instance instance = vk::Instance::Create(layers, extensions, dld);      if (!instance) {          LOG_ERROR(Render_Vulkan, "Failed to create Vulkan instance"); diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index 1fde38328..2be38d419 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -37,9 +37,9 @@ std::unique_ptr<VKStreamBuffer> CreateStreamBuffer(const VKDevice& device, VKSch  } // Anonymous namespace -Buffer::Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VAddr cpu_addr, -               std::size_t size) -    : VideoCommon::BufferBlock{cpu_addr, size} { +Buffer::Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VKScheduler& scheduler_, +               VKStagingBufferPool& staging_pool_, VAddr cpu_addr, std::size_t size) +    : VideoCommon::BufferBlock{cpu_addr, size}, scheduler{scheduler_}, staging_pool{staging_pool_} {      VkBufferCreateInfo ci;      ci.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;      ci.pNext = nullptr; @@ -56,40 +56,15 @@ Buffer::Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VAddr cp  Buffer::~Buffer() = default; -VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system, -                             const VKDevice& device, VKMemoryManager& memory_manager, -                             VKScheduler& scheduler, VKStagingBufferPool& staging_pool) -    : VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer>{rasterizer, system, -                                                                 CreateStreamBuffer(device, -                                                                                    scheduler)}, -      device{device}, memory_manager{memory_manager}, scheduler{scheduler}, staging_pool{ -                                                                                staging_pool} {} - -VKBufferCache::~VKBufferCache() = default; - -std::shared_ptr<Buffer> VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { -    return std::make_shared<Buffer>(device, memory_manager, cpu_addr, size); -} - -VkBuffer VKBufferCache::GetEmptyBuffer(std::size_t size) { -    size = std::max(size, std::size_t(4)); -    const auto& empty = staging_pool.GetUnusedBuffer(size, false); -    scheduler.RequestOutsideRenderPassOperationContext(); -    scheduler.Record([size, buffer = *empty.handle](vk::CommandBuffer cmdbuf) { -        cmdbuf.FillBuffer(buffer, 0, size, 0); -    }); -    return *empty.handle; -} - -void VKBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, -                                    const u8* data) { +void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) {      const auto& staging = staging_pool.GetUnusedBuffer(size, true);      std::memcpy(staging.commit->Map(size), data, size);      scheduler.RequestOutsideRenderPassOperationContext(); -    scheduler.Record([staging = *staging.handle, buffer = buffer.Handle(), offset, -                      size](vk::CommandBuffer cmdbuf) { -        cmdbuf.CopyBuffer(staging, buffer, VkBufferCopy{0, offset, size}); + +    const VkBuffer handle = Handle(); +    scheduler.Record([staging = *staging.handle, handle, offset, size](vk::CommandBuffer cmdbuf) { +        cmdbuf.CopyBuffer(staging, handle, VkBufferCopy{0, offset, size});          VkBufferMemoryBarrier barrier;          barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; @@ -98,7 +73,7 @@ void VKBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, st          barrier.dstAccessMask = UPLOAD_ACCESS_BARRIERS;          barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;          barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; -        barrier.buffer = buffer; +        barrier.buffer = handle;          barrier.offset = offset;          barrier.size = size;          cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, UPLOAD_PIPELINE_STAGE, 0, {}, @@ -106,12 +81,12 @@ void VKBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, st      });  } -void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, -                                      u8* data) { +void Buffer::Download(std::size_t offset, std::size_t size, u8* data) {      const auto& staging = staging_pool.GetUnusedBuffer(size, true);      scheduler.RequestOutsideRenderPassOperationContext(); -    scheduler.Record([staging = *staging.handle, buffer = buffer.Handle(), offset, -                      size](vk::CommandBuffer cmdbuf) { + +    const VkBuffer handle = Handle(); +    scheduler.Record([staging = *staging.handle, handle, offset, size](vk::CommandBuffer cmdbuf) {          VkBufferMemoryBarrier barrier;          barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;          barrier.pNext = nullptr; @@ -119,7 +94,7 @@ void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset,          barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;          barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;          barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; -        barrier.buffer = buffer; +        barrier.buffer = handle;          barrier.offset = offset;          barrier.size = size; @@ -127,17 +102,19 @@ void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset,                                     VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |                                     VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,                                 VK_PIPELINE_STAGE_TRANSFER_BIT, 0, {}, barrier, {}); -        cmdbuf.CopyBuffer(buffer, staging, VkBufferCopy{offset, 0, size}); +        cmdbuf.CopyBuffer(handle, staging, VkBufferCopy{offset, 0, size});      });      scheduler.Finish();      std::memcpy(data, staging.commit->Map(size), size);  } -void VKBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset, -                              std::size_t dst_offset, std::size_t size) { +void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, +                      std::size_t size) {      scheduler.RequestOutsideRenderPassOperationContext(); -    scheduler.Record([src_buffer = src.Handle(), dst_buffer = dst.Handle(), src_offset, dst_offset, + +    const VkBuffer dst_buffer = Handle(); +    scheduler.Record([src_buffer = src.Handle(), dst_buffer, src_offset, dst_offset,                        size](vk::CommandBuffer cmdbuf) {          cmdbuf.CopyBuffer(src_buffer, dst_buffer, VkBufferCopy{src_offset, dst_offset, size}); @@ -165,4 +142,30 @@ void VKBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t      });  } +VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system, +                             const VKDevice& device, VKMemoryManager& memory_manager, +                             VKScheduler& scheduler, VKStagingBufferPool& staging_pool) +    : VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer>{rasterizer, system, +                                                                 CreateStreamBuffer(device, +                                                                                    scheduler)}, +      device{device}, memory_manager{memory_manager}, scheduler{scheduler}, staging_pool{ +                                                                                staging_pool} {} + +VKBufferCache::~VKBufferCache() = default; + +std::shared_ptr<Buffer> VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { +    return std::make_shared<Buffer>(device, memory_manager, scheduler, staging_pool, cpu_addr, +                                    size); +} + +VKBufferCache::BufferInfo VKBufferCache::GetEmptyBuffer(std::size_t size) { +    size = std::max(size, std::size_t(4)); +    const auto& empty = staging_pool.GetUnusedBuffer(size, false); +    scheduler.RequestOutsideRenderPassOperationContext(); +    scheduler.Record([size, buffer = *empty.handle](vk::CommandBuffer cmdbuf) { +        cmdbuf.FillBuffer(buffer, 0, size, 0); +    }); +    return {*empty.handle, 0, 0}; +} +  } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h index 9ebbef835..991ee451c 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.h +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h @@ -25,15 +25,29 @@ class VKScheduler;  class Buffer final : public VideoCommon::BufferBlock {  public: -    explicit Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VAddr cpu_addr, -                    std::size_t size); +    explicit Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VKScheduler& scheduler, +                    VKStagingBufferPool& staging_pool, VAddr cpu_addr, std::size_t size);      ~Buffer(); +    void Upload(std::size_t offset, std::size_t size, const u8* data); + +    void Download(std::size_t offset, std::size_t size, u8* data); + +    void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, +                  std::size_t size); +      VkBuffer Handle() const {          return *buffer.handle;      } +    u64 Address() const { +        return 0; +    } +  private: +    VKScheduler& scheduler; +    VKStagingBufferPool& staging_pool; +      VKBuffer buffer;  }; @@ -44,20 +58,11 @@ public:                             VKScheduler& scheduler, VKStagingBufferPool& staging_pool);      ~VKBufferCache(); -    VkBuffer GetEmptyBuffer(std::size_t size) override; +    BufferInfo GetEmptyBuffer(std::size_t size) override;  protected:      std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override; -    void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, -                         const u8* data) override; - -    void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, -                           u8* data) override; - -    void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset, -                   std::size_t dst_offset, std::size_t size) override; -  private:      const VKDevice& device;      VKMemoryManager& memory_manager; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index a77fa35c3..a8d94eac3 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -143,6 +143,49 @@ Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry      }  } +/// @brief Determine if an attachment to be updated has to preserve contents +/// @param is_clear True when a clear is being executed +/// @param regs 3D registers +/// @return True when the contents have to be preserved +bool HasToPreserveColorContents(bool is_clear, const Maxwell& regs) { +    if (!is_clear) { +        return true; +    } +    // First we have to make sure all clear masks are enabled. +    if (!regs.clear_buffers.R || !regs.clear_buffers.G || !regs.clear_buffers.B || +        !regs.clear_buffers.A) { +        return true; +    } +    // If scissors are disabled, the whole screen is cleared +    if (!regs.clear_flags.scissor) { +        return false; +    } +    // Then we have to confirm scissor testing clears the whole image +    const std::size_t index = regs.clear_buffers.RT; +    const auto& scissor = regs.scissor_test[0]; +    return scissor.min_x > 0 || scissor.min_y > 0 || scissor.max_x < regs.rt[index].width || +           scissor.max_y < regs.rt[index].height; +} + +/// @brief Determine if an attachment to be updated has to preserve contents +/// @param is_clear True when a clear is being executed +/// @param regs 3D registers +/// @return True when the contents have to be preserved +bool HasToPreserveDepthContents(bool is_clear, const Maxwell& regs) { +    // If we are not clearing, the contents have to be preserved +    if (!is_clear) { +        return true; +    } +    // For depth stencil clears we only have to confirm scissor test covers the whole image +    if (!regs.clear_flags.scissor) { +        return false; +    } +    // Make sure the clear cover the whole image +    const auto& scissor = regs.scissor_test[0]; +    return scissor.min_x > 0 || scissor.min_y > 0 || scissor.max_x < regs.zeta_width || +           scissor.max_y < regs.zeta_height; +} +  } // Anonymous namespace  class BufferBindings final { @@ -344,7 +387,7 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) {      buffer_cache.Unmap(); -    const Texceptions texceptions = UpdateAttachments(); +    const Texceptions texceptions = UpdateAttachments(false);      SetupImageTransitions(texceptions, color_attachments, zeta_attachment);      key.renderpass_params = GetRenderPassParams(texceptions); @@ -400,7 +443,7 @@ void RasterizerVulkan::Clear() {          return;      } -    [[maybe_unused]] const auto texceptions = UpdateAttachments(); +    [[maybe_unused]] const auto texceptions = UpdateAttachments(true);      DEBUG_ASSERT(texceptions.none());      SetupImageTransitions(0, color_attachments, zeta_attachment); @@ -677,9 +720,12 @@ void RasterizerVulkan::FlushWork() {      draw_counter = 0;  } -RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() { +RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments(bool is_clear) {      MICROPROFILE_SCOPE(Vulkan_RenderTargets); -    auto& dirty = system.GPU().Maxwell3D().dirty.flags; +    auto& maxwell3d = system.GPU().Maxwell3D(); +    auto& dirty = maxwell3d.dirty.flags; +    auto& regs = maxwell3d.regs; +      const bool update_rendertargets = dirty[VideoCommon::Dirty::RenderTargets];      dirty[VideoCommon::Dirty::RenderTargets] = false; @@ -688,7 +734,8 @@ RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() {      Texceptions texceptions;      for (std::size_t rt = 0; rt < Maxwell::NumRenderTargets; ++rt) {          if (update_rendertargets) { -            color_attachments[rt] = texture_cache.GetColorBufferSurface(rt, true); +            const bool preserve_contents = HasToPreserveColorContents(is_clear, regs); +            color_attachments[rt] = texture_cache.GetColorBufferSurface(rt, preserve_contents);          }          if (color_attachments[rt] && WalkAttachmentOverlaps(*color_attachments[rt])) {              texceptions[rt] = true; @@ -696,7 +743,8 @@ RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() {      }      if (update_rendertargets) { -        zeta_attachment = texture_cache.GetDepthBufferSurface(true); +        const bool preserve_contents = HasToPreserveDepthContents(is_clear, regs); +        zeta_attachment = texture_cache.GetDepthBufferSurface(preserve_contents);      }      if (zeta_attachment && WalkAttachmentOverlaps(*zeta_attachment)) {          texceptions[ZETA_TEXCEPTION_INDEX] = true; @@ -870,10 +918,10 @@ void RasterizerVulkan::BeginTransformFeedback() {      UNIMPLEMENTED_IF(binding.buffer_offset != 0);      const GPUVAddr gpu_addr = binding.Address(); -    const auto size = static_cast<VkDeviceSize>(binding.buffer_size); -    const auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true); +    const VkDeviceSize size = static_cast<VkDeviceSize>(binding.buffer_size); +    const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true); -    scheduler.Record([buffer = buffer, offset = offset, size](vk::CommandBuffer cmdbuf) { +    scheduler.Record([buffer = info.handle, offset = info.offset, size](vk::CommandBuffer cmdbuf) {          cmdbuf.BindTransformFeedbackBuffersEXT(0, 1, &buffer, &offset, &size);          cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr);      }); @@ -925,8 +973,8 @@ void RasterizerVulkan::SetupVertexArrays(FixedPipelineState::VertexInput& vertex              buffer_bindings.AddVertexBinding(DefaultBuffer(), 0);              continue;          } -        const auto [buffer, offset] = buffer_cache.UploadMemory(start, size); -        buffer_bindings.AddVertexBinding(buffer, offset); +        const auto info = buffer_cache.UploadMemory(start, size); +        buffer_bindings.AddVertexBinding(info.handle, info.offset);      }  } @@ -948,7 +996,9 @@ void RasterizerVulkan::SetupIndexBuffer(BufferBindings& buffer_bindings, DrawPar              break;          }          const GPUVAddr gpu_addr = regs.index_array.IndexStart(); -        auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize()); +        const auto info = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize()); +        VkBuffer buffer = info.handle; +        u64 offset = info.offset;          std::tie(buffer, offset) = quad_indexed_pass.Assemble(              regs.index_array.format, params.num_vertices, params.base_vertex, buffer, offset); @@ -962,7 +1012,9 @@ void RasterizerVulkan::SetupIndexBuffer(BufferBindings& buffer_bindings, DrawPar              break;          }          const GPUVAddr gpu_addr = regs.index_array.IndexStart(); -        auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize()); +        const auto info = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize()); +        VkBuffer buffer = info.handle; +        u64 offset = info.offset;          auto format = regs.index_array.format;          const bool is_uint8 = format == Maxwell::IndexFormat::UnsignedByte; @@ -1109,10 +1161,9 @@ void RasterizerVulkan::SetupConstBuffer(const ConstBufferEntry& entry,          Common::AlignUp(CalculateConstBufferSize(entry, buffer), 4 * sizeof(float));      ASSERT(size <= MaxConstbufferSize); -    const auto [buffer_handle, offset] = +    const auto info =          buffer_cache.UploadMemory(buffer.address, size, device.GetUniformBufferAlignment()); - -    update_descriptor_queue.AddBuffer(buffer_handle, offset, size); +    update_descriptor_queue.AddBuffer(info.handle, info.offset, size);  }  void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address) { @@ -1126,14 +1177,14 @@ void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAdd          // Note: Do *not* use DefaultBuffer() here, storage buffers can be written breaking the          // default buffer.          static constexpr std::size_t dummy_size = 4; -        const auto buffer = buffer_cache.GetEmptyBuffer(dummy_size); -        update_descriptor_queue.AddBuffer(buffer, 0, dummy_size); +        const auto info = buffer_cache.GetEmptyBuffer(dummy_size); +        update_descriptor_queue.AddBuffer(info.handle, info.offset, dummy_size);          return;      } -    const auto [buffer, offset] = buffer_cache.UploadMemory( +    const auto info = buffer_cache.UploadMemory(          actual_addr, size, device.GetStorageBufferAlignment(), entry.IsWritten()); -    update_descriptor_queue.AddBuffer(buffer, offset, size); +    update_descriptor_queue.AddBuffer(info.handle, info.offset, size);  }  void RasterizerVulkan::SetupUniformTexels(const Tegra::Texture::TICEntry& tic, @@ -1154,7 +1205,7 @@ void RasterizerVulkan::SetupTexture(const Tegra::Texture::FullTextureInfo& textu      const auto sampler = sampler_cache.GetSampler(texture.tsc);      update_descriptor_queue.AddSampledImage(sampler, image_view); -    const auto image_layout = update_descriptor_queue.GetLastImageLayout(); +    VkImageLayout* const image_layout = update_descriptor_queue.LastImageLayout();      *image_layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;      sampled_views.push_back(ImageView{std::move(view), image_layout});  } @@ -1180,7 +1231,7 @@ void RasterizerVulkan::SetupImage(const Tegra::Texture::TICEntry& tic, const Ima          view->GetImageView(tic.x_source, tic.y_source, tic.z_source, tic.w_source);      update_descriptor_queue.AddImage(image_view); -    const auto image_layout = update_descriptor_queue.GetLastImageLayout(); +    VkImageLayout* const image_layout = update_descriptor_queue.LastImageLayout();      *image_layout = VK_IMAGE_LAYOUT_GENERAL;      image_views.push_back(ImageView{std::move(view), image_layout});  } diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index c8c187606..83e00e7e9 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -159,7 +159,10 @@ private:      void FlushWork(); -    Texceptions UpdateAttachments(); +    /// @brief Updates the currently bound attachments +    /// @param is_clear True when the framebuffer is updated as a clear +    /// @return Bitfield of attachments being used as sampled textures +    Texceptions UpdateAttachments(bool is_clear);      std::tuple<VkFramebuffer, VkExtent2D> ConfigureFramebuffers(VkRenderPass renderpass); diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index 82ec9180e..56524e6f3 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -9,6 +9,7 @@  #include <utility>  #include "common/microprofile.h" +#include "common/thread.h"  #include "video_core/renderer_vulkan/vk_device.h"  #include "video_core/renderer_vulkan/vk_query_cache.h"  #include "video_core/renderer_vulkan/vk_resource_manager.h" @@ -133,6 +134,7 @@ void VKScheduler::BindGraphicsPipeline(VkPipeline pipeline) {  }  void VKScheduler::WorkerThread() { +    Common::SetCurrentThreadPriority(Common::ThreadPriority::High);      std::unique_lock lock{mutex};      do {          cv.wait(lock, [this] { return !chunk_queue.Empty() || quit; }); diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.h b/src/video_core/renderer_vulkan/vk_stream_buffer.h index c765c60a0..689f0d276 100644 --- a/src/video_core/renderer_vulkan/vk_stream_buffer.h +++ b/src/video_core/renderer_vulkan/vk_stream_buffer.h @@ -35,10 +35,14 @@ public:      /// Ensures that "size" bytes of memory are available to the GPU, potentially recording a copy.      void Unmap(u64 size); -    VkBuffer Handle() const { +    VkBuffer Handle() const noexcept {          return *buffer;      } +    u64 Address() const noexcept { +        return 0; +    } +  private:      struct Watch final {          VKFenceWatch fence; diff --git a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp index 681ecde98..351c048d2 100644 --- a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp +++ b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp @@ -24,35 +24,25 @@ void VKUpdateDescriptorQueue::TickFrame() {  }  void VKUpdateDescriptorQueue::Acquire() { -    entries.clear(); -} +    // Minimum number of entries required. +    // This is the maximum number of entries a single draw call migth use. +    static constexpr std::size_t MIN_ENTRIES = 0x400; -void VKUpdateDescriptorQueue::Send(VkDescriptorUpdateTemplateKHR update_template, -                                   VkDescriptorSet set) { -    if (payload.size() + entries.size() >= payload.max_size()) { +    if (payload.size() + MIN_ENTRIES >= payload.max_size()) {          LOG_WARNING(Render_Vulkan, "Payload overflow, waiting for worker thread");          scheduler.WaitWorker();          payload.clear();      } +    upload_start = &*payload.end(); +} -    // TODO(Rodrigo): Rework to write the payload directly -    const auto payload_start = payload.data() + payload.size(); -    for (const auto& entry : entries) { -        if (const auto image = std::get_if<VkDescriptorImageInfo>(&entry)) { -            payload.push_back(*image); -        } else if (const auto buffer = std::get_if<VkDescriptorBufferInfo>(&entry)) { -            payload.push_back(*buffer); -        } else if (const auto texel = std::get_if<VkBufferView>(&entry)) { -            payload.push_back(*texel); -        } else { -            UNREACHABLE(); -        } -    } - -    scheduler.Record( -        [payload_start, set, update_template, logical = &device.GetLogical()](vk::CommandBuffer) { -            logical->UpdateDescriptorSet(set, update_template, payload_start); -        }); +void VKUpdateDescriptorQueue::Send(VkDescriptorUpdateTemplateKHR update_template, +                                   VkDescriptorSet set) { +    const void* const data = upload_start; +    const vk::Device* const logical = &device.GetLogical(); +    scheduler.Record([data, logical, set, update_template](vk::CommandBuffer) { +        logical->UpdateDescriptorSet(set, update_template, data); +    });  }  } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_update_descriptor.h b/src/video_core/renderer_vulkan/vk_update_descriptor.h index cc7e3dff4..945320c72 100644 --- a/src/video_core/renderer_vulkan/vk_update_descriptor.h +++ b/src/video_core/renderer_vulkan/vk_update_descriptor.h @@ -15,17 +15,13 @@ namespace Vulkan {  class VKDevice;  class VKScheduler; -class DescriptorUpdateEntry { -public: -    explicit DescriptorUpdateEntry() {} - -    DescriptorUpdateEntry(VkDescriptorImageInfo image) : image{image} {} +struct DescriptorUpdateEntry { +    DescriptorUpdateEntry(VkDescriptorImageInfo image_) : image{image_} {} -    DescriptorUpdateEntry(VkDescriptorBufferInfo buffer) : buffer{buffer} {} +    DescriptorUpdateEntry(VkDescriptorBufferInfo buffer_) : buffer{buffer_} {} -    DescriptorUpdateEntry(VkBufferView texel_buffer) : texel_buffer{texel_buffer} {} +    DescriptorUpdateEntry(VkBufferView texel_buffer_) : texel_buffer{texel_buffer_} {} -private:      union {          VkDescriptorImageInfo image;          VkDescriptorBufferInfo buffer; @@ -45,32 +41,34 @@ public:      void Send(VkDescriptorUpdateTemplateKHR update_template, VkDescriptorSet set);      void AddSampledImage(VkSampler sampler, VkImageView image_view) { -        entries.emplace_back(VkDescriptorImageInfo{sampler, image_view, {}}); +        payload.emplace_back(VkDescriptorImageInfo{sampler, image_view, {}});      }      void AddImage(VkImageView image_view) { -        entries.emplace_back(VkDescriptorImageInfo{{}, image_view, {}}); +        payload.emplace_back(VkDescriptorImageInfo{{}, image_view, {}});      }      void AddBuffer(VkBuffer buffer, u64 offset, std::size_t size) { -        entries.emplace_back(VkDescriptorBufferInfo{buffer, offset, size}); +        payload.emplace_back(VkDescriptorBufferInfo{buffer, offset, size});      }      void AddTexelBuffer(VkBufferView texel_buffer) { -        entries.emplace_back(texel_buffer); +        payload.emplace_back(texel_buffer);      } -    VkImageLayout* GetLastImageLayout() { -        return &std::get<VkDescriptorImageInfo>(entries.back()).imageLayout; +    VkImageLayout* LastImageLayout() { +        return &payload.back().image.imageLayout;      } -private: -    using Variant = std::variant<VkDescriptorImageInfo, VkDescriptorBufferInfo, VkBufferView>; +    const VkImageLayout* LastImageLayout() const { +        return &payload.back().image.imageLayout; +    } +private:      const VKDevice& device;      VKScheduler& scheduler; -    boost::container::static_vector<Variant, 0x400> entries; +    const DescriptorUpdateEntry* upload_start = nullptr;      boost::container::static_vector<DescriptorUpdateEntry, 0x10000> payload;  }; diff --git a/src/video_core/renderer_vulkan/wrapper.cpp b/src/video_core/renderer_vulkan/wrapper.cpp index 2ce9b0626..0d485a662 100644 --- a/src/video_core/renderer_vulkan/wrapper.cpp +++ b/src/video_core/renderer_vulkan/wrapper.cpp @@ -153,7 +153,8 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept {  bool Load(InstanceDispatch& dld) noexcept {  #define X(name) Proc(dld.name, dld, #name) -    return X(vkCreateInstance) && X(vkEnumerateInstanceExtensionProperties); +    return X(vkCreateInstance) && X(vkEnumerateInstanceExtensionProperties) && +           X(vkEnumerateInstanceLayerProperties);  #undef X  } @@ -725,8 +726,7 @@ bool PhysicalDevice::GetSurfaceSupportKHR(u32 queue_family_index, VkSurfaceKHR s      return supported == VK_TRUE;  } -VkSurfaceCapabilitiesKHR PhysicalDevice::GetSurfaceCapabilitiesKHR(VkSurfaceKHR surface) const -    noexcept { +VkSurfaceCapabilitiesKHR PhysicalDevice::GetSurfaceCapabilitiesKHR(VkSurfaceKHR surface) const {      VkSurfaceCapabilitiesKHR capabilities;      Check(dld->vkGetPhysicalDeviceSurfaceCapabilitiesKHR(physical_device, surface, &capabilities));      return capabilities; @@ -771,4 +771,17 @@ std::optional<std::vector<VkExtensionProperties>> EnumerateInstanceExtensionProp      return properties;  } +std::optional<std::vector<VkLayerProperties>> EnumerateInstanceLayerProperties( +    const InstanceDispatch& dld) { +    u32 num; +    if (dld.vkEnumerateInstanceLayerProperties(&num, nullptr) != VK_SUCCESS) { +        return std::nullopt; +    } +    std::vector<VkLayerProperties> properties(num); +    if (dld.vkEnumerateInstanceLayerProperties(&num, properties.data()) != VK_SUCCESS) { +        return std::nullopt; +    } +    return properties; +} +  } // namespace Vulkan::vk diff --git a/src/video_core/renderer_vulkan/wrapper.h b/src/video_core/renderer_vulkan/wrapper.h index 98937a77a..d56fdb3f9 100644 --- a/src/video_core/renderer_vulkan/wrapper.h +++ b/src/video_core/renderer_vulkan/wrapper.h @@ -141,6 +141,7 @@ struct InstanceDispatch {      PFN_vkCreateInstance vkCreateInstance;      PFN_vkDestroyInstance vkDestroyInstance;      PFN_vkEnumerateInstanceExtensionProperties vkEnumerateInstanceExtensionProperties; +    PFN_vkEnumerateInstanceLayerProperties vkEnumerateInstanceLayerProperties;      PFN_vkCreateDebugUtilsMessengerEXT vkCreateDebugUtilsMessengerEXT;      PFN_vkCreateDevice vkCreateDevice; @@ -779,7 +780,7 @@ public:      bool GetSurfaceSupportKHR(u32 queue_family_index, VkSurfaceKHR) const; -    VkSurfaceCapabilitiesKHR GetSurfaceCapabilitiesKHR(VkSurfaceKHR) const noexcept; +    VkSurfaceCapabilitiesKHR GetSurfaceCapabilitiesKHR(VkSurfaceKHR) const;      std::vector<VkSurfaceFormatKHR> GetSurfaceFormatsKHR(VkSurfaceKHR) const; @@ -996,4 +997,7 @@ private:  std::optional<std::vector<VkExtensionProperties>> EnumerateInstanceExtensionProperties(      const InstanceDispatch& dld); +std::optional<std::vector<VkLayerProperties>> EnumerateInstanceLayerProperties( +    const InstanceDispatch& dld); +  } // namespace Vulkan::vk diff --git a/src/video_core/shader/decode/half_set.cpp b/src/video_core/shader/decode/half_set.cpp index 848e46874..b2e88fa20 100644 --- a/src/video_core/shader/decode/half_set.cpp +++ b/src/video_core/shader/decode/half_set.cpp @@ -13,55 +13,101 @@  namespace VideoCommon::Shader { +using std::move;  using Tegra::Shader::Instruction;  using Tegra::Shader::OpCode; +using Tegra::Shader::PredCondition;  u32 ShaderIR::DecodeHalfSet(NodeBlock& bb, u32 pc) {      const Instruction instr = {program_code[pc]};      const auto opcode = OpCode::Decode(instr); -    if (instr.hset2.ftz == 0) { -        LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName()); +    PredCondition cond; +    bool bf; +    bool ftz; +    bool neg_a; +    bool abs_a; +    bool neg_b; +    bool abs_b; +    switch (opcode->get().GetId()) { +    case OpCode::Id::HSET2_C: +    case OpCode::Id::HSET2_IMM: +        cond = instr.hsetp2.cbuf_and_imm.cond; +        bf = instr.Bit(53); +        ftz = instr.Bit(54); +        neg_a = instr.Bit(43); +        abs_a = instr.Bit(44); +        neg_b = instr.Bit(56); +        abs_b = instr.Bit(54); +        break; +    case OpCode::Id::HSET2_R: +        cond = instr.hsetp2.reg.cond; +        bf = instr.Bit(49); +        ftz = instr.Bit(50); +        neg_a = instr.Bit(43); +        abs_a = instr.Bit(44); +        neg_b = instr.Bit(31); +        abs_b = instr.Bit(30); +        break; +    default: +        UNREACHABLE();      } -    Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hset2.type_a); -    op_a = GetOperandAbsNegHalf(op_a, instr.hset2.abs_a, instr.hset2.negate_a); - -    Node op_b = [&]() { +    Node op_b = [this, instr, opcode] {          switch (opcode->get().GetId()) { +        case OpCode::Id::HSET2_C: +            // Inform as unimplemented as this is not tested. +            UNIMPLEMENTED_MSG("HSET2_C is not implemented"); +            return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset());          case OpCode::Id::HSET2_R:              return GetRegister(instr.gpr20); +        case OpCode::Id::HSET2_IMM: +            return UnpackHalfImmediate(instr, true);          default:              UNREACHABLE(); -            return Immediate(0); +            return Node{};          }      }(); -    op_b = UnpackHalfFloat(op_b, instr.hset2.type_b); -    op_b = GetOperandAbsNegHalf(op_b, instr.hset2.abs_b, instr.hset2.negate_b); -    const Node second_pred = GetPredicate(instr.hset2.pred39, instr.hset2.neg_pred); +    if (!ftz) { +        LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName()); +    } + +    Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hset2.type_a); +    op_a = GetOperandAbsNegHalf(op_a, abs_a, neg_a); + +    switch (opcode->get().GetId()) { +    case OpCode::Id::HSET2_R: +        op_b = GetOperandAbsNegHalf(move(op_b), abs_b, neg_b); +        [[fallthrough]]; +    case OpCode::Id::HSET2_C: +        op_b = UnpackHalfFloat(move(op_b), instr.hset2.type_b); +        break; +    default: +        break; +    } -    const Node comparison_pair = GetPredicateComparisonHalf(instr.hset2.cond, op_a, op_b); +    Node second_pred = GetPredicate(instr.hset2.pred39, instr.hset2.neg_pred); + +    Node comparison_pair = GetPredicateComparisonHalf(cond, op_a, op_b);      const OperationCode combiner = GetPredicateCombiner(instr.hset2.op);      // HSET2 operates on each half float in the pack.      std::array<Node, 2> values;      for (u32 i = 0; i < 2; ++i) { -        const u32 raw_value = instr.hset2.bf ? 0x3c00 : 0xffff; -        const Node true_value = Immediate(raw_value << (i * 16)); -        const Node false_value = Immediate(0); - -        const Node comparison = -            Operation(OperationCode::LogicalPick2, comparison_pair, Immediate(i)); -        const Node predicate = Operation(combiner, comparison, second_pred); +        const u32 raw_value = bf ? 0x3c00 : 0xffff; +        Node true_value = Immediate(raw_value << (i * 16)); +        Node false_value = Immediate(0); +        Node comparison = Operation(OperationCode::LogicalPick2, comparison_pair, Immediate(i)); +        Node predicate = Operation(combiner, comparison, second_pred);          values[i] = -            Operation(OperationCode::Select, NO_PRECISE, predicate, true_value, false_value); +            Operation(OperationCode::Select, predicate, move(true_value), move(false_value));      } -    const Node value = Operation(OperationCode::UBitwiseOr, NO_PRECISE, values[0], values[1]); -    SetRegister(bb, instr.gpr0, value); +    Node value = Operation(OperationCode::UBitwiseOr, values[0], values[1]); +    SetRegister(bb, instr.gpr0, move(value));      return pc;  } diff --git a/src/video_core/shader/decode/image.cpp b/src/video_core/shader/decode/image.cpp index 60b6ad72a..07778dc3e 100644 --- a/src/video_core/shader/decode/image.cpp +++ b/src/video_core/shader/decode/image.cpp @@ -97,6 +97,7 @@ ComponentType GetComponentType(Tegra::Engines::SamplerDescriptor descriptor,          break;      case TextureFormat::B5G6R5:      case TextureFormat::B6G5R5: +    case TextureFormat::BF10GF11RF11:          if (component == 0) {              return descriptor.b_type;          } @@ -119,7 +120,7 @@ ComponentType GetComponentType(Tegra::Engines::SamplerDescriptor descriptor,          }          break;      } -    UNIMPLEMENTED_MSG("texture format not implement={}", format); +    UNIMPLEMENTED_MSG("Texture format not implemented={}", format);      return ComponentType::FLOAT;  } @@ -191,6 +192,14 @@ u32 GetComponentSize(TextureFormat format, std::size_t component) {              return 6;          }          return 0; +    case TextureFormat::BF10GF11RF11: +        if (component == 1 || component == 2) { +            return 11; +        } +        if (component == 0) { +            return 10; +        } +        return 0;      case TextureFormat::G8R24:          if (component == 0) {              return 8; @@ -211,10 +220,9 @@ u32 GetComponentSize(TextureFormat format, std::size_t component) {          return (component == 0 || component == 1) ? 8 : 0;      case TextureFormat::G4R4:          return (component == 0 || component == 1) ? 4 : 0; -    default: -        UNIMPLEMENTED_MSG("texture format not implement={}", format); -        return 0;      } +    UNIMPLEMENTED_MSG("Texture format not implemented={}", format); +    return 0;  }  std::size_t GetImageComponentMask(TextureFormat format) { @@ -235,6 +243,7 @@ std::size_t GetImageComponentMask(TextureFormat format) {      case TextureFormat::R32_B24G8:      case TextureFormat::B5G6R5:      case TextureFormat::B6G5R5: +    case TextureFormat::BF10GF11RF11:          return std::size_t{R | G | B};      case TextureFormat::R32_G32:      case TextureFormat::R16_G16: @@ -248,10 +257,9 @@ std::size_t GetImageComponentMask(TextureFormat format) {      case TextureFormat::R8:      case TextureFormat::R1:          return std::size_t{R}; -    default: -        UNIMPLEMENTED_MSG("texture format not implement={}", format); -        return std::size_t{R | G | B | A};      } +    UNIMPLEMENTED_MSG("Texture format not implemented={}", format); +    return std::size_t{R | G | B | A};  }  std::size_t GetImageTypeNumCoordinates(Tegra::Shader::ImageType image_type) { @@ -299,7 +307,7 @@ std::pair<Node, bool> ShaderIR::GetComponentValue(ComponentType component_type,              return {std::move(original_value), true};          }      default: -        UNIMPLEMENTED_MSG("Unimplement component type={}", component_type); +        UNIMPLEMENTED_MSG("Unimplemented component type={}", component_type);          return {std::move(original_value), true};      }  } @@ -459,7 +467,7 @@ u32 ShaderIR::DecodeImage(NodeBlock& bb, u32 pc) {              default:                  break;              } -            UNIMPLEMENTED_MSG("Unimplemented operation={} type={}", +            UNIMPLEMENTED_MSG("Unimplemented operation={}, type={}",                                static_cast<u64>(instr.suatom_d.operation.Value()),                                static_cast<u64>(instr.suatom_d.operation_type.Value()));              return OperationCode::AtomicImageAdd; diff --git a/src/video_core/texture_cache/surface_base.cpp b/src/video_core/texture_cache/surface_base.cpp index 94d3a6ae5..0caf3b4f0 100644 --- a/src/video_core/texture_cache/surface_base.cpp +++ b/src/video_core/texture_cache/surface_base.cpp @@ -120,6 +120,9 @@ std::optional<std::pair<u32, u32>> SurfaceBaseImpl::GetLayerMipmap(      }      const auto relative_address{static_cast<GPUVAddr>(candidate_gpu_addr - gpu_addr)};      const auto layer{static_cast<u32>(relative_address / layer_size)}; +    if (layer >= params.depth) { +        return {}; +    }      const GPUVAddr mipmap_address = relative_address - layer_size * layer;      const auto mipmap_it =          Common::BinaryFind(mipmap_offsets.begin(), mipmap_offsets.end(), mipmap_address); diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index b543fc8c0..6207d8dfe 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -24,6 +24,7 @@  #include "core/core.h"  #include "core/memory.h"  #include "core/settings.h" +#include "video_core/compatible_formats.h"  #include "video_core/dirty_flags.h"  #include "video_core/engines/fermi_2d.h"  #include "video_core/engines/maxwell_3d.h" @@ -47,8 +48,8 @@ class RasterizerInterface;  namespace VideoCommon { +using VideoCore::Surface::FormatCompatibility;  using VideoCore::Surface::PixelFormat; -  using VideoCore::Surface::SurfaceTarget;  using RenderTargetConfig = Tegra::Engines::Maxwell3D::Regs::RenderTargetConfig; @@ -595,7 +596,7 @@ private:          } else {              new_surface = GetUncachedSurface(gpu_addr, params);          } -        const auto& final_params = new_surface->GetSurfaceParams(); +        const SurfaceParams& final_params = new_surface->GetSurfaceParams();          if (cr_params.type != final_params.type) {              if (Settings::IsGPULevelExtreme()) {                  BufferCopy(current_surface, new_surface); @@ -603,7 +604,7 @@ private:          } else {              std::vector<CopyParams> bricks = current_surface->BreakDown(final_params);              for (auto& brick : bricks) { -                ImageCopy(current_surface, new_surface, brick); +                TryCopyImage(current_surface, new_surface, brick);              }          }          Unregister(current_surface); @@ -694,7 +695,7 @@ private:                  }                  const CopyParams copy_params(0, 0, 0, 0, 0, base_layer, 0, mipmap, width, height,                                               src_params.depth); -                ImageCopy(surface, new_surface, copy_params); +                TryCopyImage(surface, new_surface, copy_params);              }          }          if (passed_tests == 0) { @@ -791,7 +792,7 @@ private:              const u32 width = params.width;              const u32 height = params.height;              const CopyParams copy_params(0, 0, 0, 0, 0, slice, 0, 0, width, height, 1); -            ImageCopy(surface, new_surface, copy_params); +            TryCopyImage(surface, new_surface, copy_params);          }          for (const auto& surface : overlaps) {              Unregister(surface); @@ -1053,7 +1054,7 @@ private:      void DeduceBestBlit(SurfaceParams& src_params, SurfaceParams& dst_params,                          const GPUVAddr src_gpu_addr, const GPUVAddr dst_gpu_addr) {          auto deduced_src = DeduceSurface(src_gpu_addr, src_params); -        auto deduced_dst = DeduceSurface(src_gpu_addr, src_params); +        auto deduced_dst = DeduceSurface(dst_gpu_addr, dst_params);          if (deduced_src.Failed() || deduced_dst.Failed()) {              return;          } @@ -1192,6 +1193,19 @@ private:          return {};      } +    /// Try to do an image copy logging when formats are incompatible. +    void TryCopyImage(TSurface& src, TSurface& dst, const CopyParams& copy) { +        const SurfaceParams& src_params = src->GetSurfaceParams(); +        const SurfaceParams& dst_params = dst->GetSurfaceParams(); +        if (!format_compatibility.TestCopy(src_params.pixel_format, dst_params.pixel_format)) { +            LOG_ERROR(HW_GPU, "Illegal copy between formats={{{}, {}}}", +                      static_cast<int>(dst_params.pixel_format), +                      static_cast<int>(src_params.pixel_format)); +            return; +        } +        ImageCopy(src, dst, copy); +    } +      constexpr PixelFormat GetSiblingFormat(PixelFormat format) const {          return siblings_table[static_cast<std::size_t>(format)];      } @@ -1241,6 +1255,7 @@ private:      VideoCore::RasterizerInterface& rasterizer;      FormatLookupTable format_lookup_table; +    FormatCompatibility format_compatibility;      u64 ticks{}; | 
