diff options
| author | bunnei <bunneidev@gmail.com> | 2019-03-16 00:05:24 -0400 | 
|---|---|---|
| committer | GitHub <noreply@github.com> | 2019-03-16 00:05:24 -0400 | 
| commit | 47b622825ca0ff664044abdf2d64a141452a8d1c (patch) | |
| tree | ff18141caee2b1a460e6d5e22283e78c073880b0 | |
| parent | 06ac6460d31036dddf7e4ae12355391035cc30ca (diff) | |
| parent | 2eaf6c41a4686028c0abc84d1be6fd48a67cf49f (diff) | |
Merge pull request #2237 from bunnei/cache-host-addr
gpu: Use host address for caching instead of guest address.
26 files changed, 394 insertions, 294 deletions
| diff --git a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp index a34b9e753..b031ebc66 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp @@ -10,6 +10,7 @@  #include "core/core.h"  #include "core/hle/service/nvdrv/devices/nvhost_as_gpu.h"  #include "core/hle/service/nvdrv/devices/nvmap.h" +#include "core/memory.h"  #include "video_core/memory_manager.h"  #include "video_core/rasterizer_interface.h"  #include "video_core/renderer_base.h" @@ -178,7 +179,7 @@ u32 nvhost_as_gpu::UnmapBuffer(const std::vector<u8>& input, std::vector<u8>& ou      auto& gpu = system_instance.GPU();      auto cpu_addr = gpu.MemoryManager().GpuToCpuAddress(params.offset);      ASSERT(cpu_addr); -    gpu.FlushAndInvalidateRegion(*cpu_addr, itr->second.size); +    gpu.FlushAndInvalidateRegion(ToCacheAddr(Memory::GetPointer(*cpu_addr)), itr->second.size);      params.offset = gpu.MemoryManager().UnmapBuffer(params.offset, itr->second.size); diff --git a/src/core/memory.cpp b/src/core/memory.cpp index 6591c45d2..4fde53033 100644 --- a/src/core/memory.cpp +++ b/src/core/memory.cpp @@ -67,8 +67,11 @@ static void MapPages(PageTable& page_table, VAddr base, u64 size, u8* memory, Pa      LOG_DEBUG(HW_Memory, "Mapping {} onto {:016X}-{:016X}", fmt::ptr(memory), base * PAGE_SIZE,                (base + size) * PAGE_SIZE); -    RasterizerFlushVirtualRegion(base << PAGE_BITS, size * PAGE_SIZE, -                                 FlushMode::FlushAndInvalidate); +    // During boot, current_page_table might not be set yet, in which case we need not flush +    if (current_page_table) { +        RasterizerFlushVirtualRegion(base << PAGE_BITS, size * PAGE_SIZE, +                                     FlushMode::FlushAndInvalidate); +    }      VAddr end = base + size;      ASSERT_MSG(end <= page_table.pointers.size(), "out of range mapping at {:016X}", @@ -359,13 +362,13 @@ void RasterizerFlushVirtualRegion(VAddr start, u64 size, FlushMode mode) {          auto& gpu = system_instance.GPU();          switch (mode) {          case FlushMode::Flush: -            gpu.FlushRegion(overlap_start, overlap_size); +            gpu.FlushRegion(ToCacheAddr(GetPointer(overlap_start)), overlap_size);              break;          case FlushMode::Invalidate: -            gpu.InvalidateRegion(overlap_start, overlap_size); +            gpu.InvalidateRegion(ToCacheAddr(GetPointer(overlap_start)), overlap_size);              break;          case FlushMode::FlushAndInvalidate: -            gpu.FlushAndInvalidateRegion(overlap_start, overlap_size); +            gpu.FlushAndInvalidateRegion(ToCacheAddr(GetPointer(overlap_start)), overlap_size);              break;          }      }; diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp index aae2a4019..daefa43a6 100644 --- a/src/video_core/engines/kepler_memory.cpp +++ b/src/video_core/engines/kepler_memory.cpp @@ -9,6 +9,7 @@  #include "video_core/engines/kepler_memory.h"  #include "video_core/engines/maxwell_3d.h"  #include "video_core/rasterizer_interface.h" +#include "video_core/renderer_base.h"  namespace Tegra::Engines { @@ -48,7 +49,8 @@ void KeplerMemory::ProcessData(u32 data) {      // We have to invalidate the destination region to evict any outdated surfaces from the cache.      // We do this before actually writing the new data because the destination address might contain      // a dirty surface that will have to be written back to memory. -    Core::System::GetInstance().GPU().InvalidateRegion(*dest_address, sizeof(u32)); +    system.Renderer().Rasterizer().InvalidateRegion(ToCacheAddr(Memory::GetPointer(*dest_address)), +                                                    sizeof(u32));      Memory::Write32(*dest_address, data);      system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite(); diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 144e7fa82..49979694e 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -396,7 +396,10 @@ void Maxwell3D::ProcessCBData(u32 value) {      const auto address = memory_manager.GpuToCpuAddress(buffer_address + regs.const_buffer.cb_pos);      ASSERT_MSG(address, "Invalid GPU address"); -    Memory::Write32(*address, value); +    u8* ptr{Memory::GetPointer(*address)}; +    rasterizer.InvalidateRegion(ToCacheAddr(ptr), sizeof(u32)); +    std::memcpy(ptr, &value, sizeof(u32)); +      dirty_flags.OnMemoryWrite();      // Increment the current buffer position. diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index 9dfea5999..415a6319a 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -9,6 +9,7 @@  #include "video_core/engines/maxwell_3d.h"  #include "video_core/engines/maxwell_dma.h"  #include "video_core/rasterizer_interface.h" +#include "video_core/renderer_base.h"  #include "video_core/textures/decoders.h"  namespace Tegra::Engines { @@ -92,12 +93,14 @@ void MaxwellDMA::HandleCopy() {      const auto FlushAndInvalidate = [&](u32 src_size, u64 dst_size) {          // TODO(Subv): For now, manually flush the regions until we implement GPU-accelerated          // copying. -        Core::System::GetInstance().GPU().FlushRegion(*source_cpu, src_size); +        Core::System::GetInstance().Renderer().Rasterizer().FlushRegion( +            ToCacheAddr(Memory::GetPointer(*source_cpu)), src_size);          // We have to invalidate the destination region to evict any outdated surfaces from the          // cache. We do this before actually writing the new data because the destination address          // might contain a dirty surface that will have to be written back to memory. -        Core::System::GetInstance().GPU().InvalidateRegion(*dest_cpu, dst_size); +        Core::System::GetInstance().Renderer().Rasterizer().InvalidateRegion( +            ToCacheAddr(Memory::GetPointer(*dest_cpu)), dst_size);      };      if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) { diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index 56a203275..a14b95c30 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -11,6 +11,11 @@  #include "video_core/dma_pusher.h"  #include "video_core/memory_manager.h" +using CacheAddr = std::uintptr_t; +inline CacheAddr ToCacheAddr(const void* host_ptr) { +    return reinterpret_cast<CacheAddr>(host_ptr); +} +  namespace Core {  class System;  } @@ -209,13 +214,13 @@ public:          std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) = 0;      /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory -    virtual void FlushRegion(VAddr addr, u64 size) = 0; +    virtual void FlushRegion(CacheAddr addr, u64 size) = 0;      /// Notify rasterizer that any caches of the specified region should be invalidated -    virtual void InvalidateRegion(VAddr addr, u64 size) = 0; +    virtual void InvalidateRegion(CacheAddr addr, u64 size) = 0;      /// Notify rasterizer that any caches of the specified region should be flushed and invalidated -    virtual void FlushAndInvalidateRegion(VAddr addr, u64 size) = 0; +    virtual void FlushAndInvalidateRegion(CacheAddr addr, u64 size) = 0;  private:      void ProcessBindMethod(const MethodCall& method_call); diff --git a/src/video_core/gpu_asynch.cpp b/src/video_core/gpu_asynch.cpp index ad0a747e3..8b355cf7b 100644 --- a/src/video_core/gpu_asynch.cpp +++ b/src/video_core/gpu_asynch.cpp @@ -22,15 +22,15 @@ void GPUAsynch::SwapBuffers(      gpu_thread.SwapBuffers(std::move(framebuffer));  } -void GPUAsynch::FlushRegion(VAddr addr, u64 size) { +void GPUAsynch::FlushRegion(CacheAddr addr, u64 size) {      gpu_thread.FlushRegion(addr, size);  } -void GPUAsynch::InvalidateRegion(VAddr addr, u64 size) { +void GPUAsynch::InvalidateRegion(CacheAddr addr, u64 size) {      gpu_thread.InvalidateRegion(addr, size);  } -void GPUAsynch::FlushAndInvalidateRegion(VAddr addr, u64 size) { +void GPUAsynch::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {      gpu_thread.FlushAndInvalidateRegion(addr, size);  } diff --git a/src/video_core/gpu_asynch.h b/src/video_core/gpu_asynch.h index e6a807aba..1dcc61a6c 100644 --- a/src/video_core/gpu_asynch.h +++ b/src/video_core/gpu_asynch.h @@ -26,9 +26,9 @@ public:      void PushGPUEntries(Tegra::CommandList&& entries) override;      void SwapBuffers(          std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) override; -    void FlushRegion(VAddr addr, u64 size) override; -    void InvalidateRegion(VAddr addr, u64 size) override; -    void FlushAndInvalidateRegion(VAddr addr, u64 size) override; +    void FlushRegion(CacheAddr addr, u64 size) override; +    void InvalidateRegion(CacheAddr addr, u64 size) override; +    void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;  private:      GPUThread::ThreadManager gpu_thread; diff --git a/src/video_core/gpu_synch.cpp b/src/video_core/gpu_synch.cpp index 4c00b96c7..2cfc900ed 100644 --- a/src/video_core/gpu_synch.cpp +++ b/src/video_core/gpu_synch.cpp @@ -22,15 +22,15 @@ void GPUSynch::SwapBuffers(      renderer.SwapBuffers(std::move(framebuffer));  } -void GPUSynch::FlushRegion(VAddr addr, u64 size) { +void GPUSynch::FlushRegion(CacheAddr addr, u64 size) {      renderer.Rasterizer().FlushRegion(addr, size);  } -void GPUSynch::InvalidateRegion(VAddr addr, u64 size) { +void GPUSynch::InvalidateRegion(CacheAddr addr, u64 size) {      renderer.Rasterizer().InvalidateRegion(addr, size);  } -void GPUSynch::FlushAndInvalidateRegion(VAddr addr, u64 size) { +void GPUSynch::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {      renderer.Rasterizer().FlushAndInvalidateRegion(addr, size);  } diff --git a/src/video_core/gpu_synch.h b/src/video_core/gpu_synch.h index 7d5a241ff..766b5631c 100644 --- a/src/video_core/gpu_synch.h +++ b/src/video_core/gpu_synch.h @@ -21,9 +21,9 @@ public:      void PushGPUEntries(Tegra::CommandList&& entries) override;      void SwapBuffers(          std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) override; -    void FlushRegion(VAddr addr, u64 size) override; -    void InvalidateRegion(VAddr addr, u64 size) override; -    void FlushAndInvalidateRegion(VAddr addr, u64 size) override; +    void FlushRegion(CacheAddr addr, u64 size) override; +    void InvalidateRegion(CacheAddr addr, u64 size) override; +    void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;  };  } // namespace VideoCommon diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp index c5bdd2a17..086b2f625 100644 --- a/src/video_core/gpu_thread.cpp +++ b/src/video_core/gpu_thread.cpp @@ -5,7 +5,6 @@  #include "common/assert.h"  #include "common/microprofile.h"  #include "core/frontend/scope_acquire_window_context.h" -#include "core/settings.h"  #include "video_core/dma_pusher.h"  #include "video_core/gpu.h"  #include "video_core/gpu_thread.h" @@ -13,38 +12,13 @@  namespace VideoCommon::GPUThread { -/// Executes a single GPU thread command -static void ExecuteCommand(CommandData* command, VideoCore::RendererBase& renderer, -                           Tegra::DmaPusher& dma_pusher) { -    if (const auto submit_list = std::get_if<SubmitListCommand>(command)) { -        dma_pusher.Push(std::move(submit_list->entries)); -        dma_pusher.DispatchCalls(); -    } else if (const auto data = std::get_if<SwapBuffersCommand>(command)) { -        renderer.SwapBuffers(data->framebuffer); -    } else if (const auto data = std::get_if<FlushRegionCommand>(command)) { -        renderer.Rasterizer().FlushRegion(data->addr, data->size); -    } else if (const auto data = std::get_if<InvalidateRegionCommand>(command)) { -        renderer.Rasterizer().InvalidateRegion(data->addr, data->size); -    } else if (const auto data = std::get_if<FlushAndInvalidateRegionCommand>(command)) { -        renderer.Rasterizer().FlushAndInvalidateRegion(data->addr, data->size); -    } else { -        UNREACHABLE(); -    } -} -  /// Runs the GPU thread  static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher,                        SynchState& state) { -      MicroProfileOnThreadCreate("GpuThread"); -    auto WaitForWakeup = [&]() { -        std::unique_lock<std::mutex> lock{state.signal_mutex}; -        state.signal_condition.wait(lock, [&] { return !state.is_idle || !state.is_running; }); -    }; -      // Wait for first GPU command before acquiring the window context -    WaitForWakeup(); +    state.WaitForCommands();      // If emulation was stopped during disk shader loading, abort before trying to acquire context      if (!state.is_running) { @@ -53,100 +27,72 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p      Core::Frontend::ScopeAcquireWindowContext acquire_context{renderer.GetRenderWindow()}; +    CommandDataContainer next;      while (state.is_running) { -        if (!state.is_running) { -            return; -        } - -        { -            // Thread has been woken up, so make the previous write queue the next read queue -            std::lock_guard<std::mutex> lock{state.signal_mutex}; -            std::swap(state.push_queue, state.pop_queue); -        } - -        // Execute all of the GPU commands -        while (!state.pop_queue->empty()) { -            ExecuteCommand(&state.pop_queue->front(), renderer, dma_pusher); -            state.pop_queue->pop(); +        state.WaitForCommands(); +        while (!state.queue.Empty()) { +            state.queue.Pop(next); +            if (const auto submit_list = std::get_if<SubmitListCommand>(&next.data)) { +                dma_pusher.Push(std::move(submit_list->entries)); +                dma_pusher.DispatchCalls(); +            } else if (const auto data = std::get_if<SwapBuffersCommand>(&next.data)) { +                state.DecrementFramesCounter(); +                renderer.SwapBuffers(std::move(data->framebuffer)); +            } else if (const auto data = std::get_if<FlushRegionCommand>(&next.data)) { +                renderer.Rasterizer().FlushRegion(data->addr, data->size); +            } else if (const auto data = std::get_if<InvalidateRegionCommand>(&next.data)) { +                renderer.Rasterizer().InvalidateRegion(data->addr, data->size); +            } else if (const auto data = std::get_if<EndProcessingCommand>(&next.data)) { +                return; +            } else { +                UNREACHABLE(); +            }          } - -        state.UpdateIdleState(); - -        // Signal that the GPU thread has finished processing commands -        if (state.is_idle) { -            state.idle_condition.notify_one(); -        } - -        // Wait for CPU thread to send more GPU commands -        WaitForWakeup();      }  }  ThreadManager::ThreadManager(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher)      : renderer{renderer}, dma_pusher{dma_pusher}, thread{RunThread, std::ref(renderer), -                                                         std::ref(dma_pusher), std::ref(state)}, -      thread_id{thread.get_id()} {} +                                                         std::ref(dma_pusher), std::ref(state)} {}  ThreadManager::~ThreadManager() { -    { -        // Notify GPU thread that a shutdown is pending -        std::lock_guard<std::mutex> lock{state.signal_mutex}; -        state.is_running = false; -    } - -    state.signal_condition.notify_one(); +    // Notify GPU thread that a shutdown is pending +    PushCommand(EndProcessingCommand());      thread.join();  }  void ThreadManager::SubmitList(Tegra::CommandList&& entries) { -    if (entries.empty()) { -        return; -    } - -    PushCommand(SubmitListCommand(std::move(entries)), false, false); +    PushCommand(SubmitListCommand(std::move(entries)));  }  void ThreadManager::SwapBuffers(      std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) { -    PushCommand(SwapBuffersCommand(std::move(framebuffer)), true, false); +    state.IncrementFramesCounter(); +    PushCommand(SwapBuffersCommand(std::move(framebuffer))); +    state.WaitForFrames();  } -void ThreadManager::FlushRegion(VAddr addr, u64 size) { -    // Block the CPU when using accurate emulation -    PushCommand(FlushRegionCommand(addr, size), Settings::values.use_accurate_gpu_emulation, false); +void ThreadManager::FlushRegion(CacheAddr addr, u64 size) { +    PushCommand(FlushRegionCommand(addr, size));  } -void ThreadManager::InvalidateRegion(VAddr addr, u64 size) { -    PushCommand(InvalidateRegionCommand(addr, size), true, true); +void ThreadManager::InvalidateRegion(CacheAddr addr, u64 size) { +    if (state.queue.Empty()) { +        // It's quicker to invalidate a single region on the CPU if the queue is already empty +        renderer.Rasterizer().InvalidateRegion(addr, size); +    } else { +        PushCommand(InvalidateRegionCommand(addr, size)); +    }  } -void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) { +void ThreadManager::FlushAndInvalidateRegion(CacheAddr addr, u64 size) { +    // Skip flush on asynch mode, as FlushAndInvalidateRegion is not used for anything too important      InvalidateRegion(addr, size);  } -void ThreadManager::PushCommand(CommandData&& command_data, bool wait_for_idle, bool allow_on_cpu) { -    { -        std::lock_guard<std::mutex> lock{state.signal_mutex}; - -        if ((allow_on_cpu && state.is_idle) || IsGpuThread()) { -            // Execute the command synchronously on the current thread -            ExecuteCommand(&command_data, renderer, dma_pusher); -            return; -        } - -        // Push the command to the GPU thread -        state.UpdateIdleState(); -        state.push_queue->emplace(command_data); -    } - -    // Signal the GPU thread that commands are pending -    state.signal_condition.notify_one(); - -    if (wait_for_idle) { -        // Wait for the GPU to be idle (all commands to be executed) -        std::unique_lock<std::mutex> lock{state.idle_mutex}; -        state.idle_condition.wait(lock, [this] { return static_cast<bool>(state.is_idle); }); -    } +void ThreadManager::PushCommand(CommandData&& command_data) { +    state.queue.Push(CommandDataContainer(std::move(command_data))); +    state.SignalCommands();  }  } // namespace VideoCommon::GPUThread diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h index edb148b14..8cd7db1c6 100644 --- a/src/video_core/gpu_thread.h +++ b/src/video_core/gpu_thread.h @@ -13,6 +13,9 @@  #include <thread>  #include <variant> +#include "common/threadsafe_queue.h" +#include "video_core/gpu.h" +  namespace Tegra {  struct FramebufferConfig;  class DmaPusher; @@ -24,6 +27,9 @@ class RendererBase;  namespace VideoCommon::GPUThread { +/// Command to signal to the GPU thread that processing has ended +struct EndProcessingCommand final {}; +  /// Command to signal to the GPU thread that a command list is ready for processing  struct SubmitListCommand final {      explicit SubmitListCommand(Tegra::CommandList&& entries) : entries{std::move(entries)} {} @@ -36,59 +42,110 @@ struct SwapBuffersCommand final {      explicit SwapBuffersCommand(std::optional<const Tegra::FramebufferConfig> framebuffer)          : framebuffer{std::move(framebuffer)} {} -    std::optional<const Tegra::FramebufferConfig> framebuffer; +    std::optional<Tegra::FramebufferConfig> framebuffer;  };  /// Command to signal to the GPU thread to flush a region  struct FlushRegionCommand final { -    explicit constexpr FlushRegionCommand(VAddr addr, u64 size) : addr{addr}, size{size} {} +    explicit constexpr FlushRegionCommand(CacheAddr addr, u64 size) : addr{addr}, size{size} {} -    const VAddr addr; -    const u64 size; +    CacheAddr addr; +    u64 size;  };  /// Command to signal to the GPU thread to invalidate a region  struct InvalidateRegionCommand final { -    explicit constexpr InvalidateRegionCommand(VAddr addr, u64 size) : addr{addr}, size{size} {} +    explicit constexpr InvalidateRegionCommand(CacheAddr addr, u64 size) : addr{addr}, size{size} {} -    const VAddr addr; -    const u64 size; +    CacheAddr addr; +    u64 size;  };  /// Command to signal to the GPU thread to flush and invalidate a region  struct FlushAndInvalidateRegionCommand final { -    explicit constexpr FlushAndInvalidateRegionCommand(VAddr addr, u64 size) +    explicit constexpr FlushAndInvalidateRegionCommand(CacheAddr addr, u64 size)          : addr{addr}, size{size} {} -    const VAddr addr; -    const u64 size; +    CacheAddr addr; +    u64 size;  }; -using CommandData = std::variant<SubmitListCommand, SwapBuffersCommand, FlushRegionCommand, -                                 InvalidateRegionCommand, FlushAndInvalidateRegionCommand>; +using CommandData = +    std::variant<EndProcessingCommand, SubmitListCommand, SwapBuffersCommand, FlushRegionCommand, +                 InvalidateRegionCommand, FlushAndInvalidateRegionCommand>; + +struct CommandDataContainer { +    CommandDataContainer() = default; + +    CommandDataContainer(CommandData&& data) : data{std::move(data)} {} + +    CommandDataContainer& operator=(const CommandDataContainer& t) { +        data = std::move(t.data); +        return *this; +    } + +    CommandData data; +};  /// Struct used to synchronize the GPU thread  struct SynchState final { -    std::atomic<bool> is_running{true}; -    std::atomic<bool> is_idle{true}; -    std::condition_variable signal_condition; -    std::mutex signal_mutex; -    std::condition_variable idle_condition; -    std::mutex idle_mutex; - -    // We use two queues for sending commands to the GPU thread, one for writing (push_queue) to and -    // one for reading from (pop_queue). These are swapped whenever the current pop_queue becomes -    // empty. This allows for efficient thread-safe access, as it does not require any copies. - -    using CommandQueue = std::queue<CommandData>; -    std::array<CommandQueue, 2> command_queues; -    CommandQueue* push_queue{&command_queues[0]}; -    CommandQueue* pop_queue{&command_queues[1]}; - -    void UpdateIdleState() { -        std::lock_guard<std::mutex> lock{idle_mutex}; -        is_idle = command_queues[0].empty() && command_queues[1].empty(); +    std::atomic_bool is_running{true}; +    std::atomic_int queued_frame_count{}; +    std::mutex frames_mutex; +    std::mutex commands_mutex; +    std::condition_variable commands_condition; +    std::condition_variable frames_condition; + +    void IncrementFramesCounter() { +        std::lock_guard<std::mutex> lock{frames_mutex}; +        ++queued_frame_count; +    } + +    void DecrementFramesCounter() { +        { +            std::lock_guard<std::mutex> lock{frames_mutex}; +            --queued_frame_count; + +            if (queued_frame_count) { +                return; +            } +        } +        frames_condition.notify_one();      } + +    void WaitForFrames() { +        { +            std::lock_guard<std::mutex> lock{frames_mutex}; +            if (!queued_frame_count) { +                return; +            } +        } + +        // Wait for the GPU to be idle (all commands to be executed) +        { +            std::unique_lock<std::mutex> lock{frames_mutex}; +            frames_condition.wait(lock, [this] { return !queued_frame_count; }); +        } +    } + +    void SignalCommands() { +        { +            std::unique_lock<std::mutex> lock{commands_mutex}; +            if (queue.Empty()) { +                return; +            } +        } + +        commands_condition.notify_one(); +    } + +    void WaitForCommands() { +        std::unique_lock<std::mutex> lock{commands_mutex}; +        commands_condition.wait(lock, [this] { return !queue.Empty(); }); +    } + +    using CommandQueue = Common::SPSCQueue<CommandDataContainer>; +    CommandQueue queue;  };  /// Class used to manage the GPU thread @@ -105,22 +162,17 @@ public:          std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer);      /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory -    void FlushRegion(VAddr addr, u64 size); +    void FlushRegion(CacheAddr addr, u64 size);      /// Notify rasterizer that any caches of the specified region should be invalidated -    void InvalidateRegion(VAddr addr, u64 size); +    void InvalidateRegion(CacheAddr addr, u64 size);      /// Notify rasterizer that any caches of the specified region should be flushed and invalidated -    void FlushAndInvalidateRegion(VAddr addr, u64 size); +    void FlushAndInvalidateRegion(CacheAddr addr, u64 size);  private:      /// Pushes a command to be executed by the GPU thread -    void PushCommand(CommandData&& command_data, bool wait_for_idle, bool allow_on_cpu); - -    /// Returns true if this is called by the GPU thread -    bool IsGpuThread() const { -        return std::this_thread::get_id() == thread_id; -    } +    void PushCommand(CommandData&& command_data);  private:      SynchState state; diff --git a/src/video_core/rasterizer_cache.h b/src/video_core/rasterizer_cache.h index a7bcf26fb..ecd9986a0 100644 --- a/src/video_core/rasterizer_cache.h +++ b/src/video_core/rasterizer_cache.h @@ -4,6 +4,7 @@  #pragma once +#include <mutex>  #include <set>  #include <unordered_map> @@ -12,14 +13,26 @@  #include "common/common_types.h"  #include "core/settings.h" +#include "video_core/gpu.h"  #include "video_core/rasterizer_interface.h"  class RasterizerCacheObject {  public: +    explicit RasterizerCacheObject(const u8* host_ptr) +        : host_ptr{host_ptr}, cache_addr{ToCacheAddr(host_ptr)} {} +      virtual ~RasterizerCacheObject(); +    CacheAddr GetCacheAddr() const { +        return cache_addr; +    } + +    const u8* GetHostPtr() const { +        return host_ptr; +    } +      /// Gets the address of the shader in guest memory, required for cache management -    virtual VAddr GetAddr() const = 0; +    virtual VAddr GetCpuAddr() const = 0;      /// Gets the size of the shader in guest memory, required for cache management      virtual std::size_t GetSizeInBytes() const = 0; @@ -58,6 +71,8 @@ private:      bool is_registered{};      ///< Whether the object is currently registered with the cache      bool is_dirty{};           ///< Whether the object is dirty (out of sync with guest memory)      u64 last_modified_ticks{}; ///< When the object was last modified, used for in-order flushing +    CacheAddr cache_addr{};    ///< Cache address memory, unique from emulated virtual address space +    const u8* host_ptr{};      ///< Pointer to the memory backing this cached region  };  template <class T> @@ -68,7 +83,9 @@ public:      explicit RasterizerCache(VideoCore::RasterizerInterface& rasterizer) : rasterizer{rasterizer} {}      /// Write any cached resources overlapping the specified region back to memory -    void FlushRegion(Tegra::GPUVAddr addr, size_t size) { +    void FlushRegion(CacheAddr addr, std::size_t size) { +        std::lock_guard<std::recursive_mutex> lock{mutex}; +          const auto& objects{GetSortedObjectsFromRegion(addr, size)};          for (auto& object : objects) {              FlushObject(object); @@ -76,7 +93,9 @@ public:      }      /// Mark the specified region as being invalidated -    void InvalidateRegion(VAddr addr, u64 size) { +    void InvalidateRegion(CacheAddr addr, u64 size) { +        std::lock_guard<std::recursive_mutex> lock{mutex}; +          const auto& objects{GetSortedObjectsFromRegion(addr, size)};          for (auto& object : objects) {              if (!object->IsRegistered()) { @@ -89,48 +108,60 @@ public:      /// Invalidates everything in the cache      void InvalidateAll() { +        std::lock_guard<std::recursive_mutex> lock{mutex}; +          while (interval_cache.begin() != interval_cache.end()) {              Unregister(*interval_cache.begin()->second.begin());          }      }  protected: -    /// Tries to get an object from the cache with the specified address -    T TryGet(VAddr addr) const { +    /// Tries to get an object from the cache with the specified cache address +    T TryGet(CacheAddr addr) const {          const auto iter = map_cache.find(addr);          if (iter != map_cache.end())              return iter->second;          return nullptr;      } +    T TryGet(const void* addr) const { +        const auto iter = map_cache.find(ToCacheAddr(addr)); +        if (iter != map_cache.end()) +            return iter->second; +        return nullptr; +    } +      /// Register an object into the cache      void Register(const T& object) { +        std::lock_guard<std::recursive_mutex> lock{mutex}; +          object->SetIsRegistered(true);          interval_cache.add({GetInterval(object), ObjectSet{object}}); -        map_cache.insert({object->GetAddr(), object}); -        rasterizer.UpdatePagesCachedCount(object->GetAddr(), object->GetSizeInBytes(), 1); +        map_cache.insert({object->GetCacheAddr(), object}); +        rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), 1);      }      /// Unregisters an object from the cache      void Unregister(const T& object) { -        object->SetIsRegistered(false); -        rasterizer.UpdatePagesCachedCount(object->GetAddr(), object->GetSizeInBytes(), -1); -        // Only flush if use_accurate_gpu_emulation is enabled, as it incurs a performance hit -        if (Settings::values.use_accurate_gpu_emulation) { -            FlushObject(object); -        } +        std::lock_guard<std::recursive_mutex> lock{mutex}; +        object->SetIsRegistered(false); +        rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), -1);          interval_cache.subtract({GetInterval(object), ObjectSet{object}}); -        map_cache.erase(object->GetAddr()); +        map_cache.erase(object->GetCacheAddr());      }      /// Returns a ticks counter used for tracking when cached objects were last modified      u64 GetModifiedTicks() { +        std::lock_guard<std::recursive_mutex> lock{mutex}; +          return ++modified_ticks;      }      /// Flushes the specified object, updating appropriate cache state as needed      void FlushObject(const T& object) { +        std::lock_guard<std::recursive_mutex> lock{mutex}; +          if (!object->IsDirty()) {              return;          } @@ -140,7 +171,7 @@ protected:  private:      /// Returns a list of cached objects from the specified memory region, ordered by access time -    std::vector<T> GetSortedObjectsFromRegion(VAddr addr, u64 size) { +    std::vector<T> GetSortedObjectsFromRegion(CacheAddr addr, u64 size) {          if (size == 0) {              return {};          } @@ -164,17 +195,18 @@ private:      }      using ObjectSet = std::set<T>; -    using ObjectCache = std::unordered_map<VAddr, T>; -    using IntervalCache = boost::icl::interval_map<VAddr, ObjectSet>; +    using ObjectCache = std::unordered_map<CacheAddr, T>; +    using IntervalCache = boost::icl::interval_map<CacheAddr, ObjectSet>;      using ObjectInterval = typename IntervalCache::interval_type;      static auto GetInterval(const T& object) { -        return ObjectInterval::right_open(object->GetAddr(), -                                          object->GetAddr() + object->GetSizeInBytes()); +        return ObjectInterval::right_open(object->GetCacheAddr(), +                                          object->GetCacheAddr() + object->GetSizeInBytes());      }      ObjectCache map_cache;      IntervalCache interval_cache; ///< Cache of objects      u64 modified_ticks{};         ///< Counter of cache state ticks, used for in-order flushing      VideoCore::RasterizerInterface& rasterizer; +    std::recursive_mutex mutex;  }; diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index 6a1dc9cf6..76e292e87 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -35,14 +35,14 @@ public:      virtual void FlushAll() = 0;      /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory -    virtual void FlushRegion(VAddr addr, u64 size) = 0; +    virtual void FlushRegion(CacheAddr addr, u64 size) = 0;      /// Notify rasterizer that any caches of the specified region should be invalidated -    virtual void InvalidateRegion(VAddr addr, u64 size) = 0; +    virtual void InvalidateRegion(CacheAddr addr, u64 size) = 0;      /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory      /// and invalidated -    virtual void FlushAndInvalidateRegion(VAddr addr, u64 size) = 0; +    virtual void FlushAndInvalidateRegion(CacheAddr addr, u64 size) = 0;      /// Attempt to use a faster method to perform a surface copy      virtual bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, @@ -63,7 +63,7 @@ public:      }      /// Increase/decrease the number of object in pages touching the specified region -    virtual void UpdatePagesCachedCount(Tegra::GPUVAddr addr, u64 size, int delta) {} +    virtual void UpdatePagesCachedCount(VAddr addr, u64 size, int delta) {}      /// Initialize disk cached resources for the game being emulated      virtual void LoadDiskResources(const std::atomic_bool& stop_loading = false, diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp index b3062e5ba..a4eea61a6 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp @@ -13,6 +13,11 @@  namespace OpenGL { +CachedBufferEntry::CachedBufferEntry(VAddr cpu_addr, std::size_t size, GLintptr offset, +                                     std::size_t alignment, u8* host_ptr) +    : cpu_addr{cpu_addr}, size{size}, offset{offset}, alignment{alignment}, RasterizerCacheObject{ +                                                                                host_ptr} {} +  OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, std::size_t size)      : RasterizerCache{rasterizer}, stream_buffer(size, true) {} @@ -26,11 +31,12 @@ GLintptr OGLBufferCache::UploadMemory(Tegra::GPUVAddr gpu_addr, std::size_t size      // TODO: Figure out which size is the best for given games.      cache &= size >= 2048; +    const auto& host_ptr{Memory::GetPointer(*cpu_addr)};      if (cache) { -        auto entry = TryGet(*cpu_addr); +        auto entry = TryGet(host_ptr);          if (entry) { -            if (entry->size >= size && entry->alignment == alignment) { -                return entry->offset; +            if (entry->GetSize() >= size && entry->GetAlignment() == alignment) { +                return entry->GetOffset();              }              Unregister(entry);          } @@ -39,17 +45,17 @@ GLintptr OGLBufferCache::UploadMemory(Tegra::GPUVAddr gpu_addr, std::size_t size      AlignBuffer(alignment);      const GLintptr uploaded_offset = buffer_offset; -    Memory::ReadBlock(*cpu_addr, buffer_ptr, size); +    if (!host_ptr) { +        return uploaded_offset; +    } +    std::memcpy(buffer_ptr, host_ptr, size);      buffer_ptr += size;      buffer_offset += size;      if (cache) { -        auto entry = std::make_shared<CachedBufferEntry>(); -        entry->offset = uploaded_offset; -        entry->size = size; -        entry->alignment = alignment; -        entry->addr = *cpu_addr; +        auto entry = std::make_shared<CachedBufferEntry>(*cpu_addr, size, uploaded_offset, +                                                         alignment, host_ptr);          Register(entry);      } diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index c11acfb79..1de1f84ae 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -17,22 +17,39 @@ namespace OpenGL {  class RasterizerOpenGL; -struct CachedBufferEntry final : public RasterizerCacheObject { -    VAddr GetAddr() const override { -        return addr; +class CachedBufferEntry final : public RasterizerCacheObject { +public: +    explicit CachedBufferEntry(VAddr cpu_addr, std::size_t size, GLintptr offset, +                               std::size_t alignment, u8* host_ptr); + +    VAddr GetCpuAddr() const override { +        return cpu_addr;      }      std::size_t GetSizeInBytes() const override {          return size;      } +    std::size_t GetSize() const { +        return size; +    } + +    GLintptr GetOffset() const { +        return offset; +    } + +    std::size_t GetAlignment() const { +        return alignment; +    } +      // We do not have to flush this cache as things in it are never modified by us.      void Flush() override {} -    VAddr addr; -    std::size_t size; -    GLintptr offset; -    std::size_t alignment; +private: +    VAddr cpu_addr{}; +    std::size_t size{}; +    GLintptr offset{}; +    std::size_t alignment{};  };  class OGLBufferCache final : public RasterizerCache<std::shared_ptr<CachedBufferEntry>> { diff --git a/src/video_core/renderer_opengl/gl_global_cache.cpp b/src/video_core/renderer_opengl/gl_global_cache.cpp index 7161d1dea..a2c509c24 100644 --- a/src/video_core/renderer_opengl/gl_global_cache.cpp +++ b/src/video_core/renderer_opengl/gl_global_cache.cpp @@ -15,12 +15,13 @@  namespace OpenGL { -CachedGlobalRegion::CachedGlobalRegion(VAddr addr, u32 size) : addr{addr}, size{size} { +CachedGlobalRegion::CachedGlobalRegion(VAddr cpu_addr, u32 size, u8* host_ptr) +    : cpu_addr{cpu_addr}, size{size}, RasterizerCacheObject{host_ptr} {      buffer.Create();      // Bind and unbind the buffer so it gets allocated by the driver      glBindBuffer(GL_SHADER_STORAGE_BUFFER, buffer.handle);      glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0); -    LabelGLObject(GL_BUFFER, buffer.handle, addr, "GlobalMemory"); +    LabelGLObject(GL_BUFFER, buffer.handle, cpu_addr, "GlobalMemory");  }  void CachedGlobalRegion::Reload(u32 size_) { @@ -35,7 +36,7 @@ void CachedGlobalRegion::Reload(u32 size_) {      // TODO(Rodrigo): Get rid of Memory::GetPointer with a staging buffer      glBindBuffer(GL_SHADER_STORAGE_BUFFER, buffer.handle); -    glBufferData(GL_SHADER_STORAGE_BUFFER, size, Memory::GetPointer(addr), GL_DYNAMIC_DRAW); +    glBufferData(GL_SHADER_STORAGE_BUFFER, size, GetHostPtr(), GL_DYNAMIC_DRAW);  }  GlobalRegion GlobalRegionCacheOpenGL::TryGetReservedGlobalRegion(VAddr addr, u32 size) const { @@ -46,11 +47,11 @@ GlobalRegion GlobalRegionCacheOpenGL::TryGetReservedGlobalRegion(VAddr addr, u32      return search->second;  } -GlobalRegion GlobalRegionCacheOpenGL::GetUncachedGlobalRegion(VAddr addr, u32 size) { +GlobalRegion GlobalRegionCacheOpenGL::GetUncachedGlobalRegion(VAddr addr, u32 size, u8* host_ptr) {      GlobalRegion region{TryGetReservedGlobalRegion(addr, size)};      if (!region) {          // No reserved surface available, create a new one and reserve it -        region = std::make_shared<CachedGlobalRegion>(addr, size); +        region = std::make_shared<CachedGlobalRegion>(addr, size, host_ptr);          ReserveGlobalRegion(region);      }      region->Reload(size); @@ -58,7 +59,7 @@ GlobalRegion GlobalRegionCacheOpenGL::GetUncachedGlobalRegion(VAddr addr, u32 si  }  void GlobalRegionCacheOpenGL::ReserveGlobalRegion(GlobalRegion region) { -    reserve.insert_or_assign(region->GetAddr(), std::move(region)); +    reserve.insert_or_assign(region->GetCpuAddr(), std::move(region));  }  GlobalRegionCacheOpenGL::GlobalRegionCacheOpenGL(RasterizerOpenGL& rasterizer) @@ -80,11 +81,12 @@ GlobalRegion GlobalRegionCacheOpenGL::GetGlobalRegion(      ASSERT(actual_addr);      // Look up global region in the cache based on address -    GlobalRegion region = TryGet(*actual_addr); +    const auto& host_ptr{Memory::GetPointer(*actual_addr)}; +    GlobalRegion region{TryGet(host_ptr)};      if (!region) {          // No global region found - create a new one -        region = GetUncachedGlobalRegion(*actual_addr, size); +        region = GetUncachedGlobalRegion(*actual_addr, size, host_ptr);          Register(region);      } diff --git a/src/video_core/renderer_opengl/gl_global_cache.h b/src/video_core/renderer_opengl/gl_global_cache.h index ba2bdc60c..e497a0619 100644 --- a/src/video_core/renderer_opengl/gl_global_cache.h +++ b/src/video_core/renderer_opengl/gl_global_cache.h @@ -27,14 +27,12 @@ using GlobalRegion = std::shared_ptr<CachedGlobalRegion>;  class CachedGlobalRegion final : public RasterizerCacheObject {  public: -    explicit CachedGlobalRegion(VAddr addr, u32 size); +    explicit CachedGlobalRegion(VAddr cpu_addr, u32 size, u8* host_ptr); -    /// Gets the address of the shader in guest memory, required for cache management -    VAddr GetAddr() const override { -        return addr; +    VAddr GetCpuAddr() const override { +        return cpu_addr;      } -    /// Gets the size of the shader in guest memory, required for cache management      std::size_t GetSizeInBytes() const override {          return size;      } @@ -53,9 +51,8 @@ public:      }  private: -    VAddr addr{}; +    VAddr cpu_addr{};      u32 size{}; -      OGLBuffer buffer;  }; @@ -69,7 +66,7 @@ public:  private:      GlobalRegion TryGetReservedGlobalRegion(VAddr addr, u32 size) const; -    GlobalRegion GetUncachedGlobalRegion(VAddr addr, u32 size); +    GlobalRegion GetUncachedGlobalRegion(VAddr addr, u32 size, u8* host_ptr);      void ReserveGlobalRegion(GlobalRegion region);      std::unordered_map<VAddr, GlobalRegion> reserve; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 976f64c24..bb6de5477 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -449,7 +449,7 @@ static constexpr auto RangeFromInterval(Map& map, const Interval& interval) {      return boost::make_iterator_range(map.equal_range(interval));  } -void RasterizerOpenGL::UpdatePagesCachedCount(Tegra::GPUVAddr addr, u64 size, int delta) { +void RasterizerOpenGL::UpdatePagesCachedCount(VAddr addr, u64 size, int delta) {      const u64 page_start{addr >> Memory::PAGE_BITS};      const u64 page_end{(addr + size + Memory::PAGE_SIZE - 1) >> Memory::PAGE_BITS}; @@ -747,12 +747,12 @@ void RasterizerOpenGL::DrawArrays() {  void RasterizerOpenGL::FlushAll() {} -void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) { +void RasterizerOpenGL::FlushRegion(CacheAddr addr, u64 size) {      MICROPROFILE_SCOPE(OpenGL_CacheManagement);      res_cache.FlushRegion(addr, size);  } -void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) { +void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) {      MICROPROFILE_SCOPE(OpenGL_CacheManagement);      res_cache.InvalidateRegion(addr, size);      shader_cache.InvalidateRegion(addr, size); @@ -760,7 +760,7 @@ void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) {      buffer_cache.InvalidateRegion(addr, size);  } -void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size) { +void RasterizerOpenGL::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {      FlushRegion(addr, size);      InvalidateRegion(addr, size);  } @@ -782,7 +782,7 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,      MICROPROFILE_SCOPE(OpenGL_CacheManagement); -    const auto& surface{res_cache.TryFindFramebufferSurface(framebuffer_addr)}; +    const auto& surface{res_cache.TryFindFramebufferSurface(Memory::GetPointer(framebuffer_addr))};      if (!surface) {          return {};      } diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index ca3de0592..30f3e8acb 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -57,9 +57,9 @@ public:      void DrawArrays() override;      void Clear() override;      void FlushAll() override; -    void FlushRegion(VAddr addr, u64 size) override; -    void InvalidateRegion(VAddr addr, u64 size) override; -    void FlushAndInvalidateRegion(VAddr addr, u64 size) override; +    void FlushRegion(CacheAddr addr, u64 size) override; +    void InvalidateRegion(CacheAddr addr, u64 size) override; +    void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;      bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,                                 const Tegra::Engines::Fermi2D::Regs::Surface& dst,                                 const Common::Rectangle<u32>& src_rect, @@ -67,7 +67,7 @@ public:      bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr,                             u32 pixel_stride) override;      bool AccelerateDrawBatch(bool is_indexed) override; -    void UpdatePagesCachedCount(Tegra::GPUVAddr addr, u64 size, int delta) override; +    void UpdatePagesCachedCount(VAddr addr, u64 size, int delta) override;      void LoadDiskResources(const std::atomic_bool& stop_loading,                             const VideoCore::DiskResourceLoadCallback& callback) override; diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp index bd1409660..451de00e8 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp @@ -61,6 +61,7 @@ void SurfaceParams::InitCacheParameters(Tegra::GPUVAddr gpu_addr_) {      addr = cpu_addr ? *cpu_addr : 0;      gpu_addr = gpu_addr_; +    host_ptr = Memory::GetPointer(addr);      size_in_bytes = SizeInBytesRaw();      if (IsPixelFormatASTC(pixel_format)) { @@ -563,8 +564,8 @@ void RasterizerCacheOpenGL::CopySurface(const Surface& src_surface, const Surfac  }  CachedSurface::CachedSurface(const SurfaceParams& params) -    : params(params), gl_target(SurfaceTargetToGL(params.target)), -      cached_size_in_bytes(params.size_in_bytes) { +    : params{params}, gl_target{SurfaceTargetToGL(params.target)}, +      cached_size_in_bytes{params.size_in_bytes}, RasterizerCacheObject{params.host_ptr} {      texture.Create(gl_target);      // TODO(Rodrigo): Using params.GetRect() returns a different size than using its Mip*(0) @@ -633,10 +634,9 @@ void CachedSurface::LoadGLBuffer() {          const u32 bpp = params.GetFormatBpp() / 8;          const u32 copy_size = params.width * bpp;          if (params.pitch == copy_size) { -            std::memcpy(gl_buffer[0].data(), Memory::GetPointer(params.addr), -                        params.size_in_bytes_gl); +            std::memcpy(gl_buffer[0].data(), params.host_ptr, params.size_in_bytes_gl);          } else { -            const u8* start = Memory::GetPointer(params.addr); +            const u8* start{params.host_ptr};              u8* write_to = gl_buffer[0].data();              for (u32 h = params.height; h > 0; h--) {                  std::memcpy(write_to, start, copy_size); @@ -680,8 +680,6 @@ void CachedSurface::FlushGLBuffer() {      glPixelStorei(GL_PACK_ROW_LENGTH, 0);      Tegra::Texture::ConvertFromHostToGuest(gl_buffer[0].data(), params.pixel_format, params.width,                                             params.height, params.depth, true, true); -    const u8* const texture_src_data = Memory::GetPointer(params.addr); -    ASSERT(texture_src_data);      if (params.is_tiled) {          ASSERT_MSG(params.block_width == 1, "Block width is defined as {} on texture type {}",                     params.block_width, static_cast<u32>(params.target)); @@ -691,9 +689,9 @@ void CachedSurface::FlushGLBuffer() {          const u32 bpp = params.GetFormatBpp() / 8;          const u32 copy_size = params.width * bpp;          if (params.pitch == copy_size) { -            std::memcpy(Memory::GetPointer(params.addr), gl_buffer[0].data(), GetSizeInBytes()); +            std::memcpy(params.host_ptr, gl_buffer[0].data(), GetSizeInBytes());          } else { -            u8* start = Memory::GetPointer(params.addr); +            u8* start{params.host_ptr};              const u8* read_to = gl_buffer[0].data();              for (u32 h = params.height; h > 0; h--) {                  std::memcpy(start, read_to, copy_size); @@ -932,7 +930,7 @@ Surface RasterizerCacheOpenGL::GetSurface(const SurfaceParams& params, bool pres      }      // Look up surface in the cache based on address -    Surface surface{TryGet(params.addr)}; +    Surface surface{TryGet(params.host_ptr)};      if (surface) {          if (surface->GetSurfaceParams().IsCompatibleSurface(params)) {              // Use the cached surface as-is unless it's not synced with memory @@ -986,7 +984,7 @@ void RasterizerCacheOpenGL::FastLayeredCopySurface(const Surface& src_surface,      for (u32 layer = 0; layer < dst_params.depth; layer++) {          for (u32 mipmap = 0; mipmap < dst_params.max_mip_level; mipmap++) {              const VAddr sub_address = address + dst_params.GetMipmapLevelOffset(mipmap); -            const Surface& copy = TryGet(sub_address); +            const Surface& copy = TryGet(Memory::GetPointer(sub_address));              if (!copy)                  continue;              const auto& src_params{copy->GetSurfaceParams()}; @@ -1163,7 +1161,8 @@ void RasterizerCacheOpenGL::AccurateCopySurface(const Surface& src_surface,      const auto& dst_params{dst_surface->GetSurfaceParams()};      // Flush enough memory for both the source and destination surface -    FlushRegion(src_params.addr, std::max(src_params.MemorySize(), dst_params.MemorySize())); +    FlushRegion(ToCacheAddr(src_params.host_ptr), +                std::max(src_params.MemorySize(), dst_params.MemorySize()));      LoadSurface(dst_surface);  } @@ -1215,8 +1214,8 @@ Surface RasterizerCacheOpenGL::RecreateSurface(const Surface& old_surface,      return new_surface;  } -Surface RasterizerCacheOpenGL::TryFindFramebufferSurface(VAddr addr) const { -    return TryGet(addr); +Surface RasterizerCacheOpenGL::TryFindFramebufferSurface(const u8* host_ptr) const { +    return TryGet(host_ptr);  }  void RasterizerCacheOpenGL::ReserveSurface(const Surface& surface) { @@ -1267,7 +1266,7 @@ static bool LayerFitReinterpretSurface(RasterizerCacheOpenGL& cache, const Surfa              src_params.height == dst_params.MipHeight(*level) &&              src_params.block_height >= dst_params.MipBlockHeight(*level)) {              const std::optional<u32> slot = -                TryFindBestLayer(render_surface->GetAddr(), dst_params, *level); +                TryFindBestLayer(render_surface->GetCpuAddr(), dst_params, *level);              if (slot.has_value()) {                  glCopyImageSubData(render_surface->Texture().handle,                                     SurfaceTargetToGL(src_params.target), 0, 0, 0, 0, @@ -1283,8 +1282,8 @@ static bool LayerFitReinterpretSurface(RasterizerCacheOpenGL& cache, const Surfa  }  static bool IsReinterpretInvalid(const Surface render_surface, const Surface blitted_surface) { -    const VAddr bound1 = blitted_surface->GetAddr() + blitted_surface->GetMemorySize(); -    const VAddr bound2 = render_surface->GetAddr() + render_surface->GetMemorySize(); +    const VAddr bound1 = blitted_surface->GetCpuAddr() + blitted_surface->GetMemorySize(); +    const VAddr bound2 = render_surface->GetCpuAddr() + render_surface->GetMemorySize();      if (bound2 > bound1)          return true;      const auto& dst_params = blitted_surface->GetSurfaceParams(); @@ -1327,7 +1326,8 @@ void RasterizerCacheOpenGL::SignalPreDrawCall() {  void RasterizerCacheOpenGL::SignalPostDrawCall() {      for (u32 i = 0; i < Maxwell::NumRenderTargets; i++) {          if (current_color_buffers[i] != nullptr) { -            Surface intersect = CollideOnReinterpretedSurface(current_color_buffers[i]->GetAddr()); +            Surface intersect = +                CollideOnReinterpretedSurface(current_color_buffers[i]->GetCacheAddr());              if (intersect != nullptr) {                  PartialReinterpretSurface(current_color_buffers[i], intersect);                  texception = true; diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h index 9cf6f50be..b3afad139 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h +++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h @@ -297,6 +297,7 @@ struct SurfaceParams {      bool srgb_conversion;      // Parameters used for caching      VAddr addr; +    u8* host_ptr;      Tegra::GPUVAddr gpu_addr;      std::size_t size_in_bytes;      std::size_t size_in_bytes_gl; @@ -345,9 +346,9 @@ class RasterizerOpenGL;  class CachedSurface final : public RasterizerCacheObject {  public: -    CachedSurface(const SurfaceParams& params); +    explicit CachedSurface(const SurfaceParams& params); -    VAddr GetAddr() const override { +    VAddr GetCpuAddr() const override {          return params.addr;      } @@ -449,7 +450,7 @@ public:      Surface GetColorBufferSurface(std::size_t index, bool preserve_contents);      /// Tries to find a framebuffer using on the provided CPU address -    Surface TryFindFramebufferSurface(VAddr addr) const; +    Surface TryFindFramebufferSurface(const u8* host_ptr) const;      /// Copies the contents of one surface to another      void FermiCopySurface(const Tegra::Engines::Fermi2D::Regs::Surface& src_config, @@ -506,12 +507,12 @@ private:      std::array<Surface, Maxwell::NumRenderTargets> current_color_buffers;      Surface last_depth_buffer; -    using SurfaceIntervalCache = boost::icl::interval_map<VAddr, Surface>; +    using SurfaceIntervalCache = boost::icl::interval_map<CacheAddr, Surface>;      using SurfaceInterval = typename SurfaceIntervalCache::interval_type;      static auto GetReinterpretInterval(const Surface& object) { -        return SurfaceInterval::right_open(object->GetAddr() + 1, -                                           object->GetAddr() + object->GetMemorySize() - 1); +        return SurfaceInterval::right_open(object->GetCacheAddr() + 1, +                                           object->GetCacheAddr() + object->GetMemorySize() - 1);      }      // Reinterpreted surfaces are very fragil as the game may keep rendering into them. @@ -523,7 +524,7 @@ private:          reinterpret_surface->MarkReinterpreted();      } -    Surface CollideOnReinterpretedSurface(VAddr addr) const { +    Surface CollideOnReinterpretedSurface(CacheAddr addr) const {          const SurfaceInterval interval{addr};          for (auto& pair :               boost::make_iterator_range(reinterpreted_surfaces.equal_range(interval))) { diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 4883e4f62..60a04e146 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -42,9 +42,9 @@ VAddr GetShaderAddress(Maxwell::ShaderProgram program) {  }  /// Gets the shader program code from memory for the specified address -ProgramCode GetShaderCode(VAddr addr) { +ProgramCode GetShaderCode(const u8* host_ptr) {      ProgramCode program_code(VideoCommon::Shader::MAX_PROGRAM_LENGTH); -    Memory::ReadBlock(addr, program_code.data(), program_code.size() * sizeof(u64)); +    std::memcpy(program_code.data(), host_ptr, program_code.size() * sizeof(u64));      return program_code;  } @@ -214,12 +214,13 @@ std::set<GLenum> GetSupportedFormats() {  } // namespace -CachedShader::CachedShader(VAddr addr, u64 unique_identifier, Maxwell::ShaderProgram program_type, -                           ShaderDiskCacheOpenGL& disk_cache, +CachedShader::CachedShader(VAddr guest_addr, u64 unique_identifier, +                           Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache,                             const PrecompiledPrograms& precompiled_programs, -                           ProgramCode&& program_code, ProgramCode&& program_code_b) -    : addr{addr}, unique_identifier{unique_identifier}, program_type{program_type}, -      disk_cache{disk_cache}, precompiled_programs{precompiled_programs} { +                           ProgramCode&& program_code, ProgramCode&& program_code_b, u8* host_ptr) +    : host_ptr{host_ptr}, guest_addr{guest_addr}, unique_identifier{unique_identifier}, +      program_type{program_type}, disk_cache{disk_cache}, +      precompiled_programs{precompiled_programs}, RasterizerCacheObject{host_ptr} {      const std::size_t code_size = CalculateProgramSize(program_code);      const std::size_t code_size_b = @@ -243,12 +244,13 @@ CachedShader::CachedShader(VAddr addr, u64 unique_identifier, Maxwell::ShaderPro      disk_cache.SaveRaw(raw);  } -CachedShader::CachedShader(VAddr addr, u64 unique_identifier, Maxwell::ShaderProgram program_type, -                           ShaderDiskCacheOpenGL& disk_cache, +CachedShader::CachedShader(VAddr guest_addr, u64 unique_identifier, +                           Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache,                             const PrecompiledPrograms& precompiled_programs, -                           GLShader::ProgramResult result) -    : addr{addr}, unique_identifier{unique_identifier}, program_type{program_type}, -      disk_cache{disk_cache}, precompiled_programs{precompiled_programs} { +                           GLShader::ProgramResult result, u8* host_ptr) +    : guest_addr{guest_addr}, unique_identifier{unique_identifier}, program_type{program_type}, +      disk_cache{disk_cache}, precompiled_programs{precompiled_programs}, RasterizerCacheObject{ +                                                                              host_ptr} {      code = std::move(result.first);      entries = result.second; @@ -271,7 +273,7 @@ std::tuple<GLuint, BaseBindings> CachedShader::GetProgramHandle(GLenum primitive                  disk_cache.SaveUsage(GetUsage(primitive_mode, base_bindings));              } -            LabelGLObject(GL_PROGRAM, program->handle, addr); +            LabelGLObject(GL_PROGRAM, program->handle, guest_addr);          }          handle = program->handle; @@ -323,7 +325,7 @@ GLuint CachedShader::LazyGeometryProgram(CachedProgram& target_program, BaseBind          disk_cache.SaveUsage(GetUsage(primitive_mode, base_bindings));      } -    LabelGLObject(GL_PROGRAM, target_program->handle, addr, debug_name); +    LabelGLObject(GL_PROGRAM, target_program->handle, guest_addr, debug_name);      return target_program->handle;  }; @@ -489,14 +491,17 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {      const VAddr program_addr{GetShaderAddress(program)};      // Look up shader in the cache based on address -    Shader shader{TryGet(program_addr)}; +    const auto& host_ptr{Memory::GetPointer(program_addr)}; +    Shader shader{TryGet(host_ptr)};      if (!shader) {          // No shader found - create a new one -        ProgramCode program_code = GetShaderCode(program_addr); +        const auto& host_ptr{Memory::GetPointer(program_addr)}; +        ProgramCode program_code{GetShaderCode(host_ptr)};          ProgramCode program_code_b;          if (program == Maxwell::ShaderProgram::VertexA) { -            program_code_b = GetShaderCode(GetShaderAddress(Maxwell::ShaderProgram::VertexB)); +            program_code_b = GetShaderCode( +                Memory::GetPointer(GetShaderAddress(Maxwell::ShaderProgram::VertexB)));          }          const u64 unique_identifier = GetUniqueIdentifier(program, program_code, program_code_b); @@ -504,11 +509,11 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {          if (found != precompiled_shaders.end()) {              shader =                  std::make_shared<CachedShader>(program_addr, unique_identifier, program, disk_cache, -                                               precompiled_programs, found->second); +                                               precompiled_programs, found->second, host_ptr);          } else {              shader = std::make_shared<CachedShader>(                  program_addr, unique_identifier, program, disk_cache, precompiled_programs, -                std::move(program_code), std::move(program_code_b)); +                std::move(program_code), std::move(program_code_b), host_ptr);          }          Register(shader);      } diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index 97eed192f..81fe716b4 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -39,18 +39,18 @@ using PrecompiledShaders = std::unordered_map<u64, GLShader::ProgramResult>;  class CachedShader final : public RasterizerCacheObject {  public: -    explicit CachedShader(VAddr addr, u64 unique_identifier, Maxwell::ShaderProgram program_type, -                          ShaderDiskCacheOpenGL& disk_cache, +    explicit CachedShader(VAddr guest_addr, u64 unique_identifier, +                          Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache,                            const PrecompiledPrograms& precompiled_programs, -                          ProgramCode&& program_code, ProgramCode&& program_code_b); +                          ProgramCode&& program_code, ProgramCode&& program_code_b, u8* host_ptr); -    explicit CachedShader(VAddr addr, u64 unique_identifier, Maxwell::ShaderProgram program_type, -                          ShaderDiskCacheOpenGL& disk_cache, +    explicit CachedShader(VAddr guest_addr, u64 unique_identifier, +                          Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache,                            const PrecompiledPrograms& precompiled_programs, -                          GLShader::ProgramResult result); +                          GLShader::ProgramResult result, u8* host_ptr); -    VAddr GetAddr() const override { -        return addr; +    VAddr GetCpuAddr() const override { +        return guest_addr;      }      std::size_t GetSizeInBytes() const override { @@ -91,7 +91,8 @@ private:      ShaderDiskCacheUsage GetUsage(GLenum primitive_mode, BaseBindings base_bindings) const; -    VAddr addr{}; +    u8* host_ptr{}; +    VAddr guest_addr{};      u64 unique_identifier{};      Maxwell::ShaderProgram program_type{};      ShaderDiskCacheOpenGL& disk_cache; diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index 4a33a6c84..95eab3fec 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -17,6 +17,11 @@  namespace Vulkan { +CachedBufferEntry::CachedBufferEntry(VAddr cpu_addr, std::size_t size, u64 offset, +                                     std::size_t alignment, u8* host_ptr) +    : cpu_addr{cpu_addr}, size{size}, offset{offset}, alignment{alignment}, RasterizerCacheObject{ +                                                                                host_ptr} {} +  VKBufferCache::VKBufferCache(Tegra::MemoryManager& tegra_memory_manager,                               VideoCore::RasterizerInterface& rasterizer, const VKDevice& device,                               VKMemoryManager& memory_manager, VKScheduler& scheduler, u64 size) @@ -37,16 +42,18 @@ VKBufferCache::~VKBufferCache() = default;  u64 VKBufferCache::UploadMemory(Tegra::GPUVAddr gpu_addr, std::size_t size, u64 alignment,                                  bool cache) {      const auto cpu_addr{tegra_memory_manager.GpuToCpuAddress(gpu_addr)}; -    ASSERT(cpu_addr); +    ASSERT_MSG(cpu_addr, "Invalid GPU address");      // Cache management is a big overhead, so only cache entries with a given size.      // TODO: Figure out which size is the best for given games.      cache &= size >= 2048; +    const auto& host_ptr{Memory::GetPointer(*cpu_addr)};      if (cache) { -        if (auto entry = TryGet(*cpu_addr); entry) { -            if (entry->size >= size && entry->alignment == alignment) { -                return entry->offset; +        auto entry = TryGet(host_ptr); +        if (entry) { +            if (entry->GetSize() >= size && entry->GetAlignment() == alignment) { +                return entry->GetOffset();              }              Unregister(entry);          } @@ -55,17 +62,17 @@ u64 VKBufferCache::UploadMemory(Tegra::GPUVAddr gpu_addr, std::size_t size, u64      AlignBuffer(alignment);      const u64 uploaded_offset = buffer_offset; -    Memory::ReadBlock(*cpu_addr, buffer_ptr, size); +    if (!host_ptr) { +        return uploaded_offset; +    } +    std::memcpy(buffer_ptr, host_ptr, size);      buffer_ptr += size;      buffer_offset += size;      if (cache) { -        auto entry = std::make_shared<CachedBufferEntry>(); -        entry->offset = uploaded_offset; -        entry->size = size; -        entry->alignment = alignment; -        entry->addr = *cpu_addr; +        auto entry = std::make_shared<CachedBufferEntry>(*cpu_addr, size, uploaded_offset, +                                                         alignment, host_ptr);          Register(entry);      } diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h index d8e916f31..8b415744b 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.h +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h @@ -24,22 +24,39 @@ class VKFence;  class VKMemoryManager;  class VKStreamBuffer; -struct CachedBufferEntry final : public RasterizerCacheObject { -    VAddr GetAddr() const override { -        return addr; +class CachedBufferEntry final : public RasterizerCacheObject { +public: +    explicit CachedBufferEntry(VAddr cpu_addr, std::size_t size, u64 offset, std::size_t alignment, +                               u8* host_ptr); + +    VAddr GetCpuAddr() const override { +        return cpu_addr;      }      std::size_t GetSizeInBytes() const override {          return size;      } +    std::size_t GetSize() const { +        return size; +    } + +    u64 GetOffset() const { +        return offset; +    } + +    std::size_t GetAlignment() const { +        return alignment; +    } +      // We do not have to flush this cache as things in it are never modified by us.      void Flush() override {} -    VAddr addr; -    std::size_t size; -    u64 offset; -    std::size_t alignment; +private: +    VAddr cpu_addr{}; +    std::size_t size{}; +    u64 offset{}; +    std::size_t alignment{};  };  class VKBufferCache final : public RasterizerCache<std::shared_ptr<CachedBufferEntry>> { | 
