diff options
| -rw-r--r-- | src/core/memory.cpp | 2 | ||||
| -rw-r--r-- | src/tests/video_core/buffer_base.cpp | 2 | ||||
| -rw-r--r-- | src/video_core/CMakeLists.txt | 1 | ||||
| -rw-r--r-- | src/video_core/buffer_cache/buffer_base.h | 14 | ||||
| -rw-r--r-- | src/video_core/engines/engine_upload.cpp | 2 | ||||
| -rw-r--r-- | src/video_core/engines/fermi_2d.cpp | 6 | ||||
| -rw-r--r-- | src/video_core/engines/fermi_2d.h | 1 | ||||
| -rw-r--r-- | src/video_core/engines/maxwell_3d.cpp | 7 | ||||
| -rw-r--r-- | src/video_core/engines/maxwell_dma.cpp | 21 | ||||
| -rw-r--r-- | src/video_core/invalidation_accumulator.h | 79 | ||||
| -rw-r--r-- | src/video_core/memory_manager.cpp | 102 | ||||
| -rw-r--r-- | src/video_core/memory_manager.h | 18 | ||||
| -rw-r--r-- | src/video_core/rasterizer_interface.h | 7 | ||||
| -rw-r--r-- | src/video_core/renderer_vulkan/vk_rasterizer.cpp | 23 | ||||
| -rw-r--r-- | src/video_core/renderer_vulkan/vk_rasterizer.h | 1 | 
15 files changed, 233 insertions, 53 deletions
| diff --git a/src/core/memory.cpp b/src/core/memory.cpp index 26be74df4..a1e41faff 100644 --- a/src/core/memory.cpp +++ b/src/core/memory.cpp @@ -436,7 +436,7 @@ struct Memory::Impl {          }          if (Settings::IsFastmemEnabled()) { -            const bool is_read_enable = Settings::IsGPULevelHigh() || !cached; +            const bool is_read_enable = !Settings::IsGPULevelExtreme() || !cached;              system.DeviceMemory().buffer.Protect(vaddr, size, is_read_enable, !cached);          } diff --git a/src/tests/video_core/buffer_base.cpp b/src/tests/video_core/buffer_base.cpp index f7236afab..5cd0628f2 100644 --- a/src/tests/video_core/buffer_base.cpp +++ b/src/tests/video_core/buffer_base.cpp @@ -538,7 +538,7 @@ TEST_CASE("BufferBase: Cached write downloads") {      int num = 0;      buffer.ForEachDownloadRangeAndClear(c, WORD, [&](u64 offset, u64 size) { ++num; });      buffer.ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { ++num; }); -    REQUIRE(num == 0); +    REQUIRE(num == 1);      REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE));      REQUIRE(!buffer.IsRegionGpuModified(c + PAGE, PAGE));      buffer.FlushCachedWrites(); diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index aa271a377..b7095ae13 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -85,6 +85,7 @@ add_library(video_core STATIC      gpu.h      gpu_thread.cpp      gpu_thread.h +    invalidation_accumulator.h      memory_manager.cpp      memory_manager.h      precompiled_headers.h diff --git a/src/video_core/buffer_cache/buffer_base.h b/src/video_core/buffer_cache/buffer_base.h index 92d77eef2..c47b7d866 100644 --- a/src/video_core/buffer_cache/buffer_base.h +++ b/src/video_core/buffer_cache/buffer_base.h @@ -430,7 +430,7 @@ private:          if (query_begin >= SizeBytes() || size < 0) {              return;          } -        u64* const untracked_words = Array<Type::Untracked>(); +        [[maybe_unused]] u64* const untracked_words = Array<Type::Untracked>();          u64* const state_words = Array<type>();          const u64 query_end = query_begin + std::min(static_cast<u64>(size), SizeBytes());          u64* const words_begin = state_words + query_begin / BYTES_PER_WORD; @@ -483,7 +483,7 @@ private:                  NotifyRasterizer<true>(word_index, current_bits, ~u64{0});              }              // Exclude CPU modified pages when visiting GPU pages -            const u64 word = current_word & ~(type == Type::GPU ? untracked_words[word_index] : 0); +            const u64 word = current_word;              u64 page = page_begin;              page_begin = 0; @@ -531,7 +531,7 @@ private:      [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept {          static_assert(type != Type::Untracked); -        const u64* const untracked_words = Array<Type::Untracked>(); +        [[maybe_unused]] const u64* const untracked_words = Array<Type::Untracked>();          const u64* const state_words = Array<type>();          const u64 num_query_words = size / BYTES_PER_WORD + 1;          const u64 word_begin = offset / BYTES_PER_WORD; @@ -539,8 +539,7 @@ private:          const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE);          u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD;          for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) { -            const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0; -            const u64 word = state_words[word_index] & ~off_word; +            const u64 word = state_words[word_index];              if (word == 0) {                  continue;              } @@ -564,7 +563,7 @@ private:      [[nodiscard]] std::pair<u64, u64> ModifiedRegion(u64 offset, u64 size) const noexcept {          static_assert(type != Type::Untracked); -        const u64* const untracked_words = Array<Type::Untracked>(); +        [[maybe_unused]] const u64* const untracked_words = Array<Type::Untracked>();          const u64* const state_words = Array<type>();          const u64 num_query_words = size / BYTES_PER_WORD + 1;          const u64 word_begin = offset / BYTES_PER_WORD; @@ -574,8 +573,7 @@ private:          u64 begin = std::numeric_limits<u64>::max();          u64 end = 0;          for (u64 word_index = word_begin; word_index < word_end; ++word_index) { -            const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0; -            const u64 word = state_words[word_index] & ~off_word; +            const u64 word = state_words[word_index];              if (word == 0) {                  continue;              } diff --git a/src/video_core/engines/engine_upload.cpp b/src/video_core/engines/engine_upload.cpp index cea1dd8b0..7f5a0c29d 100644 --- a/src/video_core/engines/engine_upload.cpp +++ b/src/video_core/engines/engine_upload.cpp @@ -76,7 +76,7 @@ void State::ProcessData(std::span<const u8> read_buffer) {                                         regs.dest.height, regs.dest.depth, x_offset, regs.dest.y,                                         x_elements, regs.line_count, regs.dest.BlockHeight(),                                         regs.dest.BlockDepth(), regs.line_length_in); -        memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size); +        memory_manager.WriteBlockCached(address, tmp_buffer.data(), dst_size);      }  } diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp index e655e7254..a126c359c 100644 --- a/src/video_core/engines/fermi_2d.cpp +++ b/src/video_core/engines/fermi_2d.cpp @@ -6,6 +6,7 @@  #include "common/microprofile.h"  #include "video_core/engines/fermi_2d.h"  #include "video_core/engines/sw_blitter/blitter.h" +#include "video_core/memory_manager.h"  #include "video_core/rasterizer_interface.h"  #include "video_core/surface.h"  #include "video_core/textures/decoders.h" @@ -20,8 +21,8 @@ namespace Tegra::Engines {  using namespace Texture; -Fermi2D::Fermi2D(MemoryManager& memory_manager_) { -    sw_blitter = std::make_unique<Blitter::SoftwareBlitEngine>(memory_manager_); +Fermi2D::Fermi2D(MemoryManager& memory_manager_) : memory_manager{memory_manager_} { +    sw_blitter = std::make_unique<Blitter::SoftwareBlitEngine>(memory_manager);      // Nvidia's OpenGL driver seems to assume these values      regs.src.depth = 1;      regs.dst.depth = 1; @@ -104,6 +105,7 @@ void Fermi2D::Blit() {          config.src_x0 = 0;      } +    memory_manager.FlushCaching();      if (!rasterizer->AccelerateSurfaceCopy(src, regs.dst, config)) {          sw_blitter->Blit(src, regs.dst, config);      } diff --git a/src/video_core/engines/fermi_2d.h b/src/video_core/engines/fermi_2d.h index 523fbdec2..705b323e1 100644 --- a/src/video_core/engines/fermi_2d.h +++ b/src/video_core/engines/fermi_2d.h @@ -305,6 +305,7 @@ public:  private:      VideoCore::RasterizerInterface* rasterizer = nullptr;      std::unique_ptr<Blitter::SoftwareBlitEngine> sw_blitter; +    MemoryManager& memory_manager;      /// Performs the copy from the source surface to the destination surface as configured in the      /// registers. diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index fbfd1ddd2..97f547789 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -485,11 +485,6 @@ void Maxwell3D::StampQueryResult(u64 payload, bool long_query) {  }  void Maxwell3D::ProcessQueryGet() { -    // TODO(Subv): Support the other query units. -    if (regs.report_semaphore.query.location != Regs::ReportSemaphore::Location::All) { -        LOG_DEBUG(HW_GPU, "Locations other than ALL are unimplemented"); -    } -      switch (regs.report_semaphore.query.operation) {      case Regs::ReportSemaphore::Operation::Release:          if (regs.report_semaphore.query.short_query != 0) { @@ -649,7 +644,7 @@ void Maxwell3D::ProcessCBMultiData(const u32* start_base, u32 amount) {      const GPUVAddr address{buffer_address + regs.const_buffer.offset};      const size_t copy_size = amount * sizeof(u32); -    memory_manager.WriteBlock(address, start_base, copy_size); +    memory_manager.WriteBlockCached(address, start_base, copy_size);      // Increment the current buffer position.      regs.const_buffer.offset += static_cast<u32>(copy_size); diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index 01f70ea9e..7762c7d96 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -69,7 +69,7 @@ void MaxwellDMA::Launch() {      if (launch.multi_line_enable) {          const bool is_src_pitch = launch.src_memory_layout == LaunchDMA::MemoryLayout::PITCH;          const bool is_dst_pitch = launch.dst_memory_layout == LaunchDMA::MemoryLayout::PITCH; - +        memory_manager.FlushCaching();          if (!is_src_pitch && !is_dst_pitch) {              // If both the source and the destination are in block layout, assert.              CopyBlockLinearToBlockLinear(); @@ -104,6 +104,7 @@ void MaxwellDMA::Launch() {                                              reinterpret_cast<u8*>(tmp_buffer.data()),                                              regs.line_length_in * sizeof(u32));          } else { +            memory_manager.FlushCaching();              const auto convert_linear_2_blocklinear_addr = [](u64 address) {                  return (address & ~0x1f0ULL) | ((address & 0x40) >> 2) | ((address & 0x10) << 1) |                         ((address & 0x180) >> 1) | ((address & 0x20) << 3); @@ -121,8 +122,8 @@ void MaxwellDMA::Launch() {                      memory_manager.ReadBlockUnsafe(                          convert_linear_2_blocklinear_addr(regs.offset_in + offset),                          tmp_buffer.data(), tmp_buffer.size()); -                    memory_manager.WriteBlock(regs.offset_out + offset, tmp_buffer.data(), -                                              tmp_buffer.size()); +                    memory_manager.WriteBlockCached(regs.offset_out + offset, tmp_buffer.data(), +                                                    tmp_buffer.size());                  }              } else if (is_src_pitch && !is_dst_pitch) {                  UNIMPLEMENTED_IF(regs.line_length_in % 16 != 0); @@ -132,7 +133,7 @@ void MaxwellDMA::Launch() {                  for (u32 offset = 0; offset < regs.line_length_in; offset += 16) {                      memory_manager.ReadBlockUnsafe(regs.offset_in + offset, tmp_buffer.data(),                                                     tmp_buffer.size()); -                    memory_manager.WriteBlock( +                    memory_manager.WriteBlockCached(                          convert_linear_2_blocklinear_addr(regs.offset_out + offset),                          tmp_buffer.data(), tmp_buffer.size());                  } @@ -141,8 +142,8 @@ void MaxwellDMA::Launch() {                      std::vector<u8> tmp_buffer(regs.line_length_in);                      memory_manager.ReadBlockUnsafe(regs.offset_in, tmp_buffer.data(),                                                     regs.line_length_in); -                    memory_manager.WriteBlock(regs.offset_out, tmp_buffer.data(), -                                              regs.line_length_in); +                    memory_manager.WriteBlockCached(regs.offset_out, tmp_buffer.data(), +                                                    regs.line_length_in);                  }              }          } @@ -204,7 +205,7 @@ void MaxwellDMA::CopyBlockLinearToPitch() {                       src_params.origin.y, x_elements, regs.line_count, block_height, block_depth,                       regs.pitch_out); -    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size); +    memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);  }  void MaxwellDMA::CopyPitchToBlockLinear() { @@ -256,7 +257,7 @@ void MaxwellDMA::CopyPitchToBlockLinear() {                     dst_params.origin.y, x_elements, regs.line_count, block_height, block_depth,                     regs.pitch_in); -    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size); +    memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);  }  void MaxwellDMA::FastCopyBlockLinearToPitch() { @@ -287,7 +288,7 @@ void MaxwellDMA::FastCopyBlockLinearToPitch() {                       regs.src_params.block_size.height, regs.src_params.block_size.depth,                       regs.pitch_out); -    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size); +    memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);  }  void MaxwellDMA::CopyBlockLinearToBlockLinear() { @@ -347,7 +348,7 @@ void MaxwellDMA::CopyBlockLinearToBlockLinear() {                     dst.depth, dst_x_offset, dst.origin.y, x_elements, regs.line_count,                     dst.block_size.height, dst.block_size.depth, pitch); -    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size); +    memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);  }  void MaxwellDMA::ReleaseSemaphore() { diff --git a/src/video_core/invalidation_accumulator.h b/src/video_core/invalidation_accumulator.h new file mode 100644 index 000000000..2c2aaf7bb --- /dev/null +++ b/src/video_core/invalidation_accumulator.h @@ -0,0 +1,79 @@ +// SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include <utility> +#include <vector> + +#include "common/common_types.h" + +namespace VideoCommon { + +class InvalidationAccumulator { +public: +    InvalidationAccumulator() = default; +    ~InvalidationAccumulator() = default; + +    void Add(GPUVAddr address, size_t size) { +        const auto reset_values = [&]() { +            if (has_collected) { +                buffer.emplace_back(start_address, accumulated_size); +            } +            start_address = address; +            accumulated_size = size; +            last_collection = start_address + size; +        }; +        if (address >= start_address && address + size <= last_collection) [[likely]] { +            return; +        } +        size = ((address + size + atomicity_size_mask) & atomicity_mask) - address; +        address = address & atomicity_mask; +        if (!has_collected) [[unlikely]] { +            reset_values(); +            has_collected = true; +            return; +        } +        if (address != last_collection) [[unlikely]] { +            reset_values(); +            return; +        } +        accumulated_size += size; +        last_collection += size; +    } + +    void Clear() { +        buffer.clear(); +        start_address = 0; +        last_collection = 0; +        has_collected = false; +    } + +    bool AnyAccumulated() const { +        return has_collected; +    } + +    template <typename Func> +    void Callback(Func&& func) { +        if (!has_collected) { +            return; +        } +        buffer.emplace_back(start_address, accumulated_size); +        for (auto& [address, size] : buffer) { +            func(address, size); +        } +    } + +private: +    static constexpr size_t atomicity_bits = 5; +    static constexpr size_t atomicity_size = 1ULL << atomicity_bits; +    static constexpr size_t atomicity_size_mask = atomicity_size - 1; +    static constexpr size_t atomicity_mask = ~atomicity_size_mask; +    GPUVAddr start_address{}; +    GPUVAddr last_collection{}; +    size_t accumulated_size{}; +    bool has_collected{}; +    std::vector<std::pair<VAddr, size_t>> buffer; +}; + +} // namespace VideoCommon diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index 3a5cdeb39..3bcae3503 100644 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -6,11 +6,13 @@  #include "common/alignment.h"  #include "common/assert.h"  #include "common/logging/log.h" +#include "common/settings.h"  #include "core/core.h"  #include "core/device_memory.h"  #include "core/hle/kernel/k_page_table.h"  #include "core/hle/kernel/k_process.h"  #include "core/memory.h" +#include "video_core/invalidation_accumulator.h"  #include "video_core/memory_manager.h"  #include "video_core/rasterizer_interface.h"  #include "video_core/renderer_base.h" @@ -26,7 +28,8 @@ MemoryManager::MemoryManager(Core::System& system_, u64 address_space_bits_, u64        entries{}, big_entries{}, page_table{address_space_bits, address_space_bits + page_bits - 38,                                             page_bits != big_page_bits ? page_bits : 0},        kind_map{PTEKind::INVALID}, unique_identifier{unique_identifier_generator.fetch_add( -                                      1, std::memory_order_acq_rel)} { +                                      1, std::memory_order_acq_rel)}, +      accumulator{std::make_unique<VideoCommon::InvalidationAccumulator>()} {      address_space_size = 1ULL << address_space_bits;      page_size = 1ULL << page_bits;      page_mask = page_size - 1ULL; @@ -43,6 +46,11 @@ MemoryManager::MemoryManager(Core::System& system_, u64 address_space_bits_, u64      big_page_table_cpu.resize(big_page_table_size);      big_page_continous.resize(big_page_table_size / continous_bits, 0);      entries.resize(page_table_size / 32, 0); +    if (!Settings::IsGPULevelExtreme() && Settings::IsFastmemEnabled()) { +        fastmem_arena = system.DeviceMemory().buffer.VirtualBasePointer(); +    } else { +        fastmem_arena = nullptr; +    }  }  MemoryManager::~MemoryManager() = default; @@ -185,15 +193,12 @@ void MemoryManager::Unmap(GPUVAddr gpu_addr, std::size_t size) {      if (size == 0) {          return;      } -    const auto submapped_ranges = GetSubmappedRange(gpu_addr, size); - -    for (const auto& [map_addr, map_size] : submapped_ranges) { -        // Flush and invalidate through the GPU interface, to be asynchronous if possible. -        const std::optional<VAddr> cpu_addr = GpuToCpuAddress(map_addr); -        ASSERT(cpu_addr); +    GetSubmappedRangeImpl<false>(gpu_addr, size, page_stash); -        rasterizer->UnmapMemory(*cpu_addr, map_size); +    for (const auto& [map_addr, map_size] : page_stash) { +        rasterizer->UnmapMemory(map_addr, map_size);      } +    page_stash.clear();      BigPageTableOp<EntryType::Free>(gpu_addr, 0, size, PTEKind::INVALID);      PageTableOp<EntryType::Free>(gpu_addr, 0, size, PTEKind::INVALID); @@ -355,7 +360,7 @@ inline void MemoryManager::MemoryOperation(GPUVAddr gpu_src_addr, std::size_t si      }  } -template <bool is_safe> +template <bool is_safe, bool use_fastmem>  void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size,                                    [[maybe_unused]] VideoCommon::CacheType which) const {      auto set_to_zero = [&]([[maybe_unused]] std::size_t page_index, @@ -369,8 +374,12 @@ void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std:          if constexpr (is_safe) {              rasterizer->FlushRegion(cpu_addr_base, copy_amount, which);          } -        u8* physical = memory.GetPointer(cpu_addr_base); -        std::memcpy(dest_buffer, physical, copy_amount); +        if constexpr (use_fastmem) { +            std::memcpy(dest_buffer, &fastmem_arena[cpu_addr_base], copy_amount); +        } else { +            u8* physical = memory.GetPointer(cpu_addr_base); +            std::memcpy(dest_buffer, physical, copy_amount); +        }          dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount;      };      auto mapped_big = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) { @@ -379,11 +388,15 @@ void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std:          if constexpr (is_safe) {              rasterizer->FlushRegion(cpu_addr_base, copy_amount, which);          } -        if (!IsBigPageContinous(page_index)) [[unlikely]] { -            memory.ReadBlockUnsafe(cpu_addr_base, dest_buffer, copy_amount); +        if constexpr (use_fastmem) { +            std::memcpy(dest_buffer, &fastmem_arena[cpu_addr_base], copy_amount);          } else { -            u8* physical = memory.GetPointer(cpu_addr_base); -            std::memcpy(dest_buffer, physical, copy_amount); +            if (!IsBigPageContinous(page_index)) [[unlikely]] { +                memory.ReadBlockUnsafe(cpu_addr_base, dest_buffer, copy_amount); +            } else { +                u8* physical = memory.GetPointer(cpu_addr_base); +                std::memcpy(dest_buffer, physical, copy_amount); +            }          }          dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount;      }; @@ -397,12 +410,20 @@ void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std:  void MemoryManager::ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size,                                VideoCommon::CacheType which) const { -    ReadBlockImpl<true>(gpu_src_addr, dest_buffer, size, which); +    if (fastmem_arena) [[likely]] { +        ReadBlockImpl<true, true>(gpu_src_addr, dest_buffer, size, which); +        return; +    } +    ReadBlockImpl<true, false>(gpu_src_addr, dest_buffer, size, which);  }  void MemoryManager::ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer,                                      const std::size_t size) const { -    ReadBlockImpl<false>(gpu_src_addr, dest_buffer, size, VideoCommon::CacheType::None); +    if (fastmem_arena) [[likely]] { +        ReadBlockImpl<false, true>(gpu_src_addr, dest_buffer, size, VideoCommon::CacheType::None); +        return; +    } +    ReadBlockImpl<false, false>(gpu_src_addr, dest_buffer, size, VideoCommon::CacheType::None);  }  template <bool is_safe> @@ -454,6 +475,12 @@ void MemoryManager::WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buf      WriteBlockImpl<false>(gpu_dest_addr, src_buffer, size, VideoCommon::CacheType::None);  } +void MemoryManager::WriteBlockCached(GPUVAddr gpu_dest_addr, const void* src_buffer, +                                     std::size_t size) { +    WriteBlockImpl<false>(gpu_dest_addr, src_buffer, size, VideoCommon::CacheType::None); +    accumulator->Add(gpu_dest_addr, size); +} +  void MemoryManager::FlushRegion(GPUVAddr gpu_addr, size_t size,                                  VideoCommon::CacheType which) const {      auto do_nothing = [&]([[maybe_unused]] std::size_t page_index, @@ -663,7 +690,17 @@ bool MemoryManager::IsFullyMappedRange(GPUVAddr gpu_addr, std::size_t size) cons  std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(      GPUVAddr gpu_addr, std::size_t size) const {      std::vector<std::pair<GPUVAddr, std::size_t>> result{}; -    std::optional<std::pair<GPUVAddr, std::size_t>> last_segment{}; +    GetSubmappedRangeImpl<true>(gpu_addr, size, result); +    return result; +} + +template <bool is_gpu_address> +void MemoryManager::GetSubmappedRangeImpl( +    GPUVAddr gpu_addr, std::size_t size, +    std::vector<std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>>& +        result) const { +    std::optional<std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>> +        last_segment{};      std::optional<VAddr> old_page_addr{};      const auto split = [&last_segment, &result]([[maybe_unused]] std::size_t page_index,                                                  [[maybe_unused]] std::size_t offset, @@ -685,8 +722,12 @@ std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(          }          old_page_addr = {cpu_addr_base + copy_amount};          if (!last_segment) { -            const GPUVAddr new_base_addr = (page_index << big_page_bits) + offset; -            last_segment = {new_base_addr, copy_amount}; +            if constexpr (is_gpu_address) { +                const GPUVAddr new_base_addr = (page_index << big_page_bits) + offset; +                last_segment = {new_base_addr, copy_amount}; +            } else { +                last_segment = {cpu_addr_base, copy_amount}; +            }          } else {              last_segment->second += copy_amount;          } @@ -703,8 +744,12 @@ std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(          }          old_page_addr = {cpu_addr_base + copy_amount};          if (!last_segment) { -            const GPUVAddr new_base_addr = (page_index << page_bits) + offset; -            last_segment = {new_base_addr, copy_amount}; +            if constexpr (is_gpu_address) { +                const GPUVAddr new_base_addr = (page_index << page_bits) + offset; +                last_segment = {new_base_addr, copy_amount}; +            } else { +                last_segment = {cpu_addr_base, copy_amount}; +            }          } else {              last_segment->second += copy_amount;          } @@ -715,7 +760,18 @@ std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(      };      MemoryOperation<true>(gpu_addr, size, extend_size_big, split, do_short_pages);      split(0, 0, 0); -    return result; +} + +void MemoryManager::FlushCaching() { +    if (!accumulator->AnyAccumulated()) { +        return; +    } +    accumulator->Callback([this](GPUVAddr addr, size_t size) { +        GetSubmappedRangeImpl<false>(addr, size, page_stash); +    }); +    rasterizer->InnerInvalidation(page_stash); +    page_stash.clear(); +    accumulator->Clear();  }  } // namespace Tegra diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index 828e13439..2936364f0 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -19,6 +19,10 @@ namespace VideoCore {  class RasterizerInterface;  } +namespace VideoCommon { +class InvalidationAccumulator; +} +  namespace Core {  class DeviceMemory;  namespace Memory { @@ -80,6 +84,7 @@ public:       */      void ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const;      void WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size); +    void WriteBlockCached(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size);      /**       * Checks if a gpu region can be simply read with a pointer. @@ -129,12 +134,14 @@ public:      size_t GetMemoryLayoutSize(GPUVAddr gpu_addr,                                 size_t max_size = std::numeric_limits<size_t>::max()) const; +    void FlushCaching(); +  private:      template <bool is_big_pages, typename FuncMapped, typename FuncReserved, typename FuncUnmapped>      inline void MemoryOperation(GPUVAddr gpu_src_addr, std::size_t size, FuncMapped&& func_mapped,                                  FuncReserved&& func_reserved, FuncUnmapped&& func_unmapped) const; -    template <bool is_safe> +    template <bool is_safe, bool use_fastmem>      void ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size,                         VideoCommon::CacheType which) const; @@ -154,6 +161,12 @@ private:      inline bool IsBigPageContinous(size_t big_page_index) const;      inline void SetBigPageContinous(size_t big_page_index, bool value); +    template <bool is_gpu_address> +    void GetSubmappedRangeImpl( +        GPUVAddr gpu_addr, std::size_t size, +        std::vector<std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>>& +            result) const; +      Core::System& system;      Core::Memory::Memory& memory;      Core::DeviceMemory& device_memory; @@ -201,10 +214,13 @@ private:      Common::VirtualBuffer<u32> big_page_table_cpu;      std::vector<u64> big_page_continous; +    std::vector<std::pair<VAddr, std::size_t>> page_stash{}; +    u8* fastmem_arena{};      constexpr static size_t continous_bits = 64;      const size_t unique_identifier; +    std::unique_ptr<VideoCommon::InvalidationAccumulator> accumulator;      static std::atomic<size_t> unique_identifier_generator;  }; diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index f44c7df50..1735b6164 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -6,6 +6,7 @@  #include <functional>  #include <optional>  #include <span> +#include <utility>  #include "common/common_types.h"  #include "common/polyfill_thread.h"  #include "video_core/cache_types.h" @@ -95,6 +96,12 @@ public:      virtual void InvalidateRegion(VAddr addr, u64 size,                                    VideoCommon::CacheType which = VideoCommon::CacheType::All) = 0; +    virtual void InnerInvalidation(std::span<const std::pair<VAddr, std::size_t>> sequences) { +        for (const auto& [cpu_addr, size] : sequences) { +            InvalidateRegion(cpu_addr, size); +        } +    } +      /// Notify rasterizer that any caches of the specified region are desync with guest      virtual void OnCPUWrite(VAddr addr, u64 size) = 0; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 242bf9602..ed4a72166 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -186,6 +186,7 @@ void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) {      SCOPE_EXIT({ gpu.TickWork(); });      FlushWork(); +    gpu_memory->FlushCaching();      query_cache.UpdateCounters(); @@ -393,6 +394,7 @@ void RasterizerVulkan::Clear(u32 layer_count) {  void RasterizerVulkan::DispatchCompute() {      FlushWork(); +    gpu_memory->FlushCaching();      ComputePipeline* const pipeline{pipeline_cache.CurrentComputePipeline()};      if (!pipeline) { @@ -481,6 +483,27 @@ void RasterizerVulkan::InvalidateRegion(VAddr addr, u64 size, VideoCommon::Cache      }  } +void RasterizerVulkan::InnerInvalidation(std::span<const std::pair<VAddr, std::size_t>> sequences) { +    { +        std::scoped_lock lock{texture_cache.mutex}; +        for (const auto& [addr, size] : sequences) { +            texture_cache.WriteMemory(addr, size); +        } +    } +    { +        std::scoped_lock lock{buffer_cache.mutex}; +        for (const auto& [addr, size] : sequences) { +            buffer_cache.WriteMemory(addr, size); +        } +    } +    { +        for (const auto& [addr, size] : sequences) { +            query_cache.InvalidateRegion(addr, size); +            pipeline_cache.InvalidateRegion(addr, size); +        } +    } +} +  void RasterizerVulkan::OnCPUWrite(VAddr addr, u64 size) {      if (addr == 0 || size == 0) {          return; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index c661e5b19..472cc64d9 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -79,6 +79,7 @@ public:                           VideoCommon::CacheType which = VideoCommon::CacheType::All) override;      void InvalidateRegion(VAddr addr, u64 size,                            VideoCommon::CacheType which = VideoCommon::CacheType::All) override; +    void InnerInvalidation(std::span<const std::pair<VAddr, std::size_t>> sequences) override;      void OnCPUWrite(VAddr addr, u64 size) override;      void InvalidateGPUCache() override;      void UnmapMemory(VAddr addr, u64 size) override; | 
