Diffstat (limited to 'src/video_core')
-rw-r--r--   src/video_core/buffer_cache/buffer_cache.h       | 229
-rw-r--r--   src/video_core/buffer_cache/buffer_cache_base.h  |  65
2 files changed, 154 insertions(+), 140 deletions(-)
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index a0701ce4e..43fe5b080 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -11,6 +11,8 @@
 
 namespace VideoCommon {
 
+using Core::Memory::YUZU_PAGESIZE;
+
 template <class P>
 BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_,
                             Core::Memory::Memory& cpu_memory_, Runtime& runtime_)
@@ -87,9 +89,11 @@ void BufferCache<P>::TickFrame() {
 template <class P>
 void BufferCache<P>::WriteMemory(VAddr cpu_addr, u64 size) {
     memory_tracker.MarkRegionAsCpuModified(cpu_addr, size);
-    const IntervalType subtract_interval{cpu_addr, cpu_addr + size};
-    ClearDownload(subtract_interval);
-    common_ranges.subtract(subtract_interval);
+    if (memory_tracker.IsRegionGpuModified(cpu_addr, size)) {
+        const IntervalType subtract_interval{cpu_addr, cpu_addr + size};
+        ClearDownload(subtract_interval);
+        common_ranges.subtract(subtract_interval);
+    }
 }
 
 template <class P>
@@ -102,17 +106,33 @@ void BufferCache<P>::CachedWriteMemory(VAddr cpu_addr, u64 size) {
 
 template <class P>
 void BufferCache<P>::DownloadMemory(VAddr cpu_addr, u64 size) {
+    WaitOnAsyncFlushes(cpu_addr, size);
     ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) {
         DownloadBufferMemory(buffer, cpu_addr, size);
     });
 }
 
 template <class P>
+void BufferCache<P>::WaitOnAsyncFlushes(VAddr cpu_addr, u64 size) {
+    bool must_wait = false;
+    ForEachInOverlapCounter(async_downloads, cpu_addr, size,
+                            [&](VAddr, VAddr, int) { must_wait = true; });
+    bool must_release = false;
+    ForEachInRangeSet(pending_ranges, cpu_addr, size, [&](VAddr, VAddr) { must_release = true; });
+    if (must_release) {
+        std::function<void()> tmp([]() {});
+        rasterizer.SignalFence(std::move(tmp));
+    }
+    if (must_wait || must_release) {
+        rasterizer.ReleaseFences();
+    }
+}
+
+template <class P>
 void BufferCache<P>::ClearDownload(IntervalType subtract_interval) {
+    async_downloads -= std::make_pair(subtract_interval, std::numeric_limits<int>::max());
     uncommitted_ranges.subtract(subtract_interval);
-    for (auto& interval_set : async_downloads) {
-        interval_set.subtract(subtract_interval);
-    }
+    pending_ranges.subtract(subtract_interval);
     for (auto& interval_set : committed_ranges) {
         interval_set.subtract(subtract_interval);
     }
@@ -132,6 +152,7 @@ bool BufferCache<P>::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am
     }
 
     const IntervalType subtract_interval{*cpu_dest_address, *cpu_dest_address + amount};
+    WaitOnAsyncFlushes(*cpu_src_address, static_cast<u32>(amount));
     ClearDownload(subtract_interval);
 
     BufferId buffer_a;
@@ -162,6 +183,7 @@ bool BufferCache<P>::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am
         tmp_intervals.push_back(add_interval);
         if (is_high_accuracy) {
             uncommitted_ranges.add(add_interval);
+            pending_ranges.add(add_interval);
         }
     };
     ForEachInRangeSet(common_ranges, *cpu_src_address, amount, mirror);
@@ -413,18 +435,15 @@ template <class P>
 void BufferCache<P>::FlushCachedWrites() {
     cached_write_buffer_ids.clear();
     memory_tracker.FlushCachedWrites();
-    /*for (auto& interval : cached_ranges) {
-        VAddr cpu_addr = interval.lower();
-        const std::size_t size = interval.upper() - interval.lower();
-        memory_tracker.FlushCachedWrites(cpu_addr, size);
-        // common_ranges.subtract(interval);
-    }*/
+    for (auto& interval : cached_ranges) {
+        ClearDownload(interval);
+    }
     cached_ranges.clear();
 }
 
 template <class P>
 bool BufferCache<P>::HasUncommittedFlushes() const noexcept {
-    return !uncommitted_ranges.empty() || !committed_ranges.empty() || !pending_queries.empty();
+    return !uncommitted_ranges.empty() || !committed_ranges.empty();
 }
 
 template <class P>
@@ -437,8 +456,11 @@ void BufferCache<P>::AccumulateFlushes() {
 
 template <class P>
 bool BufferCache<P>::ShouldWaitAsyncFlushes() const noexcept {
-    return (!async_buffers.empty() && async_buffers.front().has_value()) ||
-           (!query_async_buffers.empty() && query_async_buffers.front().has_value());
+    if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
+        return (!async_buffers.empty() && async_buffers.front().has_value());
+    } else {
+        return false;
+    }
 }
 
 template <class P>
@@ -446,11 +468,14 @@ void BufferCache<P>::CommitAsyncFlushesHigh() {
     AccumulateFlushes();
 
     if (committed_ranges.empty()) {
-        async_buffers.emplace_back(std::optional<Async_Buffer>{});
+        if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
+            async_buffers.emplace_back(std::optional<Async_Buffer>{});
+        }
         return;
     }
     MICROPROFILE_SCOPE(GPU_DownloadMemory);
 
+    pending_ranges.clear();
     auto it = committed_ranges.begin();
     while (it != committed_ranges.end()) {
         auto& current_intervals = *it;
@@ -491,7 +516,7 @@ void BufferCache<P>::CommitAsyncFlushesHigh() {
                                 buffer_id,
                             });
                             // Align up to avoid cache conflicts
-                            constexpr u64 align = 8ULL;
+                            constexpr u64 align = 64ULL;
                             constexpr u64 mask = ~(align - 1ULL);
                             total_size_bytes += (new_size + align - 1) & mask;
                             largest_copy = std::max(largest_copy, new_size);
@@ -504,7 +529,9 @@ void BufferCache<P>::CommitAsyncFlushesHigh() {
     }
     committed_ranges.clear();
     if (downloads.empty()) {
-        async_buffers.emplace_back(std::optional<Async_Buffer>{});
+        if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
+            async_buffers.emplace_back(std::optional<Async_Buffer>{});
+        }
         return;
     }
     if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
@@ -520,99 +547,54 @@ void BufferCache<P>::CommitAsyncFlushesHigh() {
             second_copy.src_offset = static_cast<size_t>(buffer.CpuAddr()) + copy.src_offset;
             VAddr orig_cpu_addr = static_cast<VAddr>(second_copy.src_offset);
             const IntervalType base_interval{orig_cpu_addr, orig_cpu_addr + copy.size};
-            new_async_range.add(base_interval);
+            async_downloads += std::make_pair(base_interval, 1);
             runtime.CopyBuffer(download_staging.buffer, buffer, copies, false);
             normalized_copies.push_back(second_copy);
         }
-        async_downloads.emplace_back(std::move(new_async_range));
+        runtime.PostCopyBarrier();
         pending_downloads.emplace_back(std::move(normalized_copies));
         async_buffers.emplace_back(download_staging);
     } else {
-        const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy);
-        for (const auto& [copy, buffer_id] : downloads) {
-            Buffer& buffer = slot_buffers[buffer_id];
-            buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size));
-            const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset;
-            cpu_memory.WriteBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size);
-        }
-    }
-}
-
-template <class P>
-void BufferCache<P>::CommitAsyncQueries() {
-    if (pending_queries.empty()) {
-        query_async_buffers.emplace_back(std::optional<Async_Buffer>{});
-        return;
-    }
-
-    MICROPROFILE_SCOPE(GPU_DownloadMemory);
-    boost::container::small_vector<std::pair<BufferCopy, BufferId>, 8> downloads;
-    u64 total_size_bytes = 0;
-    u64 largest_copy = 0;
-    do {
-        has_deleted_buffers = false;
-        downloads.clear();
-        total_size_bytes = 0;
-        largest_copy = 0;
-        for (const auto& query_info : pending_queries) {
-            const std::size_t size = query_info.second;
-            const VAddr cpu_addr = query_info.first;
-            const BufferId buffer_id = FindBuffer(cpu_addr, static_cast<u32>(size));
-            Buffer& buffer = slot_buffers[buffer_id];
-            if (has_deleted_buffers) {
-                break;
+        if constexpr (USE_MEMORY_MAPS) {
+            auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes);
+            runtime.PreCopyBarrier();
+            for (auto& [copy, buffer_id] : downloads) {
+                // Have in mind the staging buffer offset for the copy
+                copy.dst_offset += download_staging.offset;
+                const std::array copies{copy};
+                runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies, false);
+            }
+            runtime.PostCopyBarrier();
+            runtime.Finish();
+            for (const auto& [copy, buffer_id] : downloads) {
+                const Buffer& buffer = slot_buffers[buffer_id];
+                const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset;
+                // Undo the modified offset
+                const u64 dst_offset = copy.dst_offset - download_staging.offset;
+                const u8* read_mapped_memory = download_staging.mapped_span.data() + dst_offset;
+                cpu_memory.WriteBlockUnsafe(cpu_addr, read_mapped_memory, copy.size);
+            }
+        } else {
+            const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy);
+            for (const auto& [copy, buffer_id] : downloads) {
+                Buffer& buffer = slot_buffers[buffer_id];
+                buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size));
+                const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset;
+                cpu_memory.WriteBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size);
             }
-            downloads.push_back({
-                BufferCopy{
-                    .src_offset = buffer.Offset(cpu_addr),
-                    .dst_offset = total_size_bytes,
-                    .size = size,
-                },
-                buffer_id,
-            });
-            constexpr u64 align = 8ULL;
-            constexpr u64 mask = ~(align - 1ULL);
-            total_size_bytes += (size + align - 1) & mask;
-            largest_copy = std::max(largest_copy, size);
-        }
-    } while (has_deleted_buffers);
-    pending_queries.clear();
-    if (downloads.empty()) {
-        query_async_buffers.push_back(std::optional<Async_Buffer>{});
-        return;
-    }
-    if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
-        auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes, true);
-        boost::container::small_vector<BufferCopy, 8> normalized_copies;
-        runtime.PreCopyBarrier();
-        for (auto& [copy, buffer_id] : downloads) {
-            // Have in mind the staging buffer offset for the copy
-            copy.dst_offset += download_staging.offset;
-            const std::array copies{copy};
-            const Buffer& buffer = slot_buffers[buffer_id];
-            BufferCopy second_copy{copy};
-            second_copy.src_offset = static_cast<size_t>(buffer.CpuAddr()) + second_copy.src_offset;
-            runtime.CopyBuffer(download_staging.buffer, buffer, copies, false);
-            normalized_copies.push_back(second_copy);
         }
-        committed_queries.emplace_back(std::move(normalized_copies));
-        query_async_buffers.emplace_back(download_staging);
-    } else {
-        query_async_buffers.push_back(std::optional<Async_Buffer>{});
     }
 }
 
 template <class P>
 void BufferCache<P>::CommitAsyncFlushes() {
     CommitAsyncFlushesHigh();
-    CommitAsyncQueries();
 }
 
 template <class P>
 void BufferCache<P>::PopAsyncFlushes() {
     MICROPROFILE_SCOPE(GPU_DownloadMemory);
     PopAsyncBuffers();
-    PopAsyncQueries();
 }
 
 template <class P>
@@ -627,59 +609,34 @@ void BufferCache<P>::PopAsyncBuffers() {
     if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
         auto& downloads = pending_downloads.front();
         auto& async_buffer = async_buffers.front();
-        auto& async_range = async_downloads.front();
         u8* base = async_buffer->mapped_span.data();
         const size_t base_offset = async_buffer->offset;
         for (const auto& copy : downloads) {
             const VAddr cpu_addr = static_cast<VAddr>(copy.src_offset);
             const u64 dst_offset = copy.dst_offset - base_offset;
             const u8* read_mapped_memory = base + dst_offset;
-            ForEachInRangeSet(async_range, cpu_addr, copy.size, [&](VAddr start, VAddr end) {
-                const size_t diff = start - cpu_addr;
-                const size_t new_size = end - start;
-                cpu_memory.WriteBlockUnsafe(start, &read_mapped_memory[diff], new_size);
-                const IntervalType base_interval{start, end};
-                common_ranges.subtract(base_interval);
-            });
+            ForEachInOverlapCounter(
+                async_downloads, cpu_addr, copy.size, [&](VAddr start, VAddr end, int count) {
+                    cpu_memory.WriteBlockUnsafe(start, &read_mapped_memory[start - cpu_addr],
                                                 end - start);
+                    if (count == 1) {
+                        const IntervalType base_interval{start, end};
+                        common_ranges.subtract(base_interval);
+                    }
+                });
+            async_downloads -= std::make_pair(IntervalType(cpu_addr, cpu_addr + copy.size), 1);
         }
         runtime.FreeDeferredStagingBuffer(*async_buffer);
         async_buffers.pop_front();
         pending_downloads.pop_front();
-        async_downloads.pop_front();
-    }
-}
-
-template <class P>
-void BufferCache<P>::PopAsyncQueries() {
-    if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
-        if (query_async_buffers.empty()) {
-            return;
-        }
-        if (!query_async_buffers.front().has_value()) {
-            query_async_buffers.pop_front();
-            return;
-        }
-        auto& downloads = committed_queries.front();
-        auto& async_buffer = query_async_buffers.front();
-        flushed_queries.clear();
-        u8* base = async_buffer->mapped_span.data();
-        const size_t base_offset = async_buffer->offset;
-        for (const auto& copy : downloads) {
-            const size_t dst_offset = copy.dst_offset - base_offset;
-            const u8* read_mapped_memory = base + dst_offset;
-            u64 new_value{};
-            std::memcpy(&new_value, read_mapped_memory, copy.size);
-            flushed_queries.push_back(new_value);
-        }
-        runtime.FreeDeferredStagingBuffer(*async_buffer);
-        committed_queries.pop_front();
-        query_async_buffers.pop_front();
     }
 }
 
 template <class P>
 bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) {
-    return memory_tracker.IsRegionGpuModified(addr, size);
+    bool is_dirty = false;
+    ForEachInRangeSet(common_ranges, addr, size, [&](VAddr, VAddr) { is_dirty = true; });
+    return is_dirty;
 }
 
 template <class P>
@@ -1232,16 +1189,18 @@ void BufferCache<P>::UpdateComputeTextureBuffers() {
 }
 
 template <class P>
-void BufferCache<P>::MarkWrittenBuffer(BufferId, VAddr cpu_addr, u32 size) {
+void BufferCache<P>::MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size) {
     memory_tracker.MarkRegionAsGpuModified(cpu_addr, size);
 
+    if (memory_tracker.IsRegionCpuModified(cpu_addr, size)) {
+        SynchronizeBuffer(slot_buffers[buffer_id], cpu_addr, size);
+    }
+
     const IntervalType base_interval{cpu_addr, cpu_addr + size};
     common_ranges.add(base_interval);
-    for (auto& interval_set : async_downloads) {
-        interval_set.subtract(base_interval);
-    }
     if (Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High) {
         uncommitted_ranges.add(base_interval);
+        pending_ranges.add(base_interval);
     }
 }
 
@@ -1530,7 +1489,9 @@ bool BufferCache<P>::InlineMemory(VAddr dest_address, size_t copy_size,
     if (!is_dirty) {
         return false;
     }
-    if (!IsRegionGpuModified(dest_address, copy_size)) {
+    VAddr aligned_start = Common::AlignDown(dest_address, YUZU_PAGESIZE);
+    VAddr aligned_end = Common::AlignUp(dest_address + copy_size, YUZU_PAGESIZE);
+    if (!IsRegionGpuModified(aligned_start, aligned_end - aligned_start)) {
         return false;
     }
 
diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h
index 4b3677da3..6f29cba25 100644
--- a/src/video_core/buffer_cache/buffer_cache_base.h
+++ b/src/video_core/buffer_cache/buffer_cache_base.h
@@ -17,6 +17,7 @@
 #include <boost/pool/detail/mutex.hpp>
 #undef BOOST_NO_MT
 #include <boost/icl/interval_set.hpp>
+#include <boost/icl/split_interval_map.hpp>
 #include <boost/pool/pool.hpp>
 #include <boost/pool/pool_alloc.hpp>
 
@@ -44,8 +45,7 @@
 
 namespace boost {
 template <typename T>
-class fast_pool_allocator<T, default_user_allocator_new_delete, details::pool::default_mutex, 4096,
-                          0>;
+class fast_pool_allocator<T, default_user_allocator_new_delete, details::pool::null_mutex, 4096, 0>;
 }
 
 namespace VideoCommon {
@@ -123,6 +123,31 @@ class BufferCache : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelI
         boost::icl::interval_set<VAddr, IntervalCompare, IntervalInstance, IntervalAllocator>;
     using IntervalType = typename IntervalSet::interval_type;
 
+    template <typename Type>
+    struct counter_add_functor : public boost::icl::identity_based_inplace_combine<Type> {
+        // types
+        typedef counter_add_functor<Type> type;
+        typedef boost::icl::identity_based_inplace_combine<Type> base_type;
+
+        // public member functions
+        void operator()(Type& current, const Type& added) const {
+            current += added;
+            if (current < base_type::identity_element()) {
+                current = base_type::identity_element();
+            }
+        }
+
+        // public static functions
+        static void version(Type&){};
+    };
+
+    using OverlapCombine = ICL_COMBINE_INSTANCE(counter_add_functor, int);
+    using OverlapSection = ICL_SECTION_INSTANCE(boost::icl::inter_section, int);
+    using OverlapCounter =
+        boost::icl::split_interval_map<VAddr, int, boost::icl::partial_absorber, IntervalCompare,
+                                       OverlapCombine, OverlapSection, IntervalInstance,
+                                       IntervalAllocator>;
+
     struct Empty {};
 
     struct OverlapResult {
@@ -219,12 +244,9 @@ public:
     /// Commit asynchronous downloads
     void CommitAsyncFlushes();
     void CommitAsyncFlushesHigh();
-    void CommitAsyncQueries();
 
     /// Pop asynchronous downloads
     void PopAsyncFlushes();
-
-    void PopAsyncQueries();
     void PopAsyncBuffers();
 
     bool DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount);
@@ -302,6 +324,34 @@ private:
         }
     }
 
+    template <typename Func>
+    void ForEachInOverlapCounter(OverlapCounter& current_range, VAddr cpu_addr, u64 size,
+                                 Func&& func) {
+        const VAddr start_address = cpu_addr;
+        const VAddr end_address = start_address + size;
+        const IntervalType search_interval{start_address, end_address};
+        auto it = current_range.lower_bound(search_interval);
+        if (it == current_range.end()) {
+            return;
+        }
+        auto end_it = current_range.upper_bound(search_interval);
+        for (; it != end_it; it++) {
+            auto& inter = it->first;
+            VAddr inter_addr_end = inter.upper();
+            VAddr inter_addr = inter.lower();
+            if (inter_addr_end > end_address) {
+                inter_addr_end = end_address;
+            }
+            if (inter_addr < start_address) {
+                inter_addr = start_address;
+            }
+            if (it->second <= 0) {
+                __debugbreak();
+            }
+            func(inter_addr, inter_addr_end, it->second);
+        }
+    }
+
     static bool IsRangeGranular(VAddr cpu_addr, size_t size) {
         return (cpu_addr & ~Core::Memory::YUZU_PAGEMASK) ==
                ((cpu_addr + size) & ~Core::Memory::YUZU_PAGEMASK);
@@ -309,6 +359,8 @@ private:
 
     void RunGarbageCollector();
 
+    void WaitOnAsyncFlushes(VAddr cpu_addr, u64 size);
+
     void BindHostIndexBuffer();
 
     void BindHostVertexBuffers();
@@ -474,10 +526,11 @@ private:
     IntervalSet uncommitted_ranges;
     IntervalSet common_ranges;
     IntervalSet cached_ranges;
+    IntervalSet pending_ranges;
     std::deque<IntervalSet> committed_ranges;
 
     // Async Buffers
-    std::deque<IntervalSet> async_downloads;
+    OverlapCounter async_downloads;
     std::deque<std::optional<Async_Buffer>> async_buffers;
     std::deque<boost::container::small_vector<BufferCopy, 4>> pending_downloads;
     std::optional<Async_Buffer> current_buffer;
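Note: the core of this change is replacing the per-commit std::deque<IntervalSet> with a single reference-counted interval map for in-flight downloads. The following is a minimal standalone sketch (not yuzu code; the type aliases and addresses are illustrative) of how boost::icl::split_interval_map behaves as such an overlap counter:

// Minimal sketch of the overlap-counter idea behind `async_downloads`.
// Uses the default combiner; the commit plugs in a custom saturating
// combiner (counter_add_functor) so counts clamp at the identity element,
// which is what lets ClearDownload subtract std::numeric_limits<int>::max()
// to wipe a range without underflowing.
#include <iostream>
#include <boost/icl/split_interval_map.hpp>

using VAddr = unsigned long long;
using OverlapCounter = boost::icl::split_interval_map<VAddr, int>;
using Interval = boost::icl::interval<VAddr>;

int main() {
    OverlapCounter async_downloads;
    // Queue two downloads over partially overlapping ranges.
    async_downloads += std::make_pair(Interval::right_open(0x1000, 0x3000), 1);
    async_downloads += std::make_pair(Interval::right_open(0x2000, 0x4000), 1);
    // The *split* map keeps every distinct sub-range with its overlap count:
    // [0x1000,0x2000)->1, [0x2000,0x3000)->2, [0x3000,0x4000)->1
    for (const auto& [interval, count] : async_downloads) {
        std::cout << std::hex << interval.lower() << "-" << interval.upper()
                  << " -> " << std::dec << count << '\n';
    }
    // Retiring one download decrements only its range; sub-ranges that reach
    // the identity element (0) are absorbed, i.e. dropped from the map.
    async_downloads -= std::make_pair(Interval::right_open(0x1000, 0x3000), 1);
}

This is why PopAsyncBuffers can subtract common_ranges only when a sub-interval's count reaches 1: any higher count means another in-flight download still covers that range.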

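A second sketch under the same assumptions: visiting only the sub-intervals that intersect a query window and clamping them to its edges, mirroring the shape of the new ForEachInOverlapCounter helper (minus its count sanity check). ForEachInWindow is a hypothetical name for illustration:

#include <algorithm>
#include <iostream>
#include <boost/icl/split_interval_map.hpp>

using VAddr = unsigned long long;
using OverlapCounter = boost::icl::split_interval_map<VAddr, int>;
using Interval = boost::icl::interval<VAddr>;

template <typename Func>
void ForEachInWindow(OverlapCounter& ranges, VAddr addr, VAddr size, Func&& func) {
    const VAddr start_address = addr;
    const VAddr end_address = start_address + size;
    const auto search = Interval::right_open(start_address, end_address);
    // lower_bound/upper_bound restrict iteration to entries overlapping the window.
    auto it = ranges.lower_bound(search);
    if (it == ranges.end()) {
        return;
    }
    const auto end_it = ranges.upper_bound(search);
    for (; it != end_it; ++it) {
        // Clamp the stored sub-interval to the query window before visiting.
        const VAddr lower = std::max(it->first.lower(), start_address);
        const VAddr upper = std::min(it->first.upper(), end_address);
        func(lower, upper, it->second);
    }
}

int main() {
    OverlapCounter counter;
    counter += std::make_pair(Interval::right_open(0x0000, 0x5000), 1);
    counter += std::make_pair(Interval::right_open(0x2000, 0x3000), 1);
    ForEachInWindow(counter, 0x1000, 0x3000, [](VAddr lo, VAddr hi, int count) {
        std::cout << std::hex << lo << "-" << hi << " overlaps=" << count << '\n';
    });
}

The same windowed walk backs both WaitOnAsyncFlushes (any hit means a pending download must be fenced) and the write-back loop in PopAsyncBuffers.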