 src/video_core/buffer_cache/buffer_base.h    |  11
 src/video_core/buffer_cache/buffer_cache.h   | 138
 src/video_core/texture_cache/image_base.cpp  |  17
 src/video_core/texture_cache/image_base.h    |   2
 src/video_core/texture_cache/slot_vector.h   |  70
 src/video_core/texture_cache/texture_cache.h |  44
 6 files changed, 226 insertions(+), 56 deletions(-)
diff --git a/src/video_core/buffer_cache/buffer_base.h b/src/video_core/buffer_cache/buffer_base.h
index a39505903..b121d36a3 100644
--- a/src/video_core/buffer_cache/buffer_base.h
+++ b/src/video_core/buffer_cache/buffer_base.h
@@ -256,6 +256,16 @@ public:
         stream_score += score;
     }
 
+    /// Sets the new frame tick
+    void SetFrameTick(u64 new_frame_tick) noexcept {
+        frame_tick = new_frame_tick;
+    }
+
+    /// Returns the new frame tick
+    [[nodiscard]] u64 FrameTick() const noexcept {
+        return frame_tick;
+    }
+
     /// Returns the likeliness of this being a stream buffer
     [[nodiscard]] int StreamScore() const noexcept {
         return stream_score;
@@ -586,6 +596,7 @@ private:
     RasterizerInterface* rasterizer = nullptr;
     VAddr cpu_addr = 0;
     Words words;
+    u64 frame_tick = 0;
     BufferFlagBits flags{};
     int stream_score = 0;
 };
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index d371b842f..ecb7d3dee 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -243,6 +243,8 @@ private:
     template <bool insert>
     void ChangeRegister(BufferId buffer_id);
 
+    void TouchBuffer(Buffer& buffer) const noexcept;
+
     bool SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size);
 
     bool SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size);
@@ -255,6 +257,10 @@ private:
 
     void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, std::span<BufferCopy> copies);
 
+    void DownloadBufferMemory(Buffer& buffer_id);
+
+    void DownloadBufferMemory(Buffer& buffer_id, VAddr cpu_addr, u64 size);
+
     void DeleteBuffer(BufferId buffer_id);
 
     void ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id);
@@ -319,6 +325,9 @@ private:
     size_t immediate_buffer_capacity = 0;
     std::unique_ptr<u8[]> immediate_buffer_alloc;
 
+    typename SlotVector<Buffer>::Iterator deletion_iterator;
+    u64 frame_tick = 0;
+
     std::array<BufferId, ((1ULL << 39) >> PAGE_BITS)> page_table;
 };
 
@@ -332,6 +341,7 @@ BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_,
       gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_}, runtime{runtime_} {
     // Ensure the first slot is used for the null buffer
     void(slot_buffers.insert(runtime, NullBufferParams{}));
+    deletion_iterator = slot_buffers.end();
 }
 
 template <class P>
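Taken together, the new `SetFrameTick`/`FrameTick` accessors and the cache-side `frame_tick` counter implement LRU by frame number rather than by wall time: the stamp is a monotonically increasing `u64`, so staleness is a single comparison with no clock reads and no realistic wraparound. A minimal sketch of the idea; `IsStale` is an illustrative helper, not a name from this commit:

```cpp
#include <cstdint>

// Staleness test as performed by the cache's per-frame sweep: a resource is
// evictable once it has gone ticks_to_destroy frames without being touched.
constexpr bool IsStale(std::uint64_t last_use_tick, std::uint64_t current_tick,
                       std::uint64_t ticks_to_destroy = 120) noexcept {
    return last_use_tick + ticks_to_destroy < current_tick;
}

static_assert(IsStale(0, 121));    // untouched for 121 frames: evictable
static_assert(!IsStale(100, 121)); // used 21 frames ago: keep
```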
@@ -349,7 +359,24 @@ void BufferCache<P>::TickFrame() {
 
     const bool skip_preferred = hits * 256 < shots * 251;
     uniform_buffer_skip_cache_size = skip_preferred ? DEFAULT_SKIP_CACHE_SIZE : 0;
+    static constexpr u64 ticks_to_destroy = 120;
+    int num_iterations = 32;
+    for (; num_iterations > 0; --num_iterations) {
+        if (deletion_iterator == slot_buffers.end()) {
+            deletion_iterator = slot_buffers.begin();
+        }
+        ++deletion_iterator;
+        if (deletion_iterator == slot_buffers.end()) {
+            break;
+        }
+        const auto [buffer_id, buffer] = *deletion_iterator;
+        if (buffer->FrameTick() + ticks_to_destroy < frame_tick) {
+            DownloadBufferMemory(*buffer);
+            DeleteBuffer(buffer_id);
+        }
+    }
     delayed_destruction_ring.Tick();
+    ++frame_tick;
 }
 
 template <class P>
@@ -371,50 +398,8 @@ void BufferCache<P>::CachedWriteMemory(VAddr cpu_addr, u64 size) {
 
 template <class P>
 void BufferCache<P>::DownloadMemory(VAddr cpu_addr, u64 size) {
-    ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) {
-        boost::container::small_vector<BufferCopy, 1> copies;
-        u64 total_size_bytes = 0;
-        u64 largest_copy = 0;
-        buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) {
-            copies.push_back(BufferCopy{
-                .src_offset = range_offset,
-                .dst_offset = total_size_bytes,
-                .size = range_size,
-            });
-            total_size_bytes += range_size;
-            largest_copy = std::max(largest_copy, range_size);
-        });
-        if (total_size_bytes == 0) {
-            return;
-        }
-        MICROPROFILE_SCOPE(GPU_DownloadMemory);
-
-        if constexpr (USE_MEMORY_MAPS) {
-            auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes);
-            const u8* const mapped_memory = download_staging.mapped_span.data();
-            const std::span<BufferCopy> copies_span(copies.data(), copies.data() + copies.size());
-            for (BufferCopy& copy : copies) {
-                // Modify copies to have the staging offset in mind
-                copy.dst_offset += download_staging.offset;
-            }
-            runtime.CopyBuffer(download_staging.buffer, buffer, copies_span);
-            runtime.Finish();
-            for (const BufferCopy& copy : copies) {
-                const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset;
-                // Undo the modified offset
-                const u64 dst_offset = copy.dst_offset - download_staging.offset;
-                const u8* copy_mapped_memory = mapped_memory + dst_offset;
-                cpu_memory.WriteBlockUnsafe(copy_cpu_addr, copy_mapped_memory, copy.size);
-            }
-        } else {
-            const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy);
-            for (const BufferCopy& copy : copies) {
-                buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size));
-                const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset;
-                cpu_memory.WriteBlockUnsafe(copy_cpu_addr, immediate_buffer.data(), copy.size);
-            }
-        }
-    });
+    ForEachBufferInRange(cpu_addr, size,
+                         [&](BufferId, Buffer& buffer) { DownloadBufferMemory(buffer); });
 }
 
 template <class P>
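The sweep in `TickFrame` is deliberately bounded: at most 32 slots are visited per frame, and `deletion_iterator` persists across frames so the scan resumes where it left off instead of restarting. A rough standalone model of that budgeted, wrapping scan; the names are illustrative, and the visitor here must not invalidate the cursor:

```cpp
#include <cstddef>
#include <list>

template <class T>
class SweepCursor {
public:
    explicit SweepCursor(std::list<T>& items) : items_{items}, cursor_{items.end()} {}

    // Visit at most `budget` elements per call, wrapping around the container,
    // so no single frame pays for a full sweep of a large cache.
    template <class Visitor>
    void Step(std::size_t budget, Visitor&& visit) {
        for (; budget > 0; --budget) {
            if (cursor_ == items_.end()) {
                cursor_ = items_.begin();
                if (cursor_ == items_.end()) {
                    return; // nothing to sweep
                }
            }
            visit(*cursor_);
            ++cursor_;
        }
    }

private:
    std::list<T>& items_;
    typename std::list<T>::iterator cursor_;
};
```

Amortized over frames, every slot still gets inspected regularly, but the per-frame cost stays constant regardless of how many buffers the cache holds.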
@@ -640,6 +625,7 @@ bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) {
 template <class P>
 void BufferCache<P>::BindHostIndexBuffer() {
     Buffer& buffer = slot_buffers[index_buffer.buffer_id];
+    TouchBuffer(buffer);
     const u32 offset = buffer.Offset(index_buffer.cpu_addr);
     const u32 size = index_buffer.size;
     SynchronizeBuffer(buffer, index_buffer.cpu_addr, size);
@@ -658,6 +644,7 @@ void BufferCache<P>::BindHostVertexBuffers() {
     for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) {
         const Binding& binding = vertex_buffers[index];
         Buffer& buffer = slot_buffers[binding.buffer_id];
+        TouchBuffer(buffer);
         SynchronizeBuffer(buffer, binding.cpu_addr, binding.size);
         if (!flags[Dirty::VertexBuffer0 + index]) {
             continue;
@@ -693,6 +680,7 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32
     const VAddr cpu_addr = binding.cpu_addr;
     const u32 size = binding.size;
     Buffer& buffer = slot_buffers[binding.buffer_id];
+    TouchBuffer(buffer);
     const bool use_fast_buffer = binding.buffer_id != NULL_BUFFER_ID &&
                                  size <= uniform_buffer_skip_cache_size &&
                                  !buffer.IsRegionGpuModified(cpu_addr, size);
@@ -744,6 +732,7 @@ void BufferCache<P>::BindHostGraphicsStorageBuffers(size_t stage) {
     ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) {
         const Binding& binding = storage_buffers[stage][index];
         Buffer& buffer = slot_buffers[binding.buffer_id];
+        TouchBuffer(buffer);
         const u32 size = binding.size;
         SynchronizeBuffer(buffer, binding.cpu_addr, size);
 
@@ -766,6 +755,7 @@ void BufferCache<P>::BindHostTransformFeedbackBuffers() {
     for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) {
         const Binding& binding = transform_feedback_buffers[index];
         Buffer& buffer = slot_buffers[binding.buffer_id];
+        TouchBuffer(buffer);
         const u32 size = binding.size;
         SynchronizeBuffer(buffer, binding.cpu_addr, size);
 
@@ -784,6 +774,7 @@ void BufferCache<P>::BindHostComputeUniformBuffers() {
     ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) {
         const Binding& binding = compute_uniform_buffers[index];
         Buffer& buffer = slot_buffers[binding.buffer_id];
+        TouchBuffer(buffer);
         const u32 size = binding.size;
         SynchronizeBuffer(buffer, binding.cpu_addr, size);
 
@@ -803,6 +794,7 @@ void BufferCache<P>::BindHostComputeStorageBuffers() {
     ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) {
         const Binding& binding = compute_storage_buffers[index];
         Buffer& buffer = slot_buffers[binding.buffer_id];
+        TouchBuffer(buffer);
         const u32 size = binding.size;
         SynchronizeBuffer(buffer, binding.cpu_addr, size);
 
@@ -1101,6 +1093,7 @@ BufferId BufferCache<P>::CreateBuffer(VAddr cpu_addr, u32 wanted_size) {
     const OverlapResult overlap = ResolveOverlaps(cpu_addr, wanted_size);
     const u32 size = static_cast<u32>(overlap.end - overlap.begin);
     const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, overlap.begin, size);
+    TouchBuffer(slot_buffers[new_buffer_id]);
    for (const BufferId overlap_id : overlap.ids) {
         JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap);
     }
@@ -1136,6 +1129,11 @@
 }
 
 template <class P>
+void BufferCache<P>::TouchBuffer(Buffer& buffer) const noexcept {
+    buffer.SetFrameTick(frame_tick);
+}
+
+template <class P>
 bool BufferCache<P>::SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size) {
     if (buffer.CpuAddr() == 0) {
         return true;
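Every host bind path now calls `TouchBuffer` before synchronizing, including `CreateBuffer` itself, so a buffer that is referenced each frame keeps a fresh stamp and can never age past `ticks_to_destroy`; only buffers that stop being bound drift toward eviction. A compilable toy showing that property — all names here are mine, not the cache's:

```cpp
#include <cstdint>

struct Buffer {
    std::uint64_t frame_tick = 0;
};

struct Cache {
    std::uint64_t frame_tick = 0;

    void Touch(Buffer& buffer) const noexcept { buffer.frame_tick = frame_tick; }

    bool IsExpired(const Buffer& buffer) const noexcept {
        return buffer.frame_tick + 120 < frame_tick; // ticks_to_destroy = 120
    }
};

int main() {
    Cache cache;
    Buffer bound; // touched on every simulated bind
    Buffer idle;  // created once, then never used again
    for (int frame = 0; frame < 200; ++frame) {
        cache.Touch(bound);
        ++cache.frame_tick;
    }
    return cache.IsExpired(idle) && !cache.IsExpired(bound) ? 0 : 1; // exits 0
}
```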
@@ -1212,6 +1210,57 @@ void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes,
 }
 
 template <class P>
+void BufferCache<P>::DownloadBufferMemory(Buffer& buffer) {
+    DownloadBufferMemory(buffer, buffer.CpuAddr(), buffer.SizeBytes());
+}
+
+template <class P>
+void BufferCache<P>::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 size) {
+    boost::container::small_vector<BufferCopy, 1> copies;
+    u64 total_size_bytes = 0;
+    u64 largest_copy = 0;
+    buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) {
+        copies.push_back(BufferCopy{
+            .src_offset = range_offset,
+            .dst_offset = total_size_bytes,
+            .size = range_size,
+        });
+        total_size_bytes += range_size;
+        largest_copy = std::max(largest_copy, range_size);
+    });
+    if (total_size_bytes == 0) {
+        return;
+    }
+    MICROPROFILE_SCOPE(GPU_DownloadMemory);
+
+    if constexpr (USE_MEMORY_MAPS) {
+        auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes);
+        const u8* const mapped_memory = download_staging.mapped_span.data();
+        const std::span<BufferCopy> copies_span(copies.data(), copies.data() + copies.size());
+        for (BufferCopy& copy : copies) {
+            // Modify copies to have the staging offset in mind
+            copy.dst_offset += download_staging.offset;
+        }
+        runtime.CopyBuffer(download_staging.buffer, buffer, copies_span);
+        runtime.Finish();
+        for (const BufferCopy& copy : copies) {
+            const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset;
+            // Undo the modified offset
+            const u64 dst_offset = copy.dst_offset - download_staging.offset;
+            const u8* copy_mapped_memory = mapped_memory + dst_offset;
+            cpu_memory.WriteBlockUnsafe(copy_cpu_addr, copy_mapped_memory, copy.size);
+        }
+    } else {
+        const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy);
+        for (const BufferCopy& copy : copies) {
+            buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size));
+            const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset;
+            cpu_memory.WriteBlockUnsafe(copy_cpu_addr, immediate_buffer.data(), copy.size);
+        }
+    }
+}
+
+template <class P>
 void BufferCache<P>::DeleteBuffer(BufferId buffer_id) {
     const auto scalar_replace = [buffer_id](Binding& binding) {
         if (binding.buffer_id == buffer_id) {
@@ -1236,6 +1285,7 @@ void BufferCache<P>::DeleteBuffer(BufferId buffer_id) {
 
     Unregister(buffer_id);
     delayed_destruction_ring.Push(std::move(slot_buffers[buffer_id]));
+    slot_buffers.erase(buffer_id);
 
     NotifyBufferDeletion();
 }
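`DownloadBufferMemory` is the old `DownloadMemory` body factored out so that `TickFrame` can flush a dying buffer directly, without going through an address range. Its core is a gather: dirty ranges are packed back to back into one staging allocation, with `dst_offset` doubling as each range's position inside that allocation. A distilled sketch of just the packing step; `Copy` and `PackCopies` are illustrative names:

```cpp
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

struct Copy {
    std::uint64_t src_offset; // offset inside the source buffer
    std::uint64_t dst_offset; // offset inside the packed staging allocation
    std::uint64_t size;
};

// Pack (offset, size) dirty ranges contiguously. total_bytes sizes the staging
// buffer; largest sizes the bounce buffer used by the non-mapped path.
std::vector<Copy> PackCopies(const std::vector<std::pair<std::uint64_t, std::uint64_t>>& ranges,
                             std::uint64_t& total_bytes, std::uint64_t& largest) {
    std::vector<Copy> copies;
    total_bytes = 0;
    largest = 0;
    for (const auto& [offset, size] : ranges) {
        copies.push_back({.src_offset = offset, .dst_offset = total_bytes, .size = size});
        total_bytes += size;
        largest = std::max(largest, size);
    }
    return copies;
}
```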
diff --git a/src/video_core/texture_cache/image_base.cpp b/src/video_core/texture_cache/image_base.cpp
index 9914926b3..bd0e7e64e 100644
--- a/src/video_core/texture_cache/image_base.cpp
+++ b/src/video_core/texture_cache/image_base.cpp
@@ -113,6 +113,23 @@ void ImageBase::InsertView(const ImageViewInfo& view_info, ImageViewId image_vie
     image_view_ids.push_back(image_view_id);
 }
 
+bool ImageBase::IsSafeDownload() const noexcept {
+    // Skip images that were not modified from the GPU
+    if (False(flags & ImageFlagBits::GpuModified)) {
+        return false;
+    }
+    // Skip images that .are. modified from the CPU
+    // We don't want to write sensitive data from the guest
+    if (True(flags & ImageFlagBits::CpuModified)) {
+        return false;
+    }
+    if (info.num_samples > 1) {
+        LOG_WARNING(HW_GPU, "MSAA image downloads are not implemented");
+        return false;
+    }
+    return true;
+}
+
 void AddImageAlias(ImageBase& lhs, ImageBase& rhs, ImageId lhs_id, ImageId rhs_id) {
     static constexpr auto OPTIONS = RelaxedOptions::Size | RelaxedOptions::Format;
     ASSERT(lhs.info.type == rhs.info.type);
diff --git a/src/video_core/texture_cache/image_base.h b/src/video_core/texture_cache/image_base.h
index b7f3b7e43..0f69d8a32 100644
--- a/src/video_core/texture_cache/image_base.h
+++ b/src/video_core/texture_cache/image_base.h
@@ -44,6 +44,8 @@ struct ImageBase {
 
     void InsertView(const ImageViewInfo& view_info, ImageViewId image_view_id);
 
+    [[nodiscard]] bool IsSafeDownload() const noexcept;
+
    [[nodiscard]] bool Overlaps(VAddr overlap_cpu_addr, size_t overlap_size) const noexcept {
         const VAddr overlap_end = overlap_cpu_addr + overlap_size;
         return cpu_addr < overlap_end && overlap_cpu_addr < cpu_addr_end;
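`IsSafeDownload` consolidates three checks that previously lived inline in the texture cache's `DownloadMemory`, so the new eviction path in `TickFrame` can reuse them. Restated with plain bools to make the logic explicit; the `ImageState` field names are illustrative:

```cpp
#include <cstdint>

struct ImageState {
    bool gpu_modified;         // GPU wrote data the guest copy lacks
    bool cpu_modified;         // guest memory is newer than the GPU copy
    std::uint32_t num_samples; // MSAA downloads are unimplemented
};

constexpr bool IsSafeDownload(const ImageState& s) noexcept {
    return s.gpu_modified && !s.cpu_modified && s.num_samples <= 1;
}

static_assert(IsSafeDownload({true, false, 1}));
static_assert(!IsSafeDownload({false, false, 1})); // nothing to write back
static_assert(!IsSafeDownload({true, true, 1}));   // guest memory is newer
static_assert(!IsSafeDownload({true, false, 4}));  // MSAA: skip with a warning
```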
diff --git a/src/video_core/texture_cache/slot_vector.h b/src/video_core/texture_cache/slot_vector.h
index eae3be6ea..1259e8263 100644
--- a/src/video_core/texture_cache/slot_vector.h
+++ b/src/video_core/texture_cache/slot_vector.h
@@ -5,6 +5,7 @@
 #pragma once
 
 #include <array>
+#include <bit>
 #include <concepts>
 #include <numeric>
 #include <type_traits>
@@ -32,6 +33,60 @@ template <class T>
 requires std::is_nothrow_move_assignable_v<T>&&
     std::is_nothrow_move_constructible_v<T> class SlotVector {
 public:
+    class Iterator {
+        friend SlotVector<T>;
+
+    public:
+        constexpr Iterator() = default;
+
+        Iterator& operator++() noexcept {
+            const u64* const bitset = slot_vector->stored_bitset.data();
+            const u32 size = static_cast<u32>(slot_vector->stored_bitset.size()) * 64;
+            if (id.index < size) {
+                do {
+                    ++id.index;
+                } while (id.index < size && !IsValid(bitset));
+                if (id.index == size) {
+                    id.index = SlotId::INVALID_INDEX;
+                }
+            }
+            return *this;
+        }
+
+        Iterator operator++(int) noexcept {
+            const Iterator copy{*this};
+            ++*this;
+            return copy;
+        }
+
+        bool operator==(const Iterator& other) const noexcept {
+            return id.index == other.id.index;
+        }
+
+        bool operator!=(const Iterator& other) const noexcept {
+            return id.index != other.id.index;
+        }
+
+        std::pair<SlotId, T*> operator*() const noexcept {
+            return {id, std::addressof((*slot_vector)[id])};
+        }
+
+        T* operator->() const noexcept {
+            return std::addressof((*slot_vector)[id]);
+        }
+
+    private:
+        Iterator(SlotVector<T>* slot_vector_, SlotId id_) noexcept
+            : slot_vector{slot_vector_}, id{id_} {}
+
+        bool IsValid(const u64* bitset) noexcept {
+            return ((bitset[id.index / 64] >> (id.index % 64)) & 1) != 0;
+        }
+
+        SlotVector<T>* slot_vector;
+        SlotId id;
+    };
+
     ~SlotVector() noexcept {
         size_t index = 0;
         for (u64 bits : stored_bitset) {
@@ -70,6 +125,20 @@ public:
         ResetStorageBit(id.index);
     }
 
+    [[nodiscard]] Iterator begin() noexcept {
+        const auto it = std::ranges::find_if(stored_bitset, [](u64 value) { return value != 0; });
+        if (it == stored_bitset.end()) {
+            return end();
+        }
+        const u32 word_index = static_cast<u32>(std::distance(stored_bitset.begin(), it));
+        const SlotId first_id{word_index * 64 + static_cast<u32>(std::countr_zero(*it))};
+        return Iterator(this, first_id);
+    }
+
+    [[nodiscard]] Iterator end() noexcept {
+        return Iterator(this, SlotId{SlotId::INVALID_INDEX});
+    }
+
 private:
     struct NonTrivialDummy {
         NonTrivialDummy() noexcept {}
@@ -140,7 +209,6 @@ private:
 
     Entry* values = nullptr;
     size_t values_capacity = 0;
-    size_t values_size = 0;
 
     std::vector<u64> stored_bitset;
     std::vector<u32> free_list;
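Note that the arguments to `std::distance` in `begin()` have been put in `(first, last)` order here; reversed, the distance would be non-positive and would wrap when cast to `u32` whenever the first bitset word is empty. `SlotVector` otherwise gains just enough of an iterator for the resumable sweeps: it walks the occupancy bitset, and `begin()` locates the first live slot as `64 * word_index + countr_zero(word)`. A standalone C++20 toy of that bit trick:

```cpp
#include <array>
#include <bit>
#include <cstdint>
#include <span>

// Index of the first set bit across 64-bit occupancy words, or 0xFFFFFFFF when
// every slot is free (mirroring SlotId::INVALID_INDEX).
constexpr std::uint32_t FirstLiveSlot(std::span<const std::uint64_t> bitset) {
    for (std::uint32_t word = 0; word < bitset.size(); ++word) {
        if (bitset[word] != 0) {
            return word * 64 + static_cast<std::uint32_t>(std::countr_zero(bitset[word]));
        }
    }
    return 0xFFFFFFFF;
}

constexpr std::array<std::uint64_t, 2> kBits{0x0, 0x28}; // slots 67 and 69 live
static_assert(FirstLiveSlot(kBits) == 67); // word 1, lowest set bit 3: 64 + 3
```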
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index 59b7c678b..45ef155b5 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -353,6 +353,7 @@ private:
 
     u64 modification_tick = 0;
     u64 frame_tick = 0;
+    typename SlotVector<Image>::Iterator deletion_iterator;
 };
 
 template <class P>
@@ -373,10 +374,41 @@ TextureCache<P>::TextureCache(Runtime& runtime_, VideoCore::RasterizerInterface&
     // This way the null resource becomes a compile time constant
     void(slot_image_views.insert(runtime, NullImageParams{}));
     void(slot_samplers.insert(runtime, sampler_descriptor));
+
+    deletion_iterator = slot_images.begin();
 }
 
 template <class P>
 void TextureCache<P>::TickFrame() {
+    static constexpr u64 ticks_to_destroy = 120;
+    int num_iterations = 32;
+    for (; num_iterations > 0; --num_iterations) {
+        if (deletion_iterator == slot_images.end()) {
+            deletion_iterator = slot_images.begin();
+            if (deletion_iterator == slot_images.end()) {
+                break;
+            }
+        }
+        const auto [image_id, image] = *deletion_iterator;
+        if (image->frame_tick + ticks_to_destroy < frame_tick) {
+            if (image->IsSafeDownload() &&
+                std::ranges::none_of(image->aliased_images, [&](const AliasedImage& alias) {
+                    return slot_images[alias.id].modification_tick > image->modification_tick;
+                })) {
+                auto map = runtime.DownloadStagingBuffer(image->unswizzled_size_bytes);
+                const auto copies = FullDownloadCopies(image->info);
+                image->DownloadMemory(map, copies);
+                runtime.Finish();
+                SwizzleImage(gpu_memory, image->gpu_addr, image->info, copies, map.mapped_span);
+            }
+            if (True(image->flags & ImageFlagBits::Tracked)) {
+                UntrackImage(*image);
+            }
+            UnregisterImage(image_id);
+            DeleteImage(image_id);
+        }
+        ++deletion_iterator;
+    }
     // Tick sentenced resources in this order to ensure they are destroyed in the right order
     sentenced_images.Tick();
     sentenced_framebuffers.Tick();
@@ -568,17 +600,7 @@ template <class P>
 void TextureCache<P>::DownloadMemory(VAddr cpu_addr, size_t size) {
     std::vector<ImageId> images;
     ForEachImageInRegion(cpu_addr, size, [this, &images](ImageId image_id, ImageBase& image) {
-        // Skip images that were not modified from the GPU
-        if (False(image.flags & ImageFlagBits::GpuModified)) {
-            return;
-        }
-        // Skip images that .are. modified from the CPU
-        // We don't want to write sensitive data from the guest
-        if (True(image.flags & ImageFlagBits::CpuModified)) {
-            return;
-        }
-        if (image.info.num_samples > 1) {
-            LOG_WARNING(HW_GPU, "MSAA image downloads are not implemented");
+        if (!image.IsSafeDownload()) {
             return;
         }
         image.flags &= ~ImageFlagBits::GpuModified;
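The ordering inside the texture eviction branch is the load-bearing part: write back, untrack, unregister, then delete, so a stale image disappears without the guest observing any data loss. A compilable outline of that sequence, where every function is a hypothetical stand-in for the cache's real step:

```cpp
struct Image {
    bool gpu_modified = false; // stands in for IsSafeDownload + the alias check
    bool tracked = false;      // stands in for ImageFlagBits::Tracked
};

void WriteBackToGuest(Image&) {}                          // DownloadMemory + SwizzleImage
void StopCpuTracking(Image& img) { img.tracked = false; } // UntrackImage
void RemoveFromPageTables(Image&) {}                      // UnregisterImage
void QueueDestruction(Image&) {}                          // DeleteImage: sentenced ring

void Evict(Image& image) {
    if (image.gpu_modified) {
        WriteBackToGuest(image); // guest sees the GPU's final writes first
    }
    if (image.tracked) {
        StopCpuTracking(image); // stop watching guest memory
    }
    RemoveFromPageTables(image); // no lookup can return the image anymore
    QueueDestruction(image);     // safe: the delayed-destruction ring owns it
}
```

The alias guard (`modification_tick` comparison against `aliased_images`) additionally skips the download when a newer aliased image supersedes this one's contents.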
