diff options
49 files changed, 959 insertions, 412 deletions
| diff --git a/CMakeModules/GenerateSCMRev.cmake b/CMakeModules/GenerateSCMRev.cmake index abdc74428..a1ace89cb 100644 --- a/CMakeModules/GenerateSCMRev.cmake +++ b/CMakeModules/GenerateSCMRev.cmake @@ -81,6 +81,7 @@ set(HASH_FILES      "${VIDEO_CORE}/shader/decode/register_set_predicate.cpp"      "${VIDEO_CORE}/shader/decode/shift.cpp"      "${VIDEO_CORE}/shader/decode/video.cpp" +    "${VIDEO_CORE}/shader/decode/warp.cpp"      "${VIDEO_CORE}/shader/decode/xmad.cpp"      "${VIDEO_CORE}/shader/control_flow.cpp"      "${VIDEO_CORE}/shader/control_flow.h" diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index 2b4266f29..01abdb3bb 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -55,6 +55,7 @@ add_custom_command(OUTPUT scm_rev.cpp        "${VIDEO_CORE}/shader/decode/register_set_predicate.cpp"        "${VIDEO_CORE}/shader/decode/shift.cpp"        "${VIDEO_CORE}/shader/decode/video.cpp" +      "${VIDEO_CORE}/shader/decode/warp.cpp"        "${VIDEO_CORE}/shader/decode/xmad.cpp"        "${VIDEO_CORE}/shader/control_flow.cpp"        "${VIDEO_CORE}/shader/control_flow.h" diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 7c18c27b3..e2f85c5f1 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -1,5 +1,7 @@  add_library(video_core STATIC -    buffer_cache.h +    buffer_cache/buffer_block.h +    buffer_cache/buffer_cache.h +    buffer_cache/map_interval.h      dma_pusher.cpp      dma_pusher.h      debug_utils/debug_utils.cpp @@ -100,6 +102,7 @@ add_library(video_core STATIC      shader/decode/integer_set.cpp      shader/decode/half_set.cpp      shader/decode/video.cpp +    shader/decode/warp.cpp      shader/decode/xmad.cpp      shader/decode/other.cpp      shader/control_flow.cpp diff --git a/src/video_core/buffer_cache.h b/src/video_core/buffer_cache.h deleted file mode 100644 index 6f868b8b4..000000000 --- a/src/video_core/buffer_cache.h +++ /dev/null @@ -1,299 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <array> -#include <memory> -#include <mutex> -#include <unordered_map> -#include <unordered_set> -#include <utility> -#include <vector> - -#include "common/alignment.h" -#include "common/common_types.h" -#include "core/core.h" -#include "video_core/memory_manager.h" -#include "video_core/rasterizer_cache.h" - -namespace VideoCore { -class RasterizerInterface; -} - -namespace VideoCommon { - -template <typename BufferStorageType> -class CachedBuffer final : public RasterizerCacheObject { -public: -    explicit CachedBuffer(VAddr cpu_addr, u8* host_ptr) -        : RasterizerCacheObject{host_ptr}, host_ptr{host_ptr}, cpu_addr{cpu_addr} {} -    ~CachedBuffer() override = default; - -    VAddr GetCpuAddr() const override { -        return cpu_addr; -    } - -    std::size_t GetSizeInBytes() const override { -        return size; -    } - -    u8* GetWritableHostPtr() const { -        return host_ptr; -    } - -    std::size_t GetSize() const { -        return size; -    } - -    std::size_t GetCapacity() const { -        return capacity; -    } - -    bool IsInternalized() const { -        return is_internal; -    } - -    const BufferStorageType& GetBuffer() const { -        return buffer; -    } - -    void SetSize(std::size_t new_size) { -        size = new_size; -    } - -    void SetInternalState(bool is_internal_) { -        is_internal = is_internal_; -    } - -    BufferStorageType ExchangeBuffer(BufferStorageType buffer_, std::size_t new_capacity) { -        capacity = new_capacity; -        std::swap(buffer, buffer_); -        return buffer_; -    } - -private: -    u8* host_ptr{}; -    VAddr cpu_addr{}; -    std::size_t size{}; -    std::size_t capacity{}; -    bool is_internal{}; -    BufferStorageType buffer; -}; - -template <typename BufferStorageType, typename BufferType, typename StreamBuffer> -class BufferCache : public RasterizerCache<std::shared_ptr<CachedBuffer<BufferStorageType>>> { -public: -    using Buffer = std::shared_ptr<CachedBuffer<BufferStorageType>>; -    using BufferInfo = std::pair<const BufferType*, u64>; - -    explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system, -                         std::unique_ptr<StreamBuffer> stream_buffer) -        : RasterizerCache<Buffer>{rasterizer}, system{system}, -          stream_buffer{std::move(stream_buffer)}, stream_buffer_handle{ -                                                       this->stream_buffer->GetHandle()} {} -    ~BufferCache() = default; - -    void Unregister(const Buffer& entry) override { -        std::lock_guard lock{RasterizerCache<Buffer>::mutex}; -        if (entry->IsInternalized()) { -            internalized_entries.erase(entry->GetCacheAddr()); -        } -        ReserveBuffer(entry); -        RasterizerCache<Buffer>::Unregister(entry); -    } - -    void TickFrame() { -        marked_for_destruction_index = -            (marked_for_destruction_index + 1) % marked_for_destruction_ring_buffer.size(); -        MarkedForDestruction().clear(); -    } - -    BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4, -                            bool internalize = false, bool is_written = false) { -        std::lock_guard lock{RasterizerCache<Buffer>::mutex}; - -        auto& memory_manager = system.GPU().MemoryManager(); -        const auto host_ptr = memory_manager.GetPointer(gpu_addr); -        if (!host_ptr) { -            return {GetEmptyBuffer(size), 0}; -        } -        const auto cache_addr = ToCacheAddr(host_ptr); - -        // Cache management is a big overhead, so only cache entries with a given size. -        // TODO: Figure out which size is the best for given games. -        constexpr std::size_t max_stream_size = 0x800; -        if (!internalize && size < max_stream_size && -            internalized_entries.find(cache_addr) == internalized_entries.end()) { -            return StreamBufferUpload(host_ptr, size, alignment); -        } - -        auto entry = RasterizerCache<Buffer>::TryGet(cache_addr); -        if (!entry) { -            return FixedBufferUpload(gpu_addr, host_ptr, size, internalize, is_written); -        } - -        if (entry->GetSize() < size) { -            IncreaseBufferSize(entry, size); -        } -        if (is_written) { -            entry->MarkAsModified(true, *this); -        } -        return {ToHandle(entry->GetBuffer()), 0}; -    } - -    /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset. -    BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size, -                                std::size_t alignment = 4) { -        std::lock_guard lock{RasterizerCache<Buffer>::mutex}; -        return StreamBufferUpload(raw_pointer, size, alignment); -    } - -    void Map(std::size_t max_size) { -        std::tie(buffer_ptr, buffer_offset_base, invalidated) = stream_buffer->Map(max_size, 4); -        buffer_offset = buffer_offset_base; -    } - -    /// Finishes the upload stream, returns true on bindings invalidation. -    bool Unmap() { -        stream_buffer->Unmap(buffer_offset - buffer_offset_base); -        return std::exchange(invalidated, false); -    } - -    virtual const BufferType* GetEmptyBuffer(std::size_t size) = 0; - -protected: -    void FlushObjectInner(const Buffer& entry) override { -        DownloadBufferData(entry->GetBuffer(), 0, entry->GetSize(), entry->GetWritableHostPtr()); -    } - -    virtual BufferStorageType CreateBuffer(std::size_t size) = 0; - -    virtual const BufferType* ToHandle(const BufferStorageType& storage) = 0; - -    virtual void UploadBufferData(const BufferStorageType& buffer, std::size_t offset, -                                  std::size_t size, const u8* data) = 0; - -    virtual void DownloadBufferData(const BufferStorageType& buffer, std::size_t offset, -                                    std::size_t size, u8* data) = 0; - -    virtual void CopyBufferData(const BufferStorageType& src, const BufferStorageType& dst, -                                std::size_t src_offset, std::size_t dst_offset, -                                std::size_t size) = 0; - -private: -    BufferInfo StreamBufferUpload(const void* raw_pointer, std::size_t size, -                                  std::size_t alignment) { -        AlignBuffer(alignment); -        const std::size_t uploaded_offset = buffer_offset; -        std::memcpy(buffer_ptr, raw_pointer, size); - -        buffer_ptr += size; -        buffer_offset += size; -        return {&stream_buffer_handle, uploaded_offset}; -    } - -    BufferInfo FixedBufferUpload(GPUVAddr gpu_addr, u8* host_ptr, std::size_t size, -                                 bool internalize, bool is_written) { -        auto& memory_manager = Core::System::GetInstance().GPU().MemoryManager(); -        const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr); -        ASSERT(cpu_addr); - -        auto entry = GetUncachedBuffer(*cpu_addr, host_ptr); -        entry->SetSize(size); -        entry->SetInternalState(internalize); -        RasterizerCache<Buffer>::Register(entry); - -        if (internalize) { -            internalized_entries.emplace(ToCacheAddr(host_ptr)); -        } -        if (is_written) { -            entry->MarkAsModified(true, *this); -        } - -        if (entry->GetCapacity() < size) { -            MarkedForDestruction().push_back(entry->ExchangeBuffer(CreateBuffer(size), size)); -        } - -        UploadBufferData(entry->GetBuffer(), 0, size, host_ptr); -        return {ToHandle(entry->GetBuffer()), 0}; -    } - -    void IncreaseBufferSize(Buffer& entry, std::size_t new_size) { -        const std::size_t old_size = entry->GetSize(); -        if (entry->GetCapacity() < new_size) { -            const auto& old_buffer = entry->GetBuffer(); -            auto new_buffer = CreateBuffer(new_size); - -            // Copy bits from the old buffer to the new buffer. -            CopyBufferData(old_buffer, new_buffer, 0, 0, old_size); -            MarkedForDestruction().push_back( -                entry->ExchangeBuffer(std::move(new_buffer), new_size)); - -            // This buffer could have been used -            invalidated = true; -        } -        // Upload the new bits. -        const std::size_t size_diff = new_size - old_size; -        UploadBufferData(entry->GetBuffer(), old_size, size_diff, entry->GetHostPtr() + old_size); - -        // Update entry's size in the object and in the cache. -        Unregister(entry); - -        entry->SetSize(new_size); -        RasterizerCache<Buffer>::Register(entry); -    } - -    Buffer GetUncachedBuffer(VAddr cpu_addr, u8* host_ptr) { -        if (auto entry = TryGetReservedBuffer(host_ptr)) { -            return entry; -        } -        return std::make_shared<CachedBuffer<BufferStorageType>>(cpu_addr, host_ptr); -    } - -    Buffer TryGetReservedBuffer(u8* host_ptr) { -        const auto it = buffer_reserve.find(ToCacheAddr(host_ptr)); -        if (it == buffer_reserve.end()) { -            return {}; -        } -        auto& reserve = it->second; -        auto entry = reserve.back(); -        reserve.pop_back(); -        return entry; -    } - -    void ReserveBuffer(Buffer entry) { -        buffer_reserve[entry->GetCacheAddr()].push_back(std::move(entry)); -    } - -    void AlignBuffer(std::size_t alignment) { -        // Align the offset, not the mapped pointer -        const std::size_t offset_aligned = Common::AlignUp(buffer_offset, alignment); -        buffer_ptr += offset_aligned - buffer_offset; -        buffer_offset = offset_aligned; -    } - -    std::vector<BufferStorageType>& MarkedForDestruction() { -        return marked_for_destruction_ring_buffer[marked_for_destruction_index]; -    } - -    Core::System& system; - -    std::unique_ptr<StreamBuffer> stream_buffer; -    BufferType stream_buffer_handle{}; - -    bool invalidated = false; - -    u8* buffer_ptr = nullptr; -    u64 buffer_offset = 0; -    u64 buffer_offset_base = 0; - -    std::size_t marked_for_destruction_index = 0; -    std::array<std::vector<BufferStorageType>, 4> marked_for_destruction_ring_buffer; - -    std::unordered_set<CacheAddr> internalized_entries; -    std::unordered_map<CacheAddr, std::vector<Buffer>> buffer_reserve; -}; - -} // namespace VideoCommon diff --git a/src/video_core/buffer_cache/buffer_block.h b/src/video_core/buffer_cache/buffer_block.h new file mode 100644 index 000000000..4b9193182 --- /dev/null +++ b/src/video_core/buffer_cache/buffer_block.h @@ -0,0 +1,76 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <unordered_set> +#include <utility> + +#include "common/alignment.h" +#include "common/common_types.h" +#include "video_core/gpu.h" + +namespace VideoCommon { + +class BufferBlock { +public: +    bool Overlaps(const CacheAddr start, const CacheAddr end) const { +        return (cache_addr < end) && (cache_addr_end > start); +    } + +    bool IsInside(const CacheAddr other_start, const CacheAddr other_end) const { +        return cache_addr <= other_start && other_end <= cache_addr_end; +    } + +    u8* GetWritableHostPtr() const { +        return FromCacheAddr(cache_addr); +    } + +    u8* GetWritableHostPtr(std::size_t offset) const { +        return FromCacheAddr(cache_addr + offset); +    } + +    std::size_t GetOffset(const CacheAddr in_addr) { +        return static_cast<std::size_t>(in_addr - cache_addr); +    } + +    CacheAddr GetCacheAddr() const { +        return cache_addr; +    } + +    CacheAddr GetCacheAddrEnd() const { +        return cache_addr_end; +    } + +    void SetCacheAddr(const CacheAddr new_addr) { +        cache_addr = new_addr; +        cache_addr_end = new_addr + size; +    } + +    std::size_t GetSize() const { +        return size; +    } + +    void SetEpoch(u64 new_epoch) { +        epoch = new_epoch; +    } + +    u64 GetEpoch() { +        return epoch; +    } + +protected: +    explicit BufferBlock(CacheAddr cache_addr, const std::size_t size) : size{size} { +        SetCacheAddr(cache_addr); +    } +    ~BufferBlock() = default; + +private: +    CacheAddr cache_addr{}; +    CacheAddr cache_addr_end{}; +    std::size_t size{}; +    u64 epoch{}; +}; + +} // namespace VideoCommon diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h new file mode 100644 index 000000000..2442ddfd6 --- /dev/null +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -0,0 +1,447 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <array> +#include <memory> +#include <mutex> +#include <unordered_map> +#include <unordered_set> +#include <utility> +#include <vector> + +#include "common/alignment.h" +#include "common/common_types.h" +#include "core/core.h" +#include "video_core/buffer_cache/buffer_block.h" +#include "video_core/buffer_cache/map_interval.h" +#include "video_core/memory_manager.h" +#include "video_core/rasterizer_interface.h" + +namespace VideoCommon { + +using MapInterval = std::shared_ptr<MapIntervalBase>; + +template <typename TBuffer, typename TBufferType, typename StreamBuffer> +class BufferCache { +public: +    using BufferInfo = std::pair<const TBufferType*, u64>; + +    BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4, +                            bool is_written = false) { +        std::lock_guard lock{mutex}; + +        auto& memory_manager = system.GPU().MemoryManager(); +        const auto host_ptr = memory_manager.GetPointer(gpu_addr); +        if (!host_ptr) { +            return {GetEmptyBuffer(size), 0}; +        } +        const auto cache_addr = ToCacheAddr(host_ptr); + +        // Cache management is a big overhead, so only cache entries with a given size. +        // TODO: Figure out which size is the best for given games. +        constexpr std::size_t max_stream_size = 0x800; +        if (size < max_stream_size) { +            if (!is_written && !IsRegionWritten(cache_addr, cache_addr + size - 1)) { +                return StreamBufferUpload(host_ptr, size, alignment); +            } +        } + +        auto block = GetBlock(cache_addr, size); +        auto map = MapAddress(block, gpu_addr, cache_addr, size); +        if (is_written) { +            map->MarkAsModified(true, GetModifiedTicks()); +            if (!map->IsWritten()) { +                map->MarkAsWritten(true); +                MarkRegionAsWritten(map->GetStart(), map->GetEnd() - 1); +            } +        } else { +            if (map->IsWritten()) { +                WriteBarrier(); +            } +        } + +        const u64 offset = static_cast<u64>(block->GetOffset(cache_addr)); + +        return {ToHandle(block), offset}; +    } + +    /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset. +    BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size, +                                std::size_t alignment = 4) { +        std::lock_guard lock{mutex}; +        return StreamBufferUpload(raw_pointer, size, alignment); +    } + +    void Map(std::size_t max_size) { +        std::lock_guard lock{mutex}; + +        std::tie(buffer_ptr, buffer_offset_base, invalidated) = stream_buffer->Map(max_size, 4); +        buffer_offset = buffer_offset_base; +    } + +    /// Finishes the upload stream, returns true on bindings invalidation. +    bool Unmap() { +        std::lock_guard lock{mutex}; + +        stream_buffer->Unmap(buffer_offset - buffer_offset_base); +        return std::exchange(invalidated, false); +    } + +    void TickFrame() { +        ++epoch; +        while (!pending_destruction.empty()) { +            if (pending_destruction.front()->GetEpoch() + 1 > epoch) { +                break; +            } +            pending_destruction.pop_front(); +        } +    } + +    /// Write any cached resources overlapping the specified region back to memory +    void FlushRegion(CacheAddr addr, std::size_t size) { +        std::lock_guard lock{mutex}; + +        std::vector<MapInterval> objects = GetMapsInRange(addr, size); +        std::sort(objects.begin(), objects.end(), [](const MapInterval& a, const MapInterval& b) { +            return a->GetModificationTick() < b->GetModificationTick(); +        }); +        for (auto& object : objects) { +            if (object->IsModified() && object->IsRegistered()) { +                FlushMap(object); +            } +        } +    } + +    /// Mark the specified region as being invalidated +    void InvalidateRegion(CacheAddr addr, u64 size) { +        std::lock_guard lock{mutex}; + +        std::vector<MapInterval> objects = GetMapsInRange(addr, size); +        for (auto& object : objects) { +            if (object->IsRegistered()) { +                Unregister(object); +            } +        } +    } + +    virtual const TBufferType* GetEmptyBuffer(std::size_t size) = 0; + +protected: +    explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system, +                         std::unique_ptr<StreamBuffer> stream_buffer) +        : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer)}, +          stream_buffer_handle{this->stream_buffer->GetHandle()} {} + +    ~BufferCache() = default; + +    virtual const TBufferType* ToHandle(const TBuffer& storage) = 0; + +    virtual void WriteBarrier() = 0; + +    virtual TBuffer CreateBlock(CacheAddr cache_addr, std::size_t size) = 0; + +    virtual void UploadBlockData(const TBuffer& buffer, std::size_t offset, std::size_t size, +                                 const u8* data) = 0; + +    virtual void DownloadBlockData(const TBuffer& buffer, std::size_t offset, std::size_t size, +                                   u8* data) = 0; + +    virtual void CopyBlock(const TBuffer& src, const TBuffer& dst, std::size_t src_offset, +                           std::size_t dst_offset, std::size_t size) = 0; + +    /// Register an object into the cache +    void Register(const MapInterval& new_map, bool inherit_written = false) { +        const CacheAddr cache_ptr = new_map->GetStart(); +        const std::optional<VAddr> cpu_addr = +            system.GPU().MemoryManager().GpuToCpuAddress(new_map->GetGpuAddress()); +        if (!cache_ptr || !cpu_addr) { +            LOG_CRITICAL(HW_GPU, "Failed to register buffer with unmapped gpu_address 0x{:016x}", +                         new_map->GetGpuAddress()); +            return; +        } +        const std::size_t size = new_map->GetEnd() - new_map->GetStart(); +        new_map->SetCpuAddress(*cpu_addr); +        new_map->MarkAsRegistered(true); +        const IntervalType interval{new_map->GetStart(), new_map->GetEnd()}; +        mapped_addresses.insert({interval, new_map}); +        rasterizer.UpdatePagesCachedCount(*cpu_addr, size, 1); +        if (inherit_written) { +            MarkRegionAsWritten(new_map->GetStart(), new_map->GetEnd() - 1); +            new_map->MarkAsWritten(true); +        } +    } + +    /// Unregisters an object from the cache +    void Unregister(MapInterval& map) { +        const std::size_t size = map->GetEnd() - map->GetStart(); +        rasterizer.UpdatePagesCachedCount(map->GetCpuAddress(), size, -1); +        map->MarkAsRegistered(false); +        if (map->IsWritten()) { +            UnmarkRegionAsWritten(map->GetStart(), map->GetEnd() - 1); +        } +        const IntervalType delete_interval{map->GetStart(), map->GetEnd()}; +        mapped_addresses.erase(delete_interval); +    } + +private: +    MapInterval CreateMap(const CacheAddr start, const CacheAddr end, const GPUVAddr gpu_addr) { +        return std::make_shared<MapIntervalBase>(start, end, gpu_addr); +    } + +    MapInterval MapAddress(const TBuffer& block, const GPUVAddr gpu_addr, +                           const CacheAddr cache_addr, const std::size_t size) { + +        std::vector<MapInterval> overlaps = GetMapsInRange(cache_addr, size); +        if (overlaps.empty()) { +            const CacheAddr cache_addr_end = cache_addr + size; +            MapInterval new_map = CreateMap(cache_addr, cache_addr_end, gpu_addr); +            u8* host_ptr = FromCacheAddr(cache_addr); +            UploadBlockData(block, block->GetOffset(cache_addr), size, host_ptr); +            Register(new_map); +            return new_map; +        } + +        const CacheAddr cache_addr_end = cache_addr + size; +        if (overlaps.size() == 1) { +            MapInterval& current_map = overlaps[0]; +            if (current_map->IsInside(cache_addr, cache_addr_end)) { +                return current_map; +            } +        } +        CacheAddr new_start = cache_addr; +        CacheAddr new_end = cache_addr_end; +        bool write_inheritance = false; +        bool modified_inheritance = false; +        // Calculate new buffer parameters +        for (auto& overlap : overlaps) { +            new_start = std::min(overlap->GetStart(), new_start); +            new_end = std::max(overlap->GetEnd(), new_end); +            write_inheritance |= overlap->IsWritten(); +            modified_inheritance |= overlap->IsModified(); +        } +        GPUVAddr new_gpu_addr = gpu_addr + new_start - cache_addr; +        for (auto& overlap : overlaps) { +            Unregister(overlap); +        } +        UpdateBlock(block, new_start, new_end, overlaps); +        MapInterval new_map = CreateMap(new_start, new_end, new_gpu_addr); +        if (modified_inheritance) { +            new_map->MarkAsModified(true, GetModifiedTicks()); +        } +        Register(new_map, write_inheritance); +        return new_map; +    } + +    void UpdateBlock(const TBuffer& block, CacheAddr start, CacheAddr end, +                     std::vector<MapInterval>& overlaps) { +        const IntervalType base_interval{start, end}; +        IntervalSet interval_set{}; +        interval_set.add(base_interval); +        for (auto& overlap : overlaps) { +            const IntervalType subtract{overlap->GetStart(), overlap->GetEnd()}; +            interval_set.subtract(subtract); +        } +        for (auto& interval : interval_set) { +            std::size_t size = interval.upper() - interval.lower(); +            if (size > 0) { +                u8* host_ptr = FromCacheAddr(interval.lower()); +                UploadBlockData(block, block->GetOffset(interval.lower()), size, host_ptr); +            } +        } +    } + +    std::vector<MapInterval> GetMapsInRange(CacheAddr addr, std::size_t size) { +        if (size == 0) { +            return {}; +        } + +        std::vector<MapInterval> objects{}; +        const IntervalType interval{addr, addr + size}; +        for (auto& pair : boost::make_iterator_range(mapped_addresses.equal_range(interval))) { +            objects.push_back(pair.second); +        } + +        return objects; +    } + +    /// Returns a ticks counter used for tracking when cached objects were last modified +    u64 GetModifiedTicks() { +        return ++modified_ticks; +    } + +    void FlushMap(MapInterval map) { +        std::size_t size = map->GetEnd() - map->GetStart(); +        TBuffer block = blocks[map->GetStart() >> block_page_bits]; +        u8* host_ptr = FromCacheAddr(map->GetStart()); +        DownloadBlockData(block, block->GetOffset(map->GetStart()), size, host_ptr); +        map->MarkAsModified(false, 0); +    } + +    BufferInfo StreamBufferUpload(const void* raw_pointer, std::size_t size, +                                  std::size_t alignment) { +        AlignBuffer(alignment); +        const std::size_t uploaded_offset = buffer_offset; +        std::memcpy(buffer_ptr, raw_pointer, size); + +        buffer_ptr += size; +        buffer_offset += size; +        return {&stream_buffer_handle, uploaded_offset}; +    } + +    void AlignBuffer(std::size_t alignment) { +        // Align the offset, not the mapped pointer +        const std::size_t offset_aligned = Common::AlignUp(buffer_offset, alignment); +        buffer_ptr += offset_aligned - buffer_offset; +        buffer_offset = offset_aligned; +    } + +    TBuffer EnlargeBlock(TBuffer buffer) { +        const std::size_t old_size = buffer->GetSize(); +        const std::size_t new_size = old_size + block_page_size; +        const CacheAddr cache_addr = buffer->GetCacheAddr(); +        TBuffer new_buffer = CreateBlock(cache_addr, new_size); +        CopyBlock(buffer, new_buffer, 0, 0, old_size); +        buffer->SetEpoch(epoch); +        pending_destruction.push_back(buffer); +        const CacheAddr cache_addr_end = cache_addr + new_size - 1; +        u64 page_start = cache_addr >> block_page_bits; +        const u64 page_end = cache_addr_end >> block_page_bits; +        while (page_start <= page_end) { +            blocks[page_start] = new_buffer; +            ++page_start; +        } +        return new_buffer; +    } + +    TBuffer MergeBlocks(TBuffer first, TBuffer second) { +        const std::size_t size_1 = first->GetSize(); +        const std::size_t size_2 = second->GetSize(); +        const CacheAddr first_addr = first->GetCacheAddr(); +        const CacheAddr second_addr = second->GetCacheAddr(); +        const CacheAddr new_addr = std::min(first_addr, second_addr); +        const std::size_t new_size = size_1 + size_2; +        TBuffer new_buffer = CreateBlock(new_addr, new_size); +        CopyBlock(first, new_buffer, 0, new_buffer->GetOffset(first_addr), size_1); +        CopyBlock(second, new_buffer, 0, new_buffer->GetOffset(second_addr), size_2); +        first->SetEpoch(epoch); +        second->SetEpoch(epoch); +        pending_destruction.push_back(first); +        pending_destruction.push_back(second); +        const CacheAddr cache_addr_end = new_addr + new_size - 1; +        u64 page_start = new_addr >> block_page_bits; +        const u64 page_end = cache_addr_end >> block_page_bits; +        while (page_start <= page_end) { +            blocks[page_start] = new_buffer; +            ++page_start; +        } +        return new_buffer; +    } + +    TBuffer GetBlock(const CacheAddr cache_addr, const std::size_t size) { +        TBuffer found{}; +        const CacheAddr cache_addr_end = cache_addr + size - 1; +        u64 page_start = cache_addr >> block_page_bits; +        const u64 page_end = cache_addr_end >> block_page_bits; +        while (page_start <= page_end) { +            auto it = blocks.find(page_start); +            if (it == blocks.end()) { +                if (found) { +                    found = EnlargeBlock(found); +                } else { +                    const CacheAddr start_addr = (page_start << block_page_bits); +                    found = CreateBlock(start_addr, block_page_size); +                    blocks[page_start] = found; +                } +            } else { +                if (found) { +                    if (found == it->second) { +                        ++page_start; +                        continue; +                    } +                    found = MergeBlocks(found, it->second); +                } else { +                    found = it->second; +                } +            } +            ++page_start; +        } +        return found; +    } + +    void MarkRegionAsWritten(const CacheAddr start, const CacheAddr end) { +        u64 page_start = start >> write_page_bit; +        const u64 page_end = end >> write_page_bit; +        while (page_start <= page_end) { +            auto it = written_pages.find(page_start); +            if (it != written_pages.end()) { +                it->second = it->second + 1; +            } else { +                written_pages[page_start] = 1; +            } +            page_start++; +        } +    } + +    void UnmarkRegionAsWritten(const CacheAddr start, const CacheAddr end) { +        u64 page_start = start >> write_page_bit; +        const u64 page_end = end >> write_page_bit; +        while (page_start <= page_end) { +            auto it = written_pages.find(page_start); +            if (it != written_pages.end()) { +                if (it->second > 1) { +                    it->second = it->second - 1; +                } else { +                    written_pages.erase(it); +                } +            } +            page_start++; +        } +    } + +    bool IsRegionWritten(const CacheAddr start, const CacheAddr end) const { +        u64 page_start = start >> write_page_bit; +        const u64 page_end = end >> write_page_bit; +        while (page_start <= page_end) { +            if (written_pages.count(page_start) > 0) { +                return true; +            } +            page_start++; +        } +        return false; +    } + +    VideoCore::RasterizerInterface& rasterizer; +    Core::System& system; +    std::unique_ptr<StreamBuffer> stream_buffer; + +    TBufferType stream_buffer_handle{}; + +    bool invalidated = false; + +    u8* buffer_ptr = nullptr; +    u64 buffer_offset = 0; +    u64 buffer_offset_base = 0; + +    using IntervalSet = boost::icl::interval_set<CacheAddr>; +    using IntervalCache = boost::icl::interval_map<CacheAddr, MapInterval>; +    using IntervalType = typename IntervalCache::interval_type; +    IntervalCache mapped_addresses{}; + +    static constexpr u64 write_page_bit{11}; +    std::unordered_map<u64, u32> written_pages{}; + +    static constexpr u64 block_page_bits{21}; +    static constexpr u64 block_page_size{1 << block_page_bits}; +    std::unordered_map<u64, TBuffer> blocks{}; + +    std::list<TBuffer> pending_destruction{}; +    u64 epoch{}; +    u64 modified_ticks{}; + +    std::recursive_mutex mutex; +}; + +} // namespace VideoCommon diff --git a/src/video_core/buffer_cache/map_interval.h b/src/video_core/buffer_cache/map_interval.h new file mode 100644 index 000000000..3a104d5cd --- /dev/null +++ b/src/video_core/buffer_cache/map_interval.h @@ -0,0 +1,89 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include "common/common_types.h" +#include "video_core/gpu.h" + +namespace VideoCommon { + +class MapIntervalBase { +public: +    MapIntervalBase(const CacheAddr start, const CacheAddr end, const GPUVAddr gpu_addr) +        : start{start}, end{end}, gpu_addr{gpu_addr} {} + +    void SetCpuAddress(VAddr new_cpu_addr) { +        cpu_addr = new_cpu_addr; +    } + +    VAddr GetCpuAddress() const { +        return cpu_addr; +    } + +    GPUVAddr GetGpuAddress() const { +        return gpu_addr; +    } + +    bool IsInside(const CacheAddr other_start, const CacheAddr other_end) const { +        return (start <= other_start && other_end <= end); +    } + +    bool operator==(const MapIntervalBase& rhs) const { +        return std::tie(start, end) == std::tie(rhs.start, rhs.end); +    } + +    bool operator!=(const MapIntervalBase& rhs) const { +        return !operator==(rhs); +    } + +    void MarkAsRegistered(const bool registered) { +        is_registered = registered; +    } + +    bool IsRegistered() const { +        return is_registered; +    } + +    CacheAddr GetStart() const { +        return start; +    } + +    CacheAddr GetEnd() const { +        return end; +    } + +    void MarkAsModified(const bool is_modified_, const u64 tick) { +        is_modified = is_modified_; +        ticks = tick; +    } + +    bool IsModified() const { +        return is_modified; +    } + +    u64 GetModificationTick() const { +        return ticks; +    } + +    void MarkAsWritten(const bool is_written_) { +        is_written = is_written_; +    } + +    bool IsWritten() const { +        return is_written; +    } + +private: +    CacheAddr start; +    CacheAddr end; +    GPUVAddr gpu_addr; +    VAddr cpu_addr{}; +    bool is_written{}; +    bool is_modified{}; +    bool is_registered{}; +    u64 ticks{}; +}; + +} // namespace VideoCommon diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp index 0ee228e28..98a8b5337 100644 --- a/src/video_core/engines/fermi_2d.cpp +++ b/src/video_core/engines/fermi_2d.cpp @@ -10,8 +10,7 @@  namespace Tegra::Engines { -Fermi2D::Fermi2D(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager) -    : rasterizer{rasterizer}, memory_manager{memory_manager} {} +Fermi2D::Fermi2D(VideoCore::RasterizerInterface& rasterizer) : rasterizer{rasterizer} {}  void Fermi2D::CallMethod(const GPU::MethodCall& method_call) {      ASSERT_MSG(method_call.method < Regs::NUM_REGS, diff --git a/src/video_core/engines/fermi_2d.h b/src/video_core/engines/fermi_2d.h index 05421d185..0901cf2fa 100644 --- a/src/video_core/engines/fermi_2d.h +++ b/src/video_core/engines/fermi_2d.h @@ -33,7 +33,7 @@ namespace Tegra::Engines {  class Fermi2D final {  public: -    explicit Fermi2D(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager); +    explicit Fermi2D(VideoCore::RasterizerInterface& rasterizer);      ~Fermi2D() = default;      /// Write the value to the register identified by method. @@ -145,7 +145,6 @@ public:  private:      VideoCore::RasterizerInterface& rasterizer; -    MemoryManager& memory_manager;      /// Performs the copy from the source surface to the destination surface as configured in the      /// registers. diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp index 44279de00..fa4a7c5c1 100644 --- a/src/video_core/engines/kepler_memory.cpp +++ b/src/video_core/engines/kepler_memory.cpp @@ -15,7 +15,7 @@  namespace Tegra::Engines {  KeplerMemory::KeplerMemory(Core::System& system, MemoryManager& memory_manager) -    : system{system}, memory_manager{memory_manager}, upload_state{memory_manager, regs.upload} {} +    : system{system}, upload_state{memory_manager, regs.upload} {}  KeplerMemory::~KeplerMemory() = default; diff --git a/src/video_core/engines/kepler_memory.h b/src/video_core/engines/kepler_memory.h index f3bc675a9..e0e25c321 100644 --- a/src/video_core/engines/kepler_memory.h +++ b/src/video_core/engines/kepler_memory.h @@ -65,7 +65,6 @@ public:  private:      Core::System& system; -    MemoryManager& memory_manager;      Upload::State upload_state;  }; diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 125c53360..f5158d219 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -249,16 +249,10 @@ void Maxwell3D::CallMacroMethod(u32 method, std::vector<u32> parameters) {      executing_macro = 0;      // Lookup the macro offset -    const u32 entry{(method - MacroRegistersStart) >> 1}; -    const auto& search{macro_offsets.find(entry)}; -    if (search == macro_offsets.end()) { -        LOG_CRITICAL(HW_GPU, "macro not found for method 0x{:X}!", method); -        UNREACHABLE(); -        return; -    } +    const u32 entry = ((method - MacroRegistersStart) >> 1) % macro_positions.size();      // Execute the current macro. -    macro_interpreter.Execute(search->second, std::move(parameters)); +    macro_interpreter.Execute(macro_positions[entry], std::move(parameters));  }  void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { @@ -421,7 +415,7 @@ void Maxwell3D::ProcessMacroUpload(u32 data) {  }  void Maxwell3D::ProcessMacroBind(u32 data) { -    macro_offsets[regs.macros.entry] = data; +    macro_positions[regs.macros.entry++] = data;  }  void Maxwell3D::ProcessQueryGet() { @@ -524,7 +518,7 @@ void Maxwell3D::ProcessQueryCondition() {  void Maxwell3D::ProcessSyncPoint() {      const u32 sync_point = regs.sync_info.sync_point.Value();      const u32 increment = regs.sync_info.increment.Value(); -    const u32 cache_flush = regs.sync_info.unknown.Value(); +    [[maybe_unused]] const u32 cache_flush = regs.sync_info.unknown.Value();      if (increment) {          system.GPU().IncrementSyncPoint(sync_point);      } @@ -626,10 +620,10 @@ Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const {      Texture::TICEntry tic_entry;      memory_manager.ReadBlockUnsafe(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry)); -    const auto r_type{tic_entry.r_type.Value()}; -    const auto g_type{tic_entry.g_type.Value()}; -    const auto b_type{tic_entry.b_type.Value()}; -    const auto a_type{tic_entry.a_type.Value()}; +    [[maybe_unused]] const auto r_type{tic_entry.r_type.Value()}; +    [[maybe_unused]] const auto g_type{tic_entry.g_type.Value()}; +    [[maybe_unused]] const auto b_type{tic_entry.b_type.Value()}; +    [[maybe_unused]] const auto a_type{tic_entry.a_type.Value()};      // TODO(Subv): Different data types for separate components are not supported      DEBUG_ASSERT(r_type == g_type && r_type == b_type && r_type == a_type); diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 1ee982b76..0184342a0 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -1270,7 +1270,7 @@ private:      MemoryManager& memory_manager;      /// Start offsets of each macro in macro_memory -    std::unordered_map<u32, u32> macro_offsets; +    std::array<u32, 0x80> macro_positions = {};      /// Memory for macro code      MacroMemory macro_memory; diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index a28c04473..ad8453c5f 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -5,18 +5,17 @@  #include "common/assert.h"  #include "common/logging/log.h"  #include "core/core.h" +#include "core/settings.h"  #include "video_core/engines/maxwell_3d.h"  #include "video_core/engines/maxwell_dma.h"  #include "video_core/memory_manager.h" -#include "video_core/rasterizer_interface.h"  #include "video_core/renderer_base.h"  #include "video_core/textures/decoders.h"  namespace Tegra::Engines { -MaxwellDMA::MaxwellDMA(Core::System& system, VideoCore::RasterizerInterface& rasterizer, -                       MemoryManager& memory_manager) -    : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager} {} +MaxwellDMA::MaxwellDMA(Core::System& system, MemoryManager& memory_manager) +    : system{system}, memory_manager{memory_manager} {}  void MaxwellDMA::CallMethod(const GPU::MethodCall& method_call) {      ASSERT_MSG(method_call.method < Regs::NUM_REGS, @@ -84,13 +83,17 @@ void MaxwellDMA::HandleCopy() {      ASSERT(regs.exec.enable_2d == 1);      if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) { -        ASSERT(regs.src_params.size_z == 1); +        ASSERT(regs.src_params.BlockDepth() == 0);          // If the input is tiled and the output is linear, deswizzle the input and copy it over. -        const u32 src_bytes_per_pixel = regs.src_pitch / regs.src_params.size_x; +        const u32 bytes_per_pixel = regs.dst_pitch / regs.x_count;          const std::size_t src_size = Texture::CalculateSize( -            true, src_bytes_per_pixel, regs.src_params.size_x, regs.src_params.size_y, +            true, bytes_per_pixel, regs.src_params.size_x, regs.src_params.size_y,              regs.src_params.size_z, regs.src_params.BlockHeight(), regs.src_params.BlockDepth()); +        const std::size_t src_layer_size = Texture::CalculateSize( +            true, bytes_per_pixel, regs.src_params.size_x, regs.src_params.size_y, 1, +            regs.src_params.BlockHeight(), regs.src_params.BlockDepth()); +          const std::size_t dst_size = regs.dst_pitch * regs.y_count;          if (read_buffer.size() < src_size) { @@ -104,23 +107,23 @@ void MaxwellDMA::HandleCopy() {          memory_manager.ReadBlock(source, read_buffer.data(), src_size);          memory_manager.ReadBlock(dest, write_buffer.data(), dst_size); -        Texture::UnswizzleSubrect(regs.x_count, regs.y_count, regs.dst_pitch, -                                  regs.src_params.size_x, src_bytes_per_pixel, read_buffer.data(), -                                  write_buffer.data(), regs.src_params.BlockHeight(), -                                  regs.src_params.pos_x, regs.src_params.pos_y); +        Texture::UnswizzleSubrect( +            regs.x_count, regs.y_count, regs.dst_pitch, regs.src_params.size_x, bytes_per_pixel, +            read_buffer.data() + src_layer_size * regs.src_params.pos_z, write_buffer.data(), +            regs.src_params.BlockHeight(), regs.src_params.pos_x, regs.src_params.pos_y);          memory_manager.WriteBlock(dest, write_buffer.data(), dst_size);      } else {          ASSERT(regs.dst_params.BlockDepth() == 0); -        const u32 src_bytes_per_pixel = regs.src_pitch / regs.x_count; +        const u32 bytes_per_pixel = regs.src_pitch / regs.x_count;          const std::size_t dst_size = Texture::CalculateSize( -            true, src_bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y, +            true, bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y,              regs.dst_params.size_z, regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth());          const std::size_t dst_layer_size = Texture::CalculateSize( -            true, src_bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y, 1, +            true, bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y, 1,              regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth());          const std::size_t src_size = regs.src_pitch * regs.y_count; @@ -133,14 +136,19 @@ void MaxwellDMA::HandleCopy() {              write_buffer.resize(dst_size);          } -        memory_manager.ReadBlock(source, read_buffer.data(), src_size); -        memory_manager.ReadBlock(dest, write_buffer.data(), dst_size); +        if (Settings::values.use_accurate_gpu_emulation) { +            memory_manager.ReadBlock(source, read_buffer.data(), src_size); +            memory_manager.ReadBlock(dest, write_buffer.data(), dst_size); +        } else { +            memory_manager.ReadBlockUnsafe(source, read_buffer.data(), src_size); +            memory_manager.ReadBlockUnsafe(dest, write_buffer.data(), dst_size); +        }          // If the input is linear and the output is tiled, swizzle the input and copy it over. -        Texture::SwizzleSubrect(regs.x_count, regs.y_count, regs.src_pitch, regs.dst_params.size_x, -                                src_bytes_per_pixel, -                                write_buffer.data() + dst_layer_size * regs.dst_params.pos_z, -                                read_buffer.data(), regs.dst_params.BlockHeight()); +        Texture::SwizzleSubrect( +            regs.x_count, regs.y_count, regs.src_pitch, regs.dst_params.size_x, bytes_per_pixel, +            write_buffer.data() + dst_layer_size * regs.dst_params.pos_z, read_buffer.data(), +            regs.dst_params.BlockHeight(), regs.dst_params.pos_x, regs.dst_params.pos_y);          memory_manager.WriteBlock(dest, write_buffer.data(), dst_size);      } diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h index 17b015ca7..93808a9bb 100644 --- a/src/video_core/engines/maxwell_dma.h +++ b/src/video_core/engines/maxwell_dma.h @@ -20,10 +20,6 @@ namespace Tegra {  class MemoryManager;  } -namespace VideoCore { -class RasterizerInterface; -} -  namespace Tegra::Engines {  /** @@ -33,8 +29,7 @@ namespace Tegra::Engines {  class MaxwellDMA final {  public: -    explicit MaxwellDMA(Core::System& system, VideoCore::RasterizerInterface& rasterizer, -                        MemoryManager& memory_manager); +    explicit MaxwellDMA(Core::System& system, MemoryManager& memory_manager);      ~MaxwellDMA() = default;      /// Write the value to the register identified by method. @@ -180,8 +175,6 @@ public:  private:      Core::System& system; -    VideoCore::RasterizerInterface& rasterizer; -      MemoryManager& memory_manager;      std::vector<u8> read_buffer; diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index 747284700..c3678b9ea 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -538,6 +538,12 @@ enum class PhysicalAttributeDirection : u64 {      Output = 1,  }; +enum class VoteOperation : u64 { +    All = 0, // allThreadsNV +    Any = 1, // anyThreadNV +    Eq = 2,  // allThreadsEqualNV +}; +  union Instruction {      Instruction& operator=(const Instruction& instr) {          value = instr.value; @@ -565,6 +571,13 @@ union Instruction {      } nop;      union { +        BitField<48, 2, VoteOperation> operation; +        BitField<45, 3, u64> dest_pred; +        BitField<39, 3, u64> value; +        BitField<42, 1, u64> negate_value; +    } vote; + +    union {          BitField<8, 8, Register> gpr;          BitField<20, 24, s64> offset;      } gmem; @@ -873,6 +886,7 @@ union Instruction {      union {          BitField<0, 3, u64> pred0;          BitField<3, 3, u64> pred3; +        BitField<6, 1, u64> neg_b;          BitField<7, 1, u64> abs_a;          BitField<39, 3, u64> pred39;          BitField<42, 1, u64> neg_pred; @@ -1493,6 +1507,7 @@ public:          SYNC,          BRK,          DEPBAR, +        VOTE,          BFE_C,          BFE_R,          BFE_IMM, @@ -1655,6 +1670,7 @@ public:          Hfma2,          Flow,          Synch, +        Warp,          Memory,          Texture,          Image, @@ -1781,6 +1797,7 @@ private:              INST("111000110100---", Id::BRK, Type::Flow, "BRK"),              INST("111000110000----", Id::EXIT, Type::Flow, "EXIT"),              INST("1111000011110---", Id::DEPBAR, Type::Synch, "DEPBAR"), +            INST("0101000011011---", Id::VOTE, Type::Warp, "VOTE"),              INST("1110111111011---", Id::LD_A, Type::Memory, "LD_A"),              INST("1110111101001---", Id::LD_S, Type::Memory, "LD_S"),              INST("1110111101000---", Id::LD_L, Type::Memory, "LD_L"), diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index c409af194..8d9db45f5 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -35,9 +35,9 @@ GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer, bool is_async)      memory_manager = std::make_unique<Tegra::MemoryManager>(system, rasterizer);      dma_pusher = std::make_unique<Tegra::DmaPusher>(*this);      maxwell_3d = std::make_unique<Engines::Maxwell3D>(system, rasterizer, *memory_manager); -    fermi_2d = std::make_unique<Engines::Fermi2D>(rasterizer, *memory_manager); +    fermi_2d = std::make_unique<Engines::Fermi2D>(rasterizer);      kepler_compute = std::make_unique<Engines::KeplerCompute>(system, rasterizer, *memory_manager); -    maxwell_dma = std::make_unique<Engines::MaxwellDMA>(system, rasterizer, *memory_manager); +    maxwell_dma = std::make_unique<Engines::MaxwellDMA>(system, *memory_manager);      kepler_memory = std::make_unique<Engines::KeplerMemory>(system, *memory_manager);  } diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index 11857ff99..544340ecd 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -19,6 +19,10 @@ inline CacheAddr ToCacheAddr(const void* host_ptr) {      return reinterpret_cast<CacheAddr>(host_ptr);  } +inline u8* FromCacheAddr(CacheAddr cache_addr) { +    return reinterpret_cast<u8*>(cache_addr); +} +  namespace Core {  class System;  } @@ -281,8 +285,8 @@ private:  protected:      std::unique_ptr<Tegra::DmaPusher> dma_pusher; -    VideoCore::RendererBase& renderer;      Core::System& system; +    VideoCore::RendererBase& renderer;  private:      std::unique_ptr<Tegra::MemoryManager> memory_manager; diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index 6e44d51cf..6b3f2d50a 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -50,7 +50,7 @@ public:      /// and invalidated      virtual void FlushAndInvalidateRegion(CacheAddr addr, u64 size) = 0; -    // Notify the rasterizer to send all written commands to the host GPU. +    /// Notify the rasterizer to send all written commands to the host GPU.      virtual void FlushCommands() = 0;      /// Notify rasterizer that a frame is about to finish diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp index 2a9b523f5..f8a807c84 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp @@ -7,28 +7,41 @@  #include <glad/glad.h>  #include "common/assert.h" +#include "common/microprofile.h" +#include "video_core/rasterizer_interface.h"  #include "video_core/renderer_opengl/gl_buffer_cache.h"  #include "video_core/renderer_opengl/gl_rasterizer.h"  #include "video_core/renderer_opengl/gl_resource_manager.h"  namespace OpenGL { +MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128)); + +CachedBufferBlock::CachedBufferBlock(CacheAddr cache_addr, const std::size_t size) +    : VideoCommon::BufferBlock{cache_addr, size} { +    gl_buffer.Create(); +    glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW); +} + +CachedBufferBlock::~CachedBufferBlock() = default; +  OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,                                 std::size_t stream_size) -    : VideoCommon::BufferCache<OGLBuffer, GLuint, OGLStreamBuffer>{ +    : VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>{            rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} {}  OGLBufferCache::~OGLBufferCache() = default; -OGLBuffer OGLBufferCache::CreateBuffer(std::size_t size) { -    OGLBuffer buffer; -    buffer.Create(); -    glNamedBufferData(buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW); -    return buffer; +Buffer OGLBufferCache::CreateBlock(CacheAddr cache_addr, std::size_t size) { +    return std::make_shared<CachedBufferBlock>(cache_addr, size); +} + +void OGLBufferCache::WriteBarrier() { +    glMemoryBarrier(GL_ALL_BARRIER_BITS);  } -const GLuint* OGLBufferCache::ToHandle(const OGLBuffer& buffer) { -    return &buffer.handle; +const GLuint* OGLBufferCache::ToHandle(const Buffer& buffer) { +    return buffer->GetHandle();  }  const GLuint* OGLBufferCache::GetEmptyBuffer(std::size_t) { @@ -36,23 +49,24 @@ const GLuint* OGLBufferCache::GetEmptyBuffer(std::size_t) {      return &null_buffer;  } -void OGLBufferCache::UploadBufferData(const OGLBuffer& buffer, std::size_t offset, std::size_t size, -                                      const u8* data) { -    glNamedBufferSubData(buffer.handle, static_cast<GLintptr>(offset), +void OGLBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, +                                     const u8* data) { +    glNamedBufferSubData(*buffer->GetHandle(), static_cast<GLintptr>(offset),                           static_cast<GLsizeiptr>(size), data);  } -void OGLBufferCache::DownloadBufferData(const OGLBuffer& buffer, std::size_t offset, -                                        std::size_t size, u8* data) { -    glGetNamedBufferSubData(buffer.handle, static_cast<GLintptr>(offset), +void OGLBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, +                                       u8* data) { +    MICROPROFILE_SCOPE(OpenGL_Buffer_Download); +    glGetNamedBufferSubData(*buffer->GetHandle(), static_cast<GLintptr>(offset),                              static_cast<GLsizeiptr>(size), data);  } -void OGLBufferCache::CopyBufferData(const OGLBuffer& src, const OGLBuffer& dst, -                                    std::size_t src_offset, std::size_t dst_offset, -                                    std::size_t size) { -    glCopyNamedBufferSubData(src.handle, dst.handle, static_cast<GLintptr>(src_offset), -                             static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size)); +void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset, +                               std::size_t dst_offset, std::size_t size) { +    glCopyNamedBufferSubData(*src->GetHandle(), *dst->GetHandle(), +                             static_cast<GLintptr>(src_offset), static_cast<GLintptr>(dst_offset), +                             static_cast<GLsizeiptr>(size));  }  } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index 8c8ac4038..022e7bfa9 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -7,7 +7,7 @@  #include <memory>  #include "common/common_types.h" -#include "video_core/buffer_cache.h" +#include "video_core/buffer_cache/buffer_cache.h"  #include "video_core/rasterizer_cache.h"  #include "video_core/renderer_opengl/gl_resource_manager.h"  #include "video_core/renderer_opengl/gl_stream_buffer.h" @@ -21,7 +21,24 @@ namespace OpenGL {  class OGLStreamBuffer;  class RasterizerOpenGL; -class OGLBufferCache final : public VideoCommon::BufferCache<OGLBuffer, GLuint, OGLStreamBuffer> { +class CachedBufferBlock; + +using Buffer = std::shared_ptr<CachedBufferBlock>; + +class CachedBufferBlock : public VideoCommon::BufferBlock { +public: +    explicit CachedBufferBlock(CacheAddr cache_addr, const std::size_t size); +    ~CachedBufferBlock(); + +    const GLuint* GetHandle() const { +        return &gl_buffer.handle; +    } + +private: +    OGLBuffer gl_buffer{}; +}; + +class OGLBufferCache final : public VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer> {  public:      explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,                              std::size_t stream_size); @@ -30,18 +47,20 @@ public:      const GLuint* GetEmptyBuffer(std::size_t) override;  protected: -    OGLBuffer CreateBuffer(std::size_t size) override; +    Buffer CreateBlock(CacheAddr cache_addr, std::size_t size) override; + +    void WriteBarrier() override; -    const GLuint* ToHandle(const OGLBuffer& buffer) override; +    const GLuint* ToHandle(const Buffer& buffer) override; -    void UploadBufferData(const OGLBuffer& buffer, std::size_t offset, std::size_t size, -                          const u8* data) override; +    void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, +                         const u8* data) override; -    void DownloadBufferData(const OGLBuffer& buffer, std::size_t offset, std::size_t size, -                            u8* data) override; +    void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, +                           u8* data) override; -    void CopyBufferData(const OGLBuffer& src, const OGLBuffer& dst, std::size_t src_offset, -                        std::size_t dst_offset, std::size_t size) override; +    void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset, +                   std::size_t dst_offset, std::size_t size) override;  };  } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index 85424a4c9..03d434b28 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -27,6 +27,8 @@ Device::Device() {      shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);      max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS);      max_varyings = GetInteger<u32>(GL_MAX_VARYING_VECTORS); +    has_warp_intrinsics = GLAD_GL_NV_gpu_shader5 && GLAD_GL_NV_shader_thread_group && +                          GLAD_GL_NV_shader_thread_shuffle;      has_vertex_viewport_layer = GLAD_GL_ARB_shader_viewport_layer_array;      has_variable_aoffi = TestVariableAoffi();      has_component_indexing_bug = TestComponentIndexingBug(); @@ -36,6 +38,7 @@ Device::Device(std::nullptr_t) {      uniform_buffer_alignment = 0;      max_vertex_attributes = 16;      max_varyings = 15; +    has_warp_intrinsics = true;      has_vertex_viewport_layer = true;      has_variable_aoffi = true;      has_component_indexing_bug = false; diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index dc883722d..3ef7c6dd8 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h @@ -30,6 +30,10 @@ public:          return max_varyings;      } +    bool HasWarpIntrinsics() const { +        return has_warp_intrinsics; +    } +      bool HasVertexViewportLayer() const {          return has_vertex_viewport_layer;      } @@ -50,6 +54,7 @@ private:      std::size_t shader_storage_alignment{};      u32 max_vertex_attributes{};      u32 max_varyings{}; +    bool has_warp_intrinsics{};      bool has_vertex_viewport_layer{};      bool has_variable_aoffi{};      bool has_component_indexing_bug{}; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 80cfda7e4..bb09ecd52 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -708,8 +708,6 @@ void RasterizerOpenGL::DrawArrays() {          return;      } -    const auto& regs = gpu.regs; -      SyncColorMask();      SyncFragmentColorClampState();      SyncMultiSampleState(); @@ -980,7 +978,7 @@ void RasterizerOpenGL::SetupGlobalMemory(const GLShader::GlobalMemoryEntry& entr                                           GPUVAddr gpu_addr, std::size_t size) {      const auto alignment{device.GetShaderStorageBufferAlignment()};      const auto [ssbo, buffer_offset] = -        buffer_cache.UploadMemory(gpu_addr, size, alignment, true, entry.IsWritten()); +        buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.IsWritten());      bind_ssbo_pushbuffer.Push(ssbo, buffer_offset, static_cast<GLsizeiptr>(size));  } diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 1c90facc3..cf6a5cddf 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -212,7 +212,9 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn      const auto texture_buffer_usage{variant.texture_buffer_usage};      std::string source = "#version 430 core\n" -                         "#extension GL_ARB_separate_shader_objects : enable\n"; +                         "#extension GL_ARB_separate_shader_objects : enable\n" +                         "#extension GL_NV_gpu_shader5 : enable\n" +                         "#extension GL_NV_shader_thread_group : enable\n";      if (entries.shader_viewport_layer_array) {          source += "#extension GL_ARB_shader_viewport_layer_array : enable\n";      } @@ -247,20 +249,24 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn          if (!texture_buffer_usage.test(i)) {              continue;          } -        source += fmt::format("#define SAMPLER_{}_IS_BUFFER", i); +        source += fmt::format("#define SAMPLER_{}_IS_BUFFER\n", i); +    } +    if (texture_buffer_usage.any()) { +        source += '\n';      }      if (program_type == ProgramType::Geometry) {          const auto [glsl_topology, debug_name, max_vertices] =              GetPrimitiveDescription(primitive_mode); -        source += "layout (" + std::string(glsl_topology) + ") in;\n"; +        source += "layout (" + std::string(glsl_topology) + ") in;\n\n";          source += "#define MAX_VERTEX_INPUT " + std::to_string(max_vertices) + '\n';      }      if (program_type == ProgramType::Compute) {          source += "layout (local_size_variable) in;\n";      } +    source += '\n';      source += code;      OGLShader shader; @@ -289,7 +295,7 @@ std::set<GLenum> GetSupportedFormats() {  CachedShader::CachedShader(const ShaderParameters& params, ProgramType program_type,                             GLShader::ProgramResult result) -    : RasterizerCacheObject{params.host_ptr}, host_ptr{params.host_ptr}, cpu_addr{params.cpu_addr}, +    : RasterizerCacheObject{params.host_ptr}, cpu_addr{params.cpu_addr},        unique_identifier{params.unique_identifier}, program_type{program_type},        disk_cache{params.disk_cache}, precompiled_programs{params.precompiled_programs},        entries{result.second}, code{std::move(result.first)}, shader_length{entries.shader_length} {} diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index a3106a0ff..2c8faf855 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -106,7 +106,6 @@ private:      ShaderDiskCacheUsage GetUsage(const ProgramVariant& variant) const; -    u8* host_ptr{};      VAddr cpu_addr{};      u64 unique_identifier{};      ProgramType program_type{}; diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index d8f722c26..359d58cbe 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -565,7 +565,7 @@ private:                  case Tegra::Shader::ImageType::Texture1D:                      return "image1D";                  case Tegra::Shader::ImageType::TextureBuffer: -                    return "bufferImage"; +                    return "imageBuffer";                  case Tegra::Shader::ImageType::Texture1DArray:                      return "image1DArray";                  case Tegra::Shader::ImageType::Texture2D: @@ -1735,6 +1735,48 @@ private:          return "utof(gl_WorkGroupID"s + GetSwizzle(element) + ')';      } +    std::string BallotThread(Operation operation) { +        const std::string value = VisitOperand(operation, 0, Type::Bool); +        if (!device.HasWarpIntrinsics()) { +            LOG_ERROR(Render_OpenGL, +                      "Nvidia warp intrinsics are not available and its required by a shader"); +            // Stub on non-Nvidia devices by simulating all threads voting the same as the active +            // one. +            return fmt::format("utof({} ? 0xFFFFFFFFU : 0U)", value); +        } +        return fmt::format("utof(ballotThreadNV({}))", value); +    } + +    std::string Vote(Operation operation, const char* func) { +        const std::string value = VisitOperand(operation, 0, Type::Bool); +        if (!device.HasWarpIntrinsics()) { +            LOG_ERROR(Render_OpenGL, +                      "Nvidia vote intrinsics are not available and its required by a shader"); +            // Stub with a warp size of one. +            return value; +        } +        return fmt::format("{}({})", func, value); +    } + +    std::string VoteAll(Operation operation) { +        return Vote(operation, "allThreadsNV"); +    } + +    std::string VoteAny(Operation operation) { +        return Vote(operation, "anyThreadNV"); +    } + +    std::string VoteEqual(Operation operation) { +        if (!device.HasWarpIntrinsics()) { +            LOG_ERROR(Render_OpenGL, +                      "Nvidia vote intrinsics are not available and its required by a shader"); +            // We must return true here since a stub for a theoretical warp size of 1 will always +            // return an equal result for all its votes. +            return "true"; +        } +        return Vote(operation, "allThreadsEqualNV"); +    } +      static constexpr std::array operation_decompilers = {          &GLSLDecompiler::Assign, @@ -1885,6 +1927,11 @@ private:          &GLSLDecompiler::WorkGroupId<0>,          &GLSLDecompiler::WorkGroupId<1>,          &GLSLDecompiler::WorkGroupId<2>, + +        &GLSLDecompiler::BallotThread, +        &GLSLDecompiler::VoteAll, +        &GLSLDecompiler::VoteAny, +        &GLSLDecompiler::VoteEqual,      };      static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount)); diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 408332f90..4f135fe03 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -184,6 +184,9 @@ GLint GetSwizzleSource(SwizzleSource source) {  }  void ApplyTextureDefaults(const SurfaceParams& params, GLuint texture) { +    if (params.IsBuffer()) { +        return; +    }      glTextureParameteri(texture, GL_TEXTURE_MIN_FILTER, GL_LINEAR);      glTextureParameteri(texture, GL_TEXTURE_MAG_FILTER, GL_LINEAR);      glTextureParameteri(texture, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); @@ -208,6 +211,7 @@ OGLTexture CreateTexture(const SurfaceParams& params, GLenum target, GLenum inte          glNamedBufferStorage(texture_buffer.handle, params.width * params.GetBytesPerPixel(),                               nullptr, GL_DYNAMIC_STORAGE_BIT);          glTextureBuffer(texture.handle, internal_format, texture_buffer.handle); +        break;      case SurfaceTarget::Texture2D:      case SurfaceTarget::TextureCubemap:          glTextureStorage2D(texture.handle, params.emulated_levels, internal_format, params.width, diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index ff6ab6988..21324488a 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -51,7 +51,7 @@ public:      }  protected: -    void DecorateSurfaceName(); +    void DecorateSurfaceName() override;      View CreateView(const ViewParams& view_key) override;      View CreateViewInner(const ViewParams& view_key, bool is_proxy); diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index 24a591797..a35b45c9c 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ -1072,6 +1072,26 @@ private:          return {};      } +    Id BallotThread(Operation) { +        UNIMPLEMENTED(); +        return {}; +    } + +    Id VoteAll(Operation) { +        UNIMPLEMENTED(); +        return {}; +    } + +    Id VoteAny(Operation) { +        UNIMPLEMENTED(); +        return {}; +    } + +    Id VoteEqual(Operation) { +        UNIMPLEMENTED(); +        return {}; +    } +      Id DeclareBuiltIn(spv::BuiltIn builtin, spv::StorageClass storage, Id type,                        const std::string& name) {          const Id id = OpVariable(type, storage); @@ -1364,6 +1384,11 @@ private:          &SPIRVDecompiler::WorkGroupId<0>,          &SPIRVDecompiler::WorkGroupId<1>,          &SPIRVDecompiler::WorkGroupId<2>, + +        &SPIRVDecompiler::BallotThread, +        &SPIRVDecompiler::VoteAll, +        &SPIRVDecompiler::VoteAny, +        &SPIRVDecompiler::VoteEqual,      };      static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount)); diff --git a/src/video_core/shader/decode.cpp b/src/video_core/shader/decode.cpp index b547d8323..47a9fd961 100644 --- a/src/video_core/shader/decode.cpp +++ b/src/video_core/shader/decode.cpp @@ -176,6 +176,7 @@ u32 ShaderIR::DecodeInstr(NodeBlock& bb, u32 pc) {          {OpCode::Type::Ffma, &ShaderIR::DecodeFfma},          {OpCode::Type::Hfma2, &ShaderIR::DecodeHfma2},          {OpCode::Type::Conversion, &ShaderIR::DecodeConversion}, +        {OpCode::Type::Warp, &ShaderIR::DecodeWarp},          {OpCode::Type::Memory, &ShaderIR::DecodeMemory},          {OpCode::Type::Texture, &ShaderIR::DecodeTexture},          {OpCode::Type::Image, &ShaderIR::DecodeImage}, diff --git a/src/video_core/shader/decode/float_set.cpp b/src/video_core/shader/decode/float_set.cpp index f5013e44a..5614e8a0d 100644 --- a/src/video_core/shader/decode/float_set.cpp +++ b/src/video_core/shader/decode/float_set.cpp @@ -15,7 +15,6 @@ using Tegra::Shader::OpCode;  u32 ShaderIR::DecodeFloatSet(NodeBlock& bb, u32 pc) {      const Instruction instr = {program_code[pc]}; -    const auto opcode = OpCode::Decode(instr);      const Node op_a = GetOperandAbsNegFloat(GetRegister(instr.gpr8), instr.fset.abs_a != 0,                                              instr.fset.neg_a != 0); diff --git a/src/video_core/shader/decode/float_set_predicate.cpp b/src/video_core/shader/decode/float_set_predicate.cpp index 2323052b0..200c2c983 100644 --- a/src/video_core/shader/decode/float_set_predicate.cpp +++ b/src/video_core/shader/decode/float_set_predicate.cpp @@ -16,10 +16,9 @@ using Tegra::Shader::Pred;  u32 ShaderIR::DecodeFloatSetPredicate(NodeBlock& bb, u32 pc) {      const Instruction instr = {program_code[pc]}; -    const auto opcode = OpCode::Decode(instr); -    const Node op_a = GetOperandAbsNegFloat(GetRegister(instr.gpr8), instr.fsetp.abs_a != 0, -                                            instr.fsetp.neg_a != 0); +    Node op_a = GetOperandAbsNegFloat(GetRegister(instr.gpr8), instr.fsetp.abs_a != 0, +                                      instr.fsetp.neg_a != 0);      Node op_b = [&]() {          if (instr.is_b_imm) {              return GetImmediate19(instr); @@ -29,12 +28,13 @@ u32 ShaderIR::DecodeFloatSetPredicate(NodeBlock& bb, u32 pc) {              return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset());          }      }(); -    op_b = GetOperandAbsNegFloat(op_b, instr.fsetp.abs_b, false); +    op_b = GetOperandAbsNegFloat(std::move(op_b), instr.fsetp.abs_b, instr.fsetp.neg_b);      // We can't use the constant predicate as destination.      ASSERT(instr.fsetp.pred3 != static_cast<u64>(Pred::UnusedIndex)); -    const Node predicate = GetPredicateComparisonFloat(instr.fsetp.cond, op_a, op_b); +    const Node predicate = +        GetPredicateComparisonFloat(instr.fsetp.cond, std::move(op_a), std::move(op_b));      const Node second_pred = GetPredicate(instr.fsetp.pred39, instr.fsetp.neg_pred != 0);      const OperationCode combiner = GetPredicateCombiner(instr.fsetp.op); diff --git a/src/video_core/shader/decode/integer_set.cpp b/src/video_core/shader/decode/integer_set.cpp index 46e3d5905..59809bcd8 100644 --- a/src/video_core/shader/decode/integer_set.cpp +++ b/src/video_core/shader/decode/integer_set.cpp @@ -14,7 +14,6 @@ using Tegra::Shader::OpCode;  u32 ShaderIR::DecodeIntegerSet(NodeBlock& bb, u32 pc) {      const Instruction instr = {program_code[pc]}; -    const auto opcode = OpCode::Decode(instr);      const Node op_a = GetRegister(instr.gpr8);      const Node op_b = [&]() { diff --git a/src/video_core/shader/decode/integer_set_predicate.cpp b/src/video_core/shader/decode/integer_set_predicate.cpp index dd20775d7..25e48fef8 100644 --- a/src/video_core/shader/decode/integer_set_predicate.cpp +++ b/src/video_core/shader/decode/integer_set_predicate.cpp @@ -16,7 +16,6 @@ using Tegra::Shader::Pred;  u32 ShaderIR::DecodeIntegerSetPredicate(NodeBlock& bb, u32 pc) {      const Instruction instr = {program_code[pc]}; -    const auto opcode = OpCode::Decode(instr);      const Node op_a = GetRegister(instr.gpr8); diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp index ac0e764d6..d46e0f823 100644 --- a/src/video_core/shader/decode/other.cpp +++ b/src/video_core/shader/decode/other.cpp @@ -74,6 +74,13 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {              case SystemVariable::InvocationInfo:                  LOG_WARNING(HW_GPU, "MOV_SYS instruction with InvocationInfo is incomplete");                  return Immediate(0u); +            case SystemVariable::Tid: { +                Node value = Immediate(0); +                value = BitfieldInsert(value, Operation(OperationCode::LocalInvocationIdX), 0, 9); +                value = BitfieldInsert(value, Operation(OperationCode::LocalInvocationIdY), 16, 9); +                value = BitfieldInsert(value, Operation(OperationCode::LocalInvocationIdZ), 26, 5); +                return value; +            }              case SystemVariable::TidX:                  return Operation(OperationCode::LocalInvocationIdX);              case SystemVariable::TidY: diff --git a/src/video_core/shader/decode/predicate_set_register.cpp b/src/video_core/shader/decode/predicate_set_register.cpp index febbfeb50..84dbc50fe 100644 --- a/src/video_core/shader/decode/predicate_set_register.cpp +++ b/src/video_core/shader/decode/predicate_set_register.cpp @@ -15,7 +15,6 @@ using Tegra::Shader::OpCode;  u32 ShaderIR::DecodePredicateSetRegister(NodeBlock& bb, u32 pc) {      const Instruction instr = {program_code[pc]}; -    const auto opcode = OpCode::Decode(instr);      UNIMPLEMENTED_IF_MSG(instr.generates_cc,                           "Condition codes generation in PSET is not implemented"); diff --git a/src/video_core/shader/decode/warp.cpp b/src/video_core/shader/decode/warp.cpp new file mode 100644 index 000000000..04ca74f46 --- /dev/null +++ b/src/video_core/shader/decode/warp.cpp @@ -0,0 +1,55 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "common/assert.h" +#include "common/common_types.h" +#include "video_core/engines/shader_bytecode.h" +#include "video_core/shader/node_helper.h" +#include "video_core/shader/shader_ir.h" + +namespace VideoCommon::Shader { + +using Tegra::Shader::Instruction; +using Tegra::Shader::OpCode; +using Tegra::Shader::Pred; +using Tegra::Shader::VoteOperation; + +namespace { +OperationCode GetOperationCode(VoteOperation vote_op) { +    switch (vote_op) { +    case VoteOperation::All: +        return OperationCode::VoteAll; +    case VoteOperation::Any: +        return OperationCode::VoteAny; +    case VoteOperation::Eq: +        return OperationCode::VoteEqual; +    default: +        UNREACHABLE_MSG("Invalid vote operation={}", static_cast<u64>(vote_op)); +        return OperationCode::VoteAll; +    } +} +} // Anonymous namespace + +u32 ShaderIR::DecodeWarp(NodeBlock& bb, u32 pc) { +    const Instruction instr = {program_code[pc]}; +    const auto opcode = OpCode::Decode(instr); + +    switch (opcode->get().GetId()) { +    case OpCode::Id::VOTE: { +        const Node value = GetPredicate(instr.vote.value, instr.vote.negate_value != 0); +        const Node active = Operation(OperationCode::BallotThread, value); +        const Node vote = Operation(GetOperationCode(instr.vote.operation), value); +        SetRegister(bb, instr.gpr0, active); +        SetPredicate(bb, instr.vote.dest_pred, vote); +        break; +    } +    default: +        UNIMPLEMENTED_MSG("Unhandled warp instruction: {}", opcode->get().GetName()); +        break; +    } + +    return pc; +} + +} // namespace VideoCommon::Shader diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h index 5f0852364..5db9313c4 100644 --- a/src/video_core/shader/node.h +++ b/src/video_core/shader/node.h @@ -168,6 +168,11 @@ enum class OperationCode {      WorkGroupIdY,       /// () -> uint      WorkGroupIdZ,       /// () -> uint +    BallotThread, /// (bool) -> uint +    VoteAll,      /// (bool) -> bool +    VoteAny,      /// (bool) -> bool +    VoteEqual,    /// (bool) -> bool +      Amount,  }; diff --git a/src/video_core/shader/shader_ir.cpp b/src/video_core/shader/shader_ir.cpp index 5e91fe129..1e5c7f660 100644 --- a/src/video_core/shader/shader_ir.cpp +++ b/src/video_core/shader/shader_ir.cpp @@ -405,4 +405,9 @@ Node ShaderIR::BitfieldExtract(Node value, u32 offset, u32 bits) {                       Immediate(offset), Immediate(bits));  } +Node ShaderIR::BitfieldInsert(Node base, Node insert, u32 offset, u32 bits) { +    return Operation(OperationCode::UBitfieldInsert, NO_PRECISE, base, insert, Immediate(offset), +                     Immediate(bits)); +} +  } // namespace VideoCommon::Shader diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h index 59a083d90..bcc9b79b6 100644 --- a/src/video_core/shader/shader_ir.h +++ b/src/video_core/shader/shader_ir.h @@ -167,6 +167,7 @@ private:      u32 DecodeFfma(NodeBlock& bb, u32 pc);      u32 DecodeHfma2(NodeBlock& bb, u32 pc);      u32 DecodeConversion(NodeBlock& bb, u32 pc); +    u32 DecodeWarp(NodeBlock& bb, u32 pc);      u32 DecodeMemory(NodeBlock& bb, u32 pc);      u32 DecodeTexture(NodeBlock& bb, u32 pc);      u32 DecodeImage(NodeBlock& bb, u32 pc); @@ -279,6 +280,9 @@ private:      /// Extracts a sequence of bits from a node      Node BitfieldExtract(Node value, u32 offset, u32 bits); +    /// Inserts a sequence of bits from a node +    Node BitfieldInsert(Node base, Node insert, u32 offset, u32 bits); +      void WriteTexInstructionFloat(NodeBlock& bb, Tegra::Shader::Instruction instr,                                    const Node4& components); diff --git a/src/video_core/texture_cache/surface_params.h b/src/video_core/texture_cache/surface_params.h index 358d6757c..e7ef66ee2 100644 --- a/src/video_core/texture_cache/surface_params.h +++ b/src/video_core/texture_cache/surface_params.h @@ -58,7 +58,6 @@ public:      std::size_t GetHostSizeInBytes() const {          std::size_t host_size_in_bytes;          if (GetCompressionType() == SurfaceCompression::Converted) { -            constexpr std::size_t rgb8_bpp = 4ULL;              // ASTC is uncompressed in software, in emulated as RGBA8              host_size_in_bytes = 0;              for (u32 level = 0; level < num_levels; ++level) { diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index a3a3770a7..2ec0203d1 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -308,8 +308,6 @@ protected:          if (!guard_render_targets && surface->IsRenderTarget()) {              ManageRenderTargetUnregister(surface);          } -        const GPUVAddr gpu_addr = surface->GetGpuAddr(); -        const CacheAddr cache_ptr = surface->GetCacheAddr();          const std::size_t size = surface->GetSizeInBytes();          const VAddr cpu_addr = surface->GetCpuAddr();          rasterizer.UpdatePagesCachedCount(cpu_addr, size, -1); diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp index 7e8295944..7df5f1452 100644 --- a/src/video_core/textures/decoders.cpp +++ b/src/video_core/textures/decoders.cpp @@ -257,19 +257,21 @@ std::vector<u8> UnswizzleTexture(u8* address, u32 tile_size_x, u32 tile_size_y,  void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,                      u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, -                    u32 block_height_bit) { +                    u32 block_height_bit, u32 offset_x, u32 offset_y) {      const u32 block_height = 1U << block_height_bit;      const u32 image_width_in_gobs{(swizzled_width * bytes_per_pixel + (gob_size_x - 1)) /                                    gob_size_x};      for (u32 line = 0; line < subrect_height; ++line) { +        const u32 dst_y = line + offset_y;          const u32 gob_address_y = -            (line / (gob_size_y * block_height)) * gob_size * block_height * image_width_in_gobs + -            ((line % (gob_size_y * block_height)) / gob_size_y) * gob_size; -        const auto& table = legacy_swizzle_table[line % gob_size_y]; +            (dst_y / (gob_size_y * block_height)) * gob_size * block_height * image_width_in_gobs + +            ((dst_y % (gob_size_y * block_height)) / gob_size_y) * gob_size; +        const auto& table = legacy_swizzle_table[dst_y % gob_size_y];          for (u32 x = 0; x < subrect_width; ++x) { +            const u32 dst_x = x + offset_x;              const u32 gob_address = -                gob_address_y + (x * bytes_per_pixel / gob_size_x) * gob_size * block_height; -            const u32 swizzled_offset = gob_address + table[(x * bytes_per_pixel) % gob_size_x]; +                gob_address_y + (dst_x * bytes_per_pixel / gob_size_x) * gob_size * block_height; +            const u32 swizzled_offset = gob_address + table[(dst_x * bytes_per_pixel) % gob_size_x];              u8* source_line = unswizzled_data + line * source_pitch + x * bytes_per_pixel;              u8* dest_addr = swizzled_data + swizzled_offset; diff --git a/src/video_core/textures/decoders.h b/src/video_core/textures/decoders.h index eaec9b5a5..f1e3952bc 100644 --- a/src/video_core/textures/decoders.h +++ b/src/video_core/textures/decoders.h @@ -44,7 +44,8 @@ std::size_t CalculateSize(bool tiled, u32 bytes_per_pixel, u32 width, u32 height  /// Copies an untiled subrectangle into a tiled surface.  void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width, -                    u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, u32 block_height); +                    u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, u32 block_height, +                    u32 offset_x, u32 offset_y);  /// Copies a tiled subrectangle into a linear surface.  void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32 swizzled_width, diff --git a/src/video_core/textures/texture.h b/src/video_core/textures/texture.h index e3be018b9..e36bc2c04 100644 --- a/src/video_core/textures/texture.h +++ b/src/video_core/textures/texture.h @@ -213,7 +213,7 @@ struct TICEntry {          if (header_version != TICHeaderVersion::OneDBuffer) {              return width_minus_1 + 1;          } -        return (buffer_high_width_minus_one << 16) | buffer_low_width_minus_one; +        return ((buffer_high_width_minus_one << 16) | buffer_low_width_minus_one) + 1;      }      u32 Height() const { diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp index 5d0fb3f9f..0456248ac 100644 --- a/src/yuzu/configuration/config.cpp +++ b/src/yuzu/configuration/config.cpp @@ -516,6 +516,7 @@ void Config::ReadPathValues() {      UISettings::values.roms_path = ReadSetting(QStringLiteral("romsPath")).toString();      UISettings::values.symbols_path = ReadSetting(QStringLiteral("symbolsPath")).toString(); +    UISettings::values.screenshot_path = ReadSetting(QStringLiteral("screenshotPath")).toString();      UISettings::values.game_directory_path =          ReadSetting(QStringLiteral("gameListRootDir"), QStringLiteral(".")).toString();      UISettings::values.game_directory_deepscan = diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp index a7c656fdb..ac57229d5 100644 --- a/src/yuzu/main.cpp +++ b/src/yuzu/main.cpp @@ -119,6 +119,7 @@ Q_IMPORT_PLUGIN(QWindowsIntegrationPlugin);  #endif  #ifdef _WIN32 +#include <windows.h>  extern "C" {  // tells Nvidia and AMD drivers to use the dedicated GPU by default on laptops with switchable  // graphics @@ -747,6 +748,18 @@ void GMainWindow::OnDisplayTitleBars(bool show) {      }  } +void GMainWindow::PreventOSSleep() { +#ifdef _WIN32 +    SetThreadExecutionState(ES_CONTINUOUS | ES_SYSTEM_REQUIRED | ES_DISPLAY_REQUIRED); +#endif +} + +void GMainWindow::AllowOSSleep() { +#ifdef _WIN32 +    SetThreadExecutionState(ES_CONTINUOUS); +#endif +} +  QStringList GMainWindow::GetUnsupportedGLExtensions() {      QStringList unsupported_ext; @@ -966,6 +979,8 @@ void GMainWindow::BootGame(const QString& filename) {  }  void GMainWindow::ShutdownGame() { +    AllowOSSleep(); +      discord_rpc->Pause();      emu_thread->RequestStop(); @@ -1567,6 +1582,8 @@ void GMainWindow::OnMenuRecentFile() {  }  void GMainWindow::OnStartGame() { +    PreventOSSleep(); +      emu_thread->SetRunning(true);      qRegisterMetaType<Core::Frontend::SoftwareKeyboardParameters>( @@ -1598,6 +1615,8 @@ void GMainWindow::OnPauseGame() {      ui.action_Pause->setEnabled(false);      ui.action_Stop->setEnabled(true);      ui.action_Capture_Screenshot->setEnabled(false); + +    AllowOSSleep();  }  void GMainWindow::OnStopGame() { diff --git a/src/yuzu/main.h b/src/yuzu/main.h index 1137bbc7a..501608ddc 100644 --- a/src/yuzu/main.h +++ b/src/yuzu/main.h @@ -130,6 +130,9 @@ private:      void ConnectWidgetEvents();      void ConnectMenuEvents(); +    void PreventOSSleep(); +    void AllowOSSleep(); +      QStringList GetUnsupportedGLExtensions();      bool LoadROM(const QString& filename);      void BootGame(const QString& filename); | 
