diff options
69 files changed, 2359 insertions, 1166 deletions
| diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt index df7a5e0a9..9be5b2780 100644 --- a/externals/CMakeLists.txt +++ b/externals/CMakeLists.txt @@ -4,6 +4,13 @@ list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/CMakeModules")  list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/externals/find-modules")  include(DownloadExternals) +# xbyak +if (ARCHITECTURE_x86 OR ARCHITECTURE_x86_64) +    add_library(xbyak INTERFACE) +    target_include_directories(xbyak SYSTEM INTERFACE ./xbyak/xbyak) +    target_compile_definitions(xbyak INTERFACE XBYAK_NO_OP_NAMES) +endif() +  # Catch  add_library(catch-single-include INTERFACE)  target_include_directories(catch-single-include INTERFACE catch/single_include) @@ -75,11 +82,3 @@ if (ENABLE_WEB_SERVICE)      target_compile_definitions(httplib INTERFACE -DCPPHTTPLIB_OPENSSL_SUPPORT)      target_link_libraries(httplib INTERFACE OpenSSL::SSL OpenSSL::Crypto)  endif() - -if (NOT TARGET xbyak) -    if (ARCHITECTURE_x86 OR ARCHITECTURE_x86_64) -        add_library(xbyak INTERFACE) -        target_include_directories(xbyak SYSTEM INTERFACE ./xbyak/xbyak) -        target_compile_definitions(xbyak INTERFACE XBYAK_NO_OP_NAMES) -    endif() -endif() diff --git a/externals/sirit b/externals/sirit -Subproject a62c5bbc100a5e5a31ea0ccc4a78d8fa6a4167c +Subproject eefca56afd49379bdebc97ded8b480839f93088 diff --git a/src/core/hle/kernel/process.cpp b/src/core/hle/kernel/process.cpp index 36724569f..c4c5199b1 100644 --- a/src/core/hle/kernel/process.cpp +++ b/src/core/hle/kernel/process.cpp @@ -132,7 +132,8 @@ std::shared_ptr<ResourceLimit> Process::GetResourceLimit() const {  u64 Process::GetTotalPhysicalMemoryAvailable() const {      const u64 capacity{resource_limit->GetCurrentResourceValue(ResourceType::PhysicalMemory) + -                       page_table->GetTotalHeapSize() + image_size + main_thread_stack_size}; +                       page_table->GetTotalHeapSize() + GetSystemResourceSize() + image_size + +               
        main_thread_stack_size};      if (capacity < memory_usage_capacity) {          return capacity; @@ -146,7 +147,8 @@ u64 Process::GetTotalPhysicalMemoryAvailableWithoutSystemResource() const {  }  u64 Process::GetTotalPhysicalMemoryUsed() const { -    return image_size + main_thread_stack_size + page_table->GetTotalHeapSize(); +    return image_size + main_thread_stack_size + page_table->GetTotalHeapSize() + +           GetSystemResourceSize();  }  u64 Process::GetTotalPhysicalMemoryUsedWithoutSystemResource() const { diff --git a/src/core/hle/kernel/readable_event.cpp b/src/core/hle/kernel/readable_event.cpp index 00860fcbd..ef5e19e63 100644 --- a/src/core/hle/kernel/readable_event.cpp +++ b/src/core/hle/kernel/readable_event.cpp @@ -38,7 +38,7 @@ void ReadableEvent::Clear() {  ResultCode ReadableEvent::Reset() {      if (!is_signaled) { -        LOG_ERROR(Kernel, "Handle is not signaled! object_id={}, object_type={}, object_name={}", +        LOG_TRACE(Kernel, "Handle is not signaled! 
object_id={}, object_type={}, object_name={}",                    GetObjectId(), GetTypeName(), GetName());          return ERR_INVALID_STATE;      } diff --git a/src/core/hle/kernel/resource_limit.cpp b/src/core/hle/kernel/resource_limit.cpp index d9beaa3a4..212e442f4 100644 --- a/src/core/hle/kernel/resource_limit.cpp +++ b/src/core/hle/kernel/resource_limit.cpp @@ -24,13 +24,9 @@ bool ResourceLimit::Reserve(ResourceType resource, s64 amount, u64 timeout) {      const std::size_t index{ResourceTypeToIndex(resource)};      s64 new_value = current[index] + amount; -    while (new_value > limit[index] && available[index] + amount <= limit[index]) { +    if (new_value > limit[index] && available[index] + amount <= limit[index]) {          // TODO(bunnei): This is wrong for multicore, we should wait the calling thread for timeout          new_value = current[index] + amount; - -        if (timeout >= 0) { -            break; -        }      }      if (new_value <= limit[index]) { diff --git a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp index cc2192e5c..0d913334e 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp @@ -25,7 +25,7 @@ u32 nvhost_ctrl_gpu::ioctl(Ioctl command, const std::vector<u8>& input,      case IoctlCommand::IocGetCharacteristicsCommand:          return GetCharacteristics(input, output, output2, version);      case IoctlCommand::IocGetTPCMasksCommand: -        return GetTPCMasks(input, output); +        return GetTPCMasks(input, output, output2, version);      case IoctlCommand::IocGetActiveSlotMaskCommand:          return GetActiveSlotMask(input, output);      case IoctlCommand::IocZcullGetCtxSizeCommand: @@ -98,17 +98,22 @@ u32 nvhost_ctrl_gpu::GetCharacteristics(const std::vector<u8>& input, std::vecto      return 0;  } -u32 nvhost_ctrl_gpu::GetTPCMasks(const std::vector<u8>& input, std::vector<u8>& 
output) { +u32 nvhost_ctrl_gpu::GetTPCMasks(const std::vector<u8>& input, std::vector<u8>& output, +                                 std::vector<u8>& output2, IoctlVersion version) {      IoctlGpuGetTpcMasksArgs params{};      std::memcpy(&params, input.data(), input.size()); -    LOG_INFO(Service_NVDRV, "called, mask=0x{:X}, mask_buf_addr=0x{:X}", params.mask_buf_size, -             params.mask_buf_addr); -    // TODO(ogniK): Confirm value on hardware -    if (params.mask_buf_size) -        params.tpc_mask_size = 4 * 1; // 4 * num_gpc -    else -        params.tpc_mask_size = 0; -    std::memcpy(output.data(), &params, sizeof(params)); +    LOG_DEBUG(Service_NVDRV, "called, mask_buffer_size=0x{:X}", params.mask_buffer_size); +    if (params.mask_buffer_size != 0) { +        params.tcp_mask = 3; +    } + +    if (version == IoctlVersion::Version3) { +        std::memcpy(output.data(), input.data(), output.size()); +        std::memcpy(output2.data(), &params.tcp_mask, output2.size()); +    } else { +        std::memcpy(output.data(), &params, output.size()); +    } +      return 0;  } diff --git a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h index 07b644ec5..ef60f72ce 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h @@ -92,16 +92,11 @@ private:                    "IoctlCharacteristics is incorrect size");      struct IoctlGpuGetTpcMasksArgs { -        /// [in]  TPC mask buffer size reserved by userspace. Should be at least -        /// sizeof(__u32) * fls(gpc_mask) to receive TPC mask for each GPC. -        /// [out] full kernel buffer size -        u32_le mask_buf_size; -        u32_le reserved; - -        /// [in]  pointer to TPC mask buffer. It will receive one 32-bit TPC mask per GPC or 0 if -        /// GPC is not enabled or not present. This parameter is ignored if mask_buf_size is 0. 
-        u64_le mask_buf_addr; -        u64_le tpc_mask_size; // Nintendo add this? +        u32_le mask_buffer_size{}; +        INSERT_PADDING_WORDS(1); +        u64_le mask_buffer_address{}; +        u32_le tcp_mask{}; +        INSERT_PADDING_WORDS(1);      };      static_assert(sizeof(IoctlGpuGetTpcMasksArgs) == 24,                    "IoctlGpuGetTpcMasksArgs is incorrect size"); @@ -166,7 +161,8 @@ private:      u32 GetCharacteristics(const std::vector<u8>& input, std::vector<u8>& output,                             std::vector<u8>& output2, IoctlVersion version); -    u32 GetTPCMasks(const std::vector<u8>& input, std::vector<u8>& output); +    u32 GetTPCMasks(const std::vector<u8>& input, std::vector<u8>& output, std::vector<u8>& output2, +                    IoctlVersion version);      u32 GetActiveSlotMask(const std::vector<u8>& input, std::vector<u8>& output);      u32 ZCullGetCtxSize(const std::vector<u8>& input, std::vector<u8>& output);      u32 ZCullGetInfo(const std::vector<u8>& input, std::vector<u8>& output); diff --git a/src/core/settings.h b/src/core/settings.h index 9d916d5cb..33e1e06cd 100644 --- a/src/core/settings.h +++ b/src/core/settings.h @@ -474,6 +474,7 @@ struct Values {      bool reporting_services;      bool quest_flag;      bool disable_cpu_opt; +    bool disable_macro_jit;      // BCAT      std::string bcat_backend; diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index d6ee82836..39d5d8401 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -25,6 +25,12 @@ add_library(video_core STATIC      engines/shader_bytecode.h      engines/shader_header.h      engines/shader_type.h +    macro/macro.cpp +    macro/macro.h +    macro/macro_interpreter.cpp +    macro/macro_interpreter.h +    macro/macro_jit_x64.cpp +    macro/macro_jit_x64.h      fence_manager.h      gpu.cpp      gpu.h @@ -36,8 +42,6 @@ add_library(video_core STATIC      gpu_thread.h      guest_driver.cpp      guest_driver.h 
-    macro_interpreter.cpp -    macro_interpreter.h      memory_manager.cpp      memory_manager.h      morton.cpp @@ -45,8 +49,6 @@ add_library(video_core STATIC      query_cache.h      rasterizer_accelerated.cpp      rasterizer_accelerated.h -    rasterizer_cache.cpp -    rasterizer_cache.h      rasterizer_interface.h      renderer_base.cpp      renderer_base.h @@ -89,6 +91,7 @@ add_library(video_core STATIC      renderer_opengl/utils.h      sampler_cache.cpp      sampler_cache.h +    shader_cache.h      shader/decode/arithmetic.cpp      shader/decode/arithmetic_immediate.cpp      shader/decode/bfe.cpp diff --git a/src/video_core/buffer_cache/buffer_block.h b/src/video_core/buffer_cache/buffer_block.h index e35ee0b67..e64170e66 100644 --- a/src/video_core/buffer_cache/buffer_block.h +++ b/src/video_core/buffer_cache/buffer_block.h @@ -15,48 +15,47 @@ namespace VideoCommon {  class BufferBlock {  public: -    bool Overlaps(const VAddr start, const VAddr end) const { +    bool Overlaps(VAddr start, VAddr end) const {          return (cpu_addr < end) && (cpu_addr_end > start);      } -    bool IsInside(const VAddr other_start, const VAddr other_end) const { +    bool IsInside(VAddr other_start, VAddr other_end) const {          return cpu_addr <= other_start && other_end <= cpu_addr_end;      } -    std::size_t GetOffset(const VAddr in_addr) { +    std::size_t Offset(VAddr in_addr) const {          return static_cast<std::size_t>(in_addr - cpu_addr);      } -    VAddr GetCpuAddr() const { +    VAddr CpuAddr() const {          return cpu_addr;      } -    VAddr GetCpuAddrEnd() const { +    VAddr CpuAddrEnd() const {          return cpu_addr_end;      } -    void SetCpuAddr(const VAddr new_addr) { +    void SetCpuAddr(VAddr new_addr) {          cpu_addr = new_addr;          cpu_addr_end = new_addr + size;      } -    std::size_t GetSize() const { +    std::size_t Size() const {          return size;      } -    void SetEpoch(u64 new_epoch) { -        epoch = new_epoch; 
+    u64 Epoch() const { +        return epoch;      } -    u64 GetEpoch() { -        return epoch; +    void SetEpoch(u64 new_epoch) { +        epoch = new_epoch;      }  protected: -    explicit BufferBlock(VAddr cpu_addr, const std::size_t size) : size{size} { -        SetCpuAddr(cpu_addr); +    explicit BufferBlock(VAddr cpu_addr_, std::size_t size_) : size{size_} { +        SetCpuAddr(cpu_addr_);      } -    ~BufferBlock() = default;  private:      VAddr cpu_addr{}; diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index b88fce2cd..308d8b55f 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -30,12 +30,16 @@  namespace VideoCommon { -template <typename OwnerBuffer, typename BufferType, typename StreamBuffer> +template <typename Buffer, typename BufferType, typename StreamBuffer>  class BufferCache {      using IntervalSet = boost::icl::interval_set<VAddr>;      using IntervalType = typename IntervalSet::interval_type;      using VectorMapInterval = boost::container::small_vector<MapInterval*, 1>; +    static constexpr u64 WRITE_PAGE_BIT = 11; +    static constexpr u64 BLOCK_PAGE_BITS = 21; +    static constexpr u64 BLOCK_PAGE_SIZE = 1ULL << BLOCK_PAGE_BITS; +  public:      using BufferInfo = std::pair<BufferType, u64>; @@ -82,7 +86,7 @@ public:              }          } -        OwnerBuffer block = GetBlock(cpu_addr, size); +        Buffer* const block = GetBlock(cpu_addr, size);          MapInterval* const map = MapAddress(block, gpu_addr, cpu_addr, size);          if (!map) {              return {GetEmptyBuffer(size), 0}; @@ -98,7 +102,7 @@ public:              }          } -        return {ToHandle(block), static_cast<u64>(block->GetOffset(cpu_addr))}; +        return {block->Handle(), static_cast<u64>(block->Offset(cpu_addr))};      }      /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset. 
@@ -110,31 +114,37 @@ public:          });      } -    void Map(std::size_t max_size) { +    /// Prepares the buffer cache for data uploading +    /// @param max_size Maximum number of bytes that will be uploaded +    /// @return True when a stream buffer invalidation was required, false otherwise +    bool Map(std::size_t max_size) {          std::lock_guard lock{mutex}; +        bool invalidated;          std::tie(buffer_ptr, buffer_offset_base, invalidated) = stream_buffer->Map(max_size, 4);          buffer_offset = buffer_offset_base; + +        return invalidated;      } -    /// Finishes the upload stream, returns true on bindings invalidation. -    bool Unmap() { +    /// Finishes the upload stream +    void Unmap() {          std::lock_guard lock{mutex}; -          stream_buffer->Unmap(buffer_offset - buffer_offset_base); -        return std::exchange(invalidated, false);      } +    /// Function called at the end of each frame, intended for deferred operations      void TickFrame() {          ++epoch; +          while (!pending_destruction.empty()) {              // Delay at least 4 frames before destruction.              // This is due to triple buffering happening on some drivers.              
static constexpr u64 epochs_to_destroy = 5; -            if (pending_destruction.front()->GetEpoch() + epochs_to_destroy > epoch) { +            if (pending_destruction.front()->Epoch() + epochs_to_destroy > epoch) {                  break;              } -            pending_destruction.pop_front(); +            pending_destruction.pop();          }      } @@ -249,23 +259,21 @@ public:  protected:      explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system, -                         std::unique_ptr<StreamBuffer> stream_buffer) -        : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer)}, -          stream_buffer_handle{this->stream_buffer->GetHandle()} {} +                         std::unique_ptr<StreamBuffer> stream_buffer_) +        : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer_)}, +          stream_buffer_handle{stream_buffer->Handle()} {}      ~BufferCache() = default; -    virtual BufferType ToHandle(const OwnerBuffer& storage) = 0; +    virtual std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) = 0; -    virtual OwnerBuffer CreateBlock(VAddr cpu_addr, std::size_t size) = 0; - -    virtual void UploadBlockData(const OwnerBuffer& buffer, std::size_t offset, std::size_t size, +    virtual void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,                                   const u8* data) = 0; -    virtual void DownloadBlockData(const OwnerBuffer& buffer, std::size_t offset, std::size_t size, +    virtual void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,                                     u8* data) = 0; -    virtual void CopyBlock(const OwnerBuffer& src, const OwnerBuffer& dst, std::size_t src_offset, +    virtual void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,                             std::size_t dst_offset, std::size_t size) = 0;      virtual BufferInfo 
ConstBufferUpload(const void* raw_pointer, std::size_t size) { @@ -321,7 +329,7 @@ protected:      }  private: -    MapInterval* MapAddress(const OwnerBuffer& block, GPUVAddr gpu_addr, VAddr cpu_addr, +    MapInterval* MapAddress(const Buffer* block, GPUVAddr gpu_addr, VAddr cpu_addr,                              std::size_t size) {          const VectorMapInterval overlaps = GetMapsInRange(cpu_addr, size);          if (overlaps.empty()) { @@ -329,11 +337,11 @@ private:              const VAddr cpu_addr_end = cpu_addr + size;              if (memory_manager.IsGranularRange(gpu_addr, size)) {                  u8* host_ptr = memory_manager.GetPointer(gpu_addr); -                UploadBlockData(block, block->GetOffset(cpu_addr), size, host_ptr); +                UploadBlockData(*block, block->Offset(cpu_addr), size, host_ptr);              } else {                  staging_buffer.resize(size);                  memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size); -                UploadBlockData(block, block->GetOffset(cpu_addr), size, staging_buffer.data()); +                UploadBlockData(*block, block->Offset(cpu_addr), size, staging_buffer.data());              }              return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr));          } @@ -376,7 +384,7 @@ private:          return map;      } -    void UpdateBlock(const OwnerBuffer& block, VAddr start, VAddr end, +    void UpdateBlock(const Buffer* block, VAddr start, VAddr end,                       const VectorMapInterval& overlaps) {          const IntervalType base_interval{start, end};          IntervalSet interval_set{}; @@ -386,13 +394,13 @@ private:              interval_set.subtract(subtract);          }          for (auto& interval : interval_set) { -            std::size_t size = interval.upper() - interval.lower(); -            if (size > 0) { -                staging_buffer.resize(size); -                system.Memory().ReadBlockUnsafe(interval.lower(), 
staging_buffer.data(), size); -                UploadBlockData(block, block->GetOffset(interval.lower()), size, -                                staging_buffer.data()); +            const std::size_t size = interval.upper() - interval.lower(); +            if (size == 0) { +                continue;              } +            staging_buffer.resize(size); +            system.Memory().ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size); +            UploadBlockData(*block, block->Offset(interval.lower()), size, staging_buffer.data());          }      } @@ -422,10 +430,14 @@ private:      }      void FlushMap(MapInterval* map) { +        const auto it = blocks.find(map->start >> BLOCK_PAGE_BITS); +        ASSERT_OR_EXECUTE(it != blocks.end(), return;); + +        std::shared_ptr<Buffer> block = it->second; +          const std::size_t size = map->end - map->start; -        OwnerBuffer block = blocks[map->start >> block_page_bits];          staging_buffer.resize(size); -        DownloadBlockData(block, block->GetOffset(map->start), size, staging_buffer.data()); +        DownloadBlockData(*block, block->Offset(map->start), size, staging_buffer.data());          system.Memory().WriteBlockUnsafe(map->start, staging_buffer.data(), size);          map->MarkAsModified(false, 0);      } @@ -448,97 +460,89 @@ private:          buffer_offset = offset_aligned;      } -    OwnerBuffer EnlargeBlock(OwnerBuffer buffer) { -        const std::size_t old_size = buffer->GetSize(); -        const std::size_t new_size = old_size + block_page_size; -        const VAddr cpu_addr = buffer->GetCpuAddr(); -        OwnerBuffer new_buffer = CreateBlock(cpu_addr, new_size); -        CopyBlock(buffer, new_buffer, 0, 0, old_size); -        buffer->SetEpoch(epoch); -        pending_destruction.push_back(buffer); +    std::shared_ptr<Buffer> EnlargeBlock(std::shared_ptr<Buffer> buffer) { +        const std::size_t old_size = buffer->Size(); +        const std::size_t new_size = old_size 
+ BLOCK_PAGE_SIZE; +        const VAddr cpu_addr = buffer->CpuAddr(); +        std::shared_ptr<Buffer> new_buffer = CreateBlock(cpu_addr, new_size); +        CopyBlock(*buffer, *new_buffer, 0, 0, old_size); +        QueueDestruction(std::move(buffer)); +          const VAddr cpu_addr_end = cpu_addr + new_size - 1; -        u64 page_start = cpu_addr >> block_page_bits; -        const u64 page_end = cpu_addr_end >> block_page_bits; -        while (page_start <= page_end) { -            blocks[page_start] = new_buffer; -            ++page_start; +        const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; +        for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { +            blocks.insert_or_assign(page_start, new_buffer);          } +          return new_buffer;      } -    OwnerBuffer MergeBlocks(OwnerBuffer first, OwnerBuffer second) { -        const std::size_t size_1 = first->GetSize(); -        const std::size_t size_2 = second->GetSize(); -        const VAddr first_addr = first->GetCpuAddr(); -        const VAddr second_addr = second->GetCpuAddr(); +    std::shared_ptr<Buffer> MergeBlocks(std::shared_ptr<Buffer> first, +                                        std::shared_ptr<Buffer> second) { +        const std::size_t size_1 = first->Size(); +        const std::size_t size_2 = second->Size(); +        const VAddr first_addr = first->CpuAddr(); +        const VAddr second_addr = second->CpuAddr();          const VAddr new_addr = std::min(first_addr, second_addr);          const std::size_t new_size = size_1 + size_2; -        OwnerBuffer new_buffer = CreateBlock(new_addr, new_size); -        CopyBlock(first, new_buffer, 0, new_buffer->GetOffset(first_addr), size_1); -        CopyBlock(second, new_buffer, 0, new_buffer->GetOffset(second_addr), size_2); -        first->SetEpoch(epoch); -        second->SetEpoch(epoch); -        pending_destruction.push_back(first); -        pending_destruction.push_back(second); + +     
   std::shared_ptr<Buffer> new_buffer = CreateBlock(new_addr, new_size); +        CopyBlock(*first, *new_buffer, 0, new_buffer->Offset(first_addr), size_1); +        CopyBlock(*second, *new_buffer, 0, new_buffer->Offset(second_addr), size_2); +        QueueDestruction(std::move(first)); +        QueueDestruction(std::move(second)); +          const VAddr cpu_addr_end = new_addr + new_size - 1; -        u64 page_start = new_addr >> block_page_bits; -        const u64 page_end = cpu_addr_end >> block_page_bits; -        while (page_start <= page_end) { -            blocks[page_start] = new_buffer; -            ++page_start; +        const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; +        for (u64 page_start = new_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { +            blocks.insert_or_assign(page_start, new_buffer);          }          return new_buffer;      } -    OwnerBuffer GetBlock(const VAddr cpu_addr, const std::size_t size) { -        OwnerBuffer found; +    Buffer* GetBlock(VAddr cpu_addr, std::size_t size) { +        std::shared_ptr<Buffer> found; +          const VAddr cpu_addr_end = cpu_addr + size - 1; -        u64 page_start = cpu_addr >> block_page_bits; -        const u64 page_end = cpu_addr_end >> block_page_bits; -        while (page_start <= page_end) { +        const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; +        for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {              auto it = blocks.find(page_start);              if (it == blocks.end()) {                  if (found) {                      found = EnlargeBlock(found); -                } else { -                    const VAddr start_addr = (page_start << block_page_bits); -                    found = CreateBlock(start_addr, block_page_size); -                    blocks[page_start] = found; -                } -            } else { -                if (found) { -                    if (found == it->second) { -     
                   ++page_start; -                        continue; -                    } -                    found = MergeBlocks(found, it->second); -                } else { -                    found = it->second; +                    continue;                  } +                const VAddr start_addr = page_start << BLOCK_PAGE_BITS; +                found = CreateBlock(start_addr, BLOCK_PAGE_SIZE); +                blocks.insert_or_assign(page_start, found); +                continue; +            } +            if (!found) { +                found = it->second; +                continue; +            } +            if (found != it->second) { +                found = MergeBlocks(std::move(found), it->second);              } -            ++page_start;          } -        return found; +        return found.get();      } -    void MarkRegionAsWritten(const VAddr start, const VAddr end) { -        u64 page_start = start >> write_page_bit; -        const u64 page_end = end >> write_page_bit; -        while (page_start <= page_end) { +    void MarkRegionAsWritten(VAddr start, VAddr end) { +        const u64 page_end = end >> WRITE_PAGE_BIT; +        for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {              auto it = written_pages.find(page_start);              if (it != written_pages.end()) {                  it->second = it->second + 1;              } else { -                written_pages[page_start] = 1; +                written_pages.insert_or_assign(page_start, 1);              } -            ++page_start;          }      } -    void UnmarkRegionAsWritten(const VAddr start, const VAddr end) { -        u64 page_start = start >> write_page_bit; -        const u64 page_end = end >> write_page_bit; -        while (page_start <= page_end) { +    void UnmarkRegionAsWritten(VAddr start, VAddr end) { +        const u64 page_end = end >> WRITE_PAGE_BIT; +        for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= 
page_end; ++page_start) {              auto it = written_pages.find(page_start);              if (it != written_pages.end()) {                  if (it->second > 1) { @@ -547,22 +551,24 @@ private:                      written_pages.erase(it);                  }              } -            ++page_start;          }      } -    bool IsRegionWritten(const VAddr start, const VAddr end) const { -        u64 page_start = start >> write_page_bit; -        const u64 page_end = end >> write_page_bit; -        while (page_start <= page_end) { +    bool IsRegionWritten(VAddr start, VAddr end) const { +        const u64 page_end = end >> WRITE_PAGE_BIT; +        for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {              if (written_pages.count(page_start) > 0) {                  return true;              } -            ++page_start;          }          return false;      } +    void QueueDestruction(std::shared_ptr<Buffer> buffer) { +        buffer->SetEpoch(epoch); +        pending_destruction.push(std::move(buffer)); +    } +      void MarkForAsyncFlush(MapInterval* map) {          if (!uncommitted_flushes) {              uncommitted_flushes = std::make_shared<std::unordered_set<MapInterval*>>(); @@ -574,9 +580,7 @@ private:      Core::System& system;      std::unique_ptr<StreamBuffer> stream_buffer; -    BufferType stream_buffer_handle{}; - -    bool invalidated = false; +    BufferType stream_buffer_handle;      u8* buffer_ptr = nullptr;      u64 buffer_offset = 0; @@ -586,18 +590,15 @@ private:      boost::intrusive::set<MapInterval, boost::intrusive::compare<MapIntervalCompare>>          mapped_addresses; -    static constexpr u64 write_page_bit = 11;      std::unordered_map<u64, u32> written_pages; +    std::unordered_map<u64, std::shared_ptr<Buffer>> blocks; -    static constexpr u64 block_page_bits = 21; -    static constexpr u64 block_page_size = 1ULL << block_page_bits; -    std::unordered_map<u64, OwnerBuffer> blocks; - -    
std::list<OwnerBuffer> pending_destruction; +    std::queue<std::shared_ptr<Buffer>> pending_destruction;      u64 epoch = 0;      u64 modified_ticks = 0;      std::vector<u8> staging_buffer; +      std::list<MapInterval*> marked_for_unregister;      std::shared_ptr<std::unordered_set<MapInterval*>> uncommitted_flushes; diff --git a/src/video_core/engines/const_buffer_engine_interface.h b/src/video_core/engines/const_buffer_engine_interface.h index ebe139504..f46e81bb7 100644 --- a/src/video_core/engines/const_buffer_engine_interface.h +++ b/src/video_core/engines/const_buffer_engine_interface.h @@ -93,6 +93,7 @@ public:      virtual SamplerDescriptor AccessBoundSampler(ShaderType stage, u64 offset) const = 0;      virtual SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer,                                                      u64 offset) const = 0; +    virtual SamplerDescriptor AccessSampler(u32 handle) const = 0;      virtual u32 GetBoundBuffer() const = 0;      virtual VideoCore::GuestDriverProfile& AccessGuestDriverProfile() = 0; diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp index f6237fc6a..a82b06a38 100644 --- a/src/video_core/engines/kepler_compute.cpp +++ b/src/video_core/engines/kepler_compute.cpp @@ -92,8 +92,11 @@ SamplerDescriptor KeplerCompute::AccessBindlessSampler(ShaderType stage, u64 con      ASSERT(stage == ShaderType::Compute);      const auto& tex_info_buffer = launch_description.const_buffer_config[const_buffer];      const GPUVAddr tex_info_address = tex_info_buffer.Address() + offset; +    return AccessSampler(memory_manager.Read<u32>(tex_info_address)); +} -    const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)}; +SamplerDescriptor KeplerCompute::AccessSampler(u32 handle) const { +    const Texture::TextureHandle tex_handle{handle};      const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle);      SamplerDescriptor 
result = SamplerDescriptor::FromTIC(tex_info.tic);      result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value()); diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h index 18ceedfaf..b7f668d88 100644 --- a/src/video_core/engines/kepler_compute.h +++ b/src/video_core/engines/kepler_compute.h @@ -219,6 +219,8 @@ public:      SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer,                                              u64 offset) const override; +    SamplerDescriptor AccessSampler(u32 handle) const override; +      u32 GetBoundBuffer() const override {          return regs.tex_cb_index;      } diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 13ef2e42d..ea3c8a963 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -25,9 +25,8 @@ constexpr u32 MacroRegistersStart = 0xE00;  Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& rasterizer,                       MemoryManager& memory_manager)      : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager}, -      macro_interpreter{*this}, upload_state{memory_manager, regs.upload} { +      macro_engine{GetMacroEngine(*this)}, upload_state{memory_manager, regs.upload} {      dirty.flags.flip(); -      InitializeRegisterDefaults();  } @@ -120,7 +119,7 @@ void Maxwell3D::InitializeRegisterDefaults() {      mme_inline[MAXWELL3D_REG_INDEX(index_array.count)] = true;  } -void Maxwell3D::CallMacroMethod(u32 method, std::size_t num_parameters, const u32* parameters) { +void Maxwell3D::CallMacroMethod(u32 method, const std::vector<u32>& parameters) {      // Reset the current macro.      
executing_macro = 0; @@ -129,7 +128,7 @@ void Maxwell3D::CallMacroMethod(u32 method, std::size_t num_parameters, const u3          ((method - MacroRegistersStart) >> 1) % static_cast<u32>(macro_positions.size());      // Execute the current macro. -    macro_interpreter.Execute(macro_positions[entry], num_parameters, parameters); +    macro_engine->Execute(macro_positions[entry], parameters);      if (mme_draw.current_mode != MMEDrawMode::Undefined) {          FlushMMEInlineDraw();      } @@ -165,7 +164,7 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {          // Call the macro when there are no more parameters in the command buffer          if (is_last_call) { -            CallMacroMethod(executing_macro, macro_params.size(), macro_params.data()); +            CallMacroMethod(executing_macro, macro_params);              macro_params.clear();          }          return; @@ -201,7 +200,7 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {          break;      }      case MAXWELL3D_REG_INDEX(macros.data): { -        ProcessMacroUpload(arg); +        macro_engine->AddCode(regs.macros.upload_address, arg);          break;      }      case MAXWELL3D_REG_INDEX(macros.bind): { @@ -310,7 +309,7 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount,          // Call the macro when there are no more parameters in the command buffer          if (amount == methods_pending) { -            CallMacroMethod(executing_macro, macro_params.size(), macro_params.data()); +            CallMacroMethod(executing_macro, macro_params);              macro_params.clear();          }          return; @@ -424,9 +423,7 @@ void Maxwell3D::FlushMMEInlineDraw() {  }  void Maxwell3D::ProcessMacroUpload(u32 data) { -    ASSERT_MSG(regs.macros.upload_address < macro_memory.size(), -               "upload_address exceeded macro_memory size!"); -    macro_memory[regs.macros.upload_address++] = data; +    
macro_engine->AddCode(regs.macros.upload_address++, data);  }  void Maxwell3D::ProcessMacroBind(u32 data) { @@ -743,8 +740,11 @@ SamplerDescriptor Maxwell3D::AccessBindlessSampler(ShaderType stage, u64 const_b      const auto& shader = state.shader_stages[static_cast<std::size_t>(stage)];      const auto& tex_info_buffer = shader.const_buffers[const_buffer];      const GPUVAddr tex_info_address = tex_info_buffer.address + offset; +    return AccessSampler(memory_manager.Read<u32>(tex_info_address)); +} -    const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)}; +SamplerDescriptor Maxwell3D::AccessSampler(u32 handle) const { +    const Texture::TextureHandle tex_handle{handle};      const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle);      SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic);      result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value()); diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 05dd6b39b..d5fe25065 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -23,7 +23,7 @@  #include "video_core/engines/engine_upload.h"  #include "video_core/engines/shader_type.h"  #include "video_core/gpu.h" -#include "video_core/macro_interpreter.h" +#include "video_core/macro/macro.h"  #include "video_core/textures/texture.h"  namespace Core { @@ -598,6 +598,7 @@ public:                  BitField<4, 3, u32> block_height;                  BitField<8, 3, u32> block_depth;                  BitField<12, 1, InvMemoryLayout> type; +                BitField<16, 1, u32> is_3d;              } memory_layout;              union {                  BitField<0, 16, u32> layers; @@ -1403,6 +1404,8 @@ public:      SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer,                                              u64 offset) const override; +    SamplerDescriptor AccessSampler(u32 handle) const override; 
+      u32 GetBoundBuffer() const override {          return regs.tex_cb_index;      } @@ -1411,15 +1414,6 @@ public:      const VideoCore::GuestDriverProfile& AccessGuestDriverProfile() const override; -    /// Memory for macro code - it's undetermined how big this is, however 1MB is much larger than -    /// we've seen used. -    using MacroMemory = std::array<u32, 0x40000>; - -    /// Gets a reference to macro memory. -    const MacroMemory& GetMacroMemory() const { -        return macro_memory; -    } -      bool ShouldExecute() const {          return execute_on;      } @@ -1468,16 +1462,13 @@ private:      std::array<bool, Regs::NUM_REGS> mme_inline{}; -    /// Memory for macro code -    MacroMemory macro_memory; -      /// Macro method that is currently being executed / being fed parameters.      u32 executing_macro = 0;      /// Parameters that have been submitted to the macro call so far.      std::vector<u32> macro_params;      /// Interpreter for the macro codes uploaded to the GPU. -    MacroInterpreter macro_interpreter; +    std::unique_ptr<MacroEngine> macro_engine;      static constexpr u32 null_cb_data = 0xFFFFFFFF;      struct { @@ -1506,7 +1497,7 @@ private:       * @param num_parameters Number of arguments       * @param parameters Arguments to the method call       */ -    void CallMacroMethod(u32 method, std::size_t num_parameters, const u32* parameters); +    void CallMacroMethod(u32 method, const std::vector<u32>& parameters);      /// Handles writes to the macro uploading register.      void ProcessMacroUpload(u32 data); diff --git a/src/video_core/macro/macro.cpp b/src/video_core/macro/macro.cpp new file mode 100644 index 000000000..89077a2d8 --- /dev/null +++ b/src/video_core/macro/macro.cpp @@ -0,0 +1,45 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include "common/assert.h" +#include "common/logging/log.h" +#include "core/settings.h" +#include "video_core/macro/macro.h" +#include "video_core/macro/macro_interpreter.h" +#include "video_core/macro/macro_jit_x64.h" + +namespace Tegra { + +void MacroEngine::AddCode(u32 method, u32 data) { +    uploaded_macro_code[method].push_back(data); +} + +void MacroEngine::Execute(u32 method, const std::vector<u32>& parameters) { +    auto compiled_macro = macro_cache.find(method); +    if (compiled_macro != macro_cache.end()) { +        compiled_macro->second->Execute(parameters, method); +    } else { +        // Macro not compiled, check if it's uploaded and if so, compile it +        auto macro_code = uploaded_macro_code.find(method); +        if (macro_code == uploaded_macro_code.end()) { +            UNREACHABLE_MSG("Macro 0x{0:x} was not uploaded", method); +            return; +        } +        macro_cache[method] = Compile(macro_code->second); +        macro_cache[method]->Execute(parameters, method); +    } +} + +std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d) { +    if (Settings::values.disable_macro_jit) { +        return std::make_unique<MacroInterpreter>(maxwell3d); +    } +#ifdef ARCHITECTURE_x86_64 +    return std::make_unique<MacroJITx64>(maxwell3d); +#else +    return std::make_unique<MacroInterpreter>(maxwell3d); +#endif +} + +} // namespace Tegra diff --git a/src/video_core/macro/macro.h b/src/video_core/macro/macro.h new file mode 100644 index 000000000..b76ed891f --- /dev/null +++ b/src/video_core/macro/macro.h @@ -0,0 +1,128 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include <memory> +#include <unordered_map> +#include <vector> +#include "common/bit_field.h" +#include "common/common_types.h" + +namespace Tegra { +namespace Engines { +class Maxwell3D; +} +namespace Macro { +constexpr std::size_t NUM_MACRO_REGISTERS = 8; +enum class Operation : u32 { +    ALU = 0, +    AddImmediate = 1, +    ExtractInsert = 2, +    ExtractShiftLeftImmediate = 3, +    ExtractShiftLeftRegister = 4, +    Read = 5, +    Unused = 6, // This operation doesn't seem to be a valid encoding. +    Branch = 7, +}; + +enum class ALUOperation : u32 { +    Add = 0, +    AddWithCarry = 1, +    Subtract = 2, +    SubtractWithBorrow = 3, +    // Operations 4-7 don't seem to be valid encodings. +    Xor = 8, +    Or = 9, +    And = 10, +    AndNot = 11, +    Nand = 12 +}; + +enum class ResultOperation : u32 { +    IgnoreAndFetch = 0, +    Move = 1, +    MoveAndSetMethod = 2, +    FetchAndSend = 3, +    MoveAndSend = 4, +    FetchAndSetMethod = 5, +    MoveAndSetMethodFetchAndSend = 6, +    MoveAndSetMethodSend = 7 +}; + +enum class BranchCondition : u32 { +    Zero = 0, +    NotZero = 1, +}; + +union Opcode { +    u32 raw; +    BitField<0, 3, Operation> operation; +    BitField<4, 3, ResultOperation> result_operation; +    BitField<4, 1, BranchCondition> branch_condition; +    // If set on a branch, then the branch doesn't have a delay slot. +    BitField<5, 1, u32> branch_annul; +    BitField<7, 1, u32> is_exit; +    BitField<8, 3, u32> dst; +    BitField<11, 3, u32> src_a; +    BitField<14, 3, u32> src_b; +    // The signed immediate overlaps the second source operand and the alu operation. 
+    BitField<14, 18, s32> immediate; + +    BitField<17, 5, ALUOperation> alu_operation; + +    // Bitfield instructions data +    BitField<17, 5, u32> bf_src_bit; +    BitField<22, 5, u32> bf_size; +    BitField<27, 5, u32> bf_dst_bit; + +    u32 GetBitfieldMask() const { +        return (1 << bf_size) - 1; +    } + +    s32 GetBranchTarget() const { +        return static_cast<s32>(immediate * sizeof(u32)); +    } +}; + +union MethodAddress { +    u32 raw; +    BitField<0, 12, u32> address; +    BitField<12, 6, u32> increment; +}; + +} // namespace Macro + +class CachedMacro { +public: +    virtual ~CachedMacro() = default; +    /** +     * Executes the macro code with the specified input parameters. +     * @param code The macro byte code to execute +     * @param parameters The parameters of the macro +     */ +    virtual void Execute(const std::vector<u32>& parameters, u32 method) = 0; +}; + +class MacroEngine { +public: +    virtual ~MacroEngine() = default; + +    // Store the uploaded macro code to compile them when they're called. 
+    void AddCode(u32 method, u32 data); + +    // Compiles the macro if its not in the cache, and executes the compiled macro +    void Execute(u32 method, const std::vector<u32>& parameters); + +protected: +    virtual std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) = 0; + +private: +    std::unordered_map<u32, std::unique_ptr<CachedMacro>> macro_cache; +    std::unordered_map<u32, std::vector<u32>> uploaded_macro_code; +}; + +std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d); + +} // namespace Tegra diff --git a/src/video_core/macro_interpreter.cpp b/src/video_core/macro/macro_interpreter.cpp index 947364928..5edff27aa 100644 --- a/src/video_core/macro_interpreter.cpp +++ b/src/video_core/macro/macro_interpreter.cpp @@ -1,4 +1,4 @@ -// Copyright 2018 yuzu Emulator Project +// Copyright 2020 yuzu Emulator Project  // Licensed under GPLv2 or any later version  // Refer to the license.txt file included. @@ -6,109 +6,46 @@  #include "common/logging/log.h"  #include "common/microprofile.h"  #include "video_core/engines/maxwell_3d.h" -#include "video_core/macro_interpreter.h" +#include "video_core/macro/macro_interpreter.h"  MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192));  namespace Tegra { -namespace { -enum class Operation : u32 { -    ALU = 0, -    AddImmediate = 1, -    ExtractInsert = 2, -    ExtractShiftLeftImmediate = 3, -    ExtractShiftLeftRegister = 4, -    Read = 5, -    Unused = 6, // This operation doesn't seem to be a valid encoding. -    Branch = 7, -}; -} // Anonymous namespace - -enum class MacroInterpreter::ALUOperation : u32 { -    Add = 0, -    AddWithCarry = 1, -    Subtract = 2, -    SubtractWithBorrow = 3, -    // Operations 4-7 don't seem to be valid encodings. 
-    Xor = 8, -    Or = 9, -    And = 10, -    AndNot = 11, -    Nand = 12 -}; - -enum class MacroInterpreter::ResultOperation : u32 { -    IgnoreAndFetch = 0, -    Move = 1, -    MoveAndSetMethod = 2, -    FetchAndSend = 3, -    MoveAndSend = 4, -    FetchAndSetMethod = 5, -    MoveAndSetMethodFetchAndSend = 6, -    MoveAndSetMethodSend = 7 -}; - -enum class MacroInterpreter::BranchCondition : u32 { -    Zero = 0, -    NotZero = 1, -}; - -union MacroInterpreter::Opcode { -    u32 raw; -    BitField<0, 3, Operation> operation; -    BitField<4, 3, ResultOperation> result_operation; -    BitField<4, 1, BranchCondition> branch_condition; -    // If set on a branch, then the branch doesn't have a delay slot. -    BitField<5, 1, u32> branch_annul; -    BitField<7, 1, u32> is_exit; -    BitField<8, 3, u32> dst; -    BitField<11, 3, u32> src_a; -    BitField<14, 3, u32> src_b; -    // The signed immediate overlaps the second source operand and the alu operation. -    BitField<14, 18, s32> immediate; - -    BitField<17, 5, ALUOperation> alu_operation; - -    // Bitfield instructions data -    BitField<17, 5, u32> bf_src_bit; -    BitField<22, 5, u32> bf_size; -    BitField<27, 5, u32> bf_dst_bit; - -    u32 GetBitfieldMask() const { -        return (1 << bf_size) - 1; -    } +MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} -    s32 GetBranchTarget() const { -        return static_cast<s32>(immediate * sizeof(u32)); -    } -}; +std::unique_ptr<CachedMacro> MacroInterpreter::Compile(const std::vector<u32>& code) { +    return std::make_unique<MacroInterpreterImpl>(maxwell3d, code); +} -MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} +MacroInterpreterImpl::MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d, +                                           const std::vector<u32>& code) +    : maxwell3d(maxwell3d), code(code) {} -void MacroInterpreter::Execute(u32 offset, std::size_t 
num_parameters, const u32* parameters) { +void MacroInterpreterImpl::Execute(const std::vector<u32>& parameters, u32 method) {      MICROPROFILE_SCOPE(MacroInterp);      Reset();      registers[1] = parameters[0]; +    num_parameters = parameters.size();      if (num_parameters > parameters_capacity) {          parameters_capacity = num_parameters;          this->parameters = std::make_unique<u32[]>(num_parameters);      } -    std::memcpy(this->parameters.get(), parameters, num_parameters * sizeof(u32)); +    std::memcpy(this->parameters.get(), parameters.data(), num_parameters * sizeof(u32));      this->num_parameters = num_parameters;      // Execute the code until we hit an exit condition.      bool keep_executing = true;      while (keep_executing) { -        keep_executing = Step(offset, false); +        keep_executing = Step(false);      }      // Assert the the macro used all the input parameters      ASSERT(next_parameter_index == num_parameters);  } -void MacroInterpreter::Reset() { +void MacroInterpreterImpl::Reset() {      registers = {};      pc = 0;      delayed_pc = {}; @@ -120,10 +57,10 @@ void MacroInterpreter::Reset() {      carry_flag = false;  } -bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) { +bool MacroInterpreterImpl::Step(bool is_delay_slot) {      u32 base_address = pc; -    Opcode opcode = GetOpcode(offset); +    Macro::Opcode opcode = GetOpcode();      pc += 4;      // Update the program counter if we were delayed @@ -134,18 +71,18 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {      }      switch (opcode.operation) { -    case Operation::ALU: { +    case Macro::Operation::ALU: {          u32 result = GetALUResult(opcode.alu_operation, GetRegister(opcode.src_a),                                    GetRegister(opcode.src_b));          ProcessResult(opcode.result_operation, opcode.dst, result);          break;      } -    case Operation::AddImmediate: { +    case Macro::Operation::AddImmediate: {          
ProcessResult(opcode.result_operation, opcode.dst,                        GetRegister(opcode.src_a) + opcode.immediate);          break;      } -    case Operation::ExtractInsert: { +    case Macro::Operation::ExtractInsert: {          u32 dst = GetRegister(opcode.src_a);          u32 src = GetRegister(opcode.src_b); @@ -155,7 +92,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {          ProcessResult(opcode.result_operation, opcode.dst, dst);          break;      } -    case Operation::ExtractShiftLeftImmediate: { +    case Macro::Operation::ExtractShiftLeftImmediate: {          u32 dst = GetRegister(opcode.src_a);          u32 src = GetRegister(opcode.src_b); @@ -164,7 +101,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {          ProcessResult(opcode.result_operation, opcode.dst, result);          break;      } -    case Operation::ExtractShiftLeftRegister: { +    case Macro::Operation::ExtractShiftLeftRegister: {          u32 dst = GetRegister(opcode.src_a);          u32 src = GetRegister(opcode.src_b); @@ -173,12 +110,12 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {          ProcessResult(opcode.result_operation, opcode.dst, result);          break;      } -    case Operation::Read: { +    case Macro::Operation::Read: {          u32 result = Read(GetRegister(opcode.src_a) + opcode.immediate);          ProcessResult(opcode.result_operation, opcode.dst, result);          break;      } -    case Operation::Branch: { +    case Macro::Operation::Branch: {          ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid");          u32 value = GetRegister(opcode.src_a);          bool taken = EvaluateBranchCondition(opcode.branch_condition, value); @@ -191,7 +128,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {              delayed_pc = base_address + opcode.GetBranchTarget();              // Execute one more instruction due to the delay slot. 
-            return Step(offset, true); +            return Step(true);          }          break;      } @@ -204,51 +141,44 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {      // cause an exit if it's executed inside a delay slot.      if (opcode.is_exit && !is_delay_slot) {          // Exit has a delay slot, execute the next instruction -        Step(offset, true); +        Step(true);          return false;      }      return true;  } -MacroInterpreter::Opcode MacroInterpreter::GetOpcode(u32 offset) const { -    const auto& macro_memory{maxwell3d.GetMacroMemory()}; -    ASSERT((pc % sizeof(u32)) == 0); -    ASSERT((pc + offset) < macro_memory.size() * sizeof(u32)); -    return {macro_memory[offset + pc / sizeof(u32)]}; -} - -u32 MacroInterpreter::GetALUResult(ALUOperation operation, u32 src_a, u32 src_b) { +u32 MacroInterpreterImpl::GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b) {      switch (operation) { -    case ALUOperation::Add: { +    case Macro::ALUOperation::Add: {          const u64 result{static_cast<u64>(src_a) + src_b};          carry_flag = result > 0xffffffff;          return static_cast<u32>(result);      } -    case ALUOperation::AddWithCarry: { +    case Macro::ALUOperation::AddWithCarry: {          const u64 result{static_cast<u64>(src_a) + src_b + (carry_flag ? 1ULL : 0ULL)};          carry_flag = result > 0xffffffff;          return static_cast<u32>(result);      } -    case ALUOperation::Subtract: { +    case Macro::ALUOperation::Subtract: {          const u64 result{static_cast<u64>(src_a) - src_b};          carry_flag = result < 0x100000000;          return static_cast<u32>(result);      } -    case ALUOperation::SubtractWithBorrow: { +    case Macro::ALUOperation::SubtractWithBorrow: {          const u64 result{static_cast<u64>(src_a) - src_b - (carry_flag ? 
0ULL : 1ULL)};          carry_flag = result < 0x100000000;          return static_cast<u32>(result);      } -    case ALUOperation::Xor: +    case Macro::ALUOperation::Xor:          return src_a ^ src_b; -    case ALUOperation::Or: +    case Macro::ALUOperation::Or:          return src_a | src_b; -    case ALUOperation::And: +    case Macro::ALUOperation::And:          return src_a & src_b; -    case ALUOperation::AndNot: +    case Macro::ALUOperation::AndNot:          return src_a & ~src_b; -    case ALUOperation::Nand: +    case Macro::ALUOperation::Nand:          return ~(src_a & src_b);      default: @@ -257,43 +187,43 @@ u32 MacroInterpreter::GetALUResult(ALUOperation operation, u32 src_a, u32 src_b)      }  } -void MacroInterpreter::ProcessResult(ResultOperation operation, u32 reg, u32 result) { +void MacroInterpreterImpl::ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result) {      switch (operation) { -    case ResultOperation::IgnoreAndFetch: +    case Macro::ResultOperation::IgnoreAndFetch:          // Fetch parameter and ignore result.          SetRegister(reg, FetchParameter());          break; -    case ResultOperation::Move: +    case Macro::ResultOperation::Move:          // Move result.          SetRegister(reg, result);          break; -    case ResultOperation::MoveAndSetMethod: +    case Macro::ResultOperation::MoveAndSetMethod:          // Move result and use as Method Address.          SetRegister(reg, result);          SetMethodAddress(result);          break; -    case ResultOperation::FetchAndSend: +    case Macro::ResultOperation::FetchAndSend:          // Fetch parameter and send result.          SetRegister(reg, FetchParameter());          Send(result);          break; -    case ResultOperation::MoveAndSend: +    case Macro::ResultOperation::MoveAndSend:          // Move and send result.          
SetRegister(reg, result);          Send(result);          break; -    case ResultOperation::FetchAndSetMethod: +    case Macro::ResultOperation::FetchAndSetMethod:          // Fetch parameter and use result as Method Address.          SetRegister(reg, FetchParameter());          SetMethodAddress(result);          break; -    case ResultOperation::MoveAndSetMethodFetchAndSend: +    case Macro::ResultOperation::MoveAndSetMethodFetchAndSend:          // Move result and use as Method Address, then fetch and send parameter.          SetRegister(reg, result);          SetMethodAddress(result);          Send(FetchParameter());          break; -    case ResultOperation::MoveAndSetMethodSend: +    case Macro::ResultOperation::MoveAndSetMethodSend:          // Move result and use as Method Address, then send bits 12:17 of result.          SetRegister(reg, result);          SetMethodAddress(result); @@ -304,16 +234,28 @@ void MacroInterpreter::ProcessResult(ResultOperation operation, u32 reg, u32 res      }  } -u32 MacroInterpreter::FetchParameter() { -    ASSERT(next_parameter_index < num_parameters); -    return parameters[next_parameter_index++]; +bool MacroInterpreterImpl::EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const { +    switch (cond) { +    case Macro::BranchCondition::Zero: +        return value == 0; +    case Macro::BranchCondition::NotZero: +        return value != 0; +    } +    UNREACHABLE(); +    return true;  } -u32 MacroInterpreter::GetRegister(u32 register_id) const { +Macro::Opcode MacroInterpreterImpl::GetOpcode() const { +    ASSERT((pc % sizeof(u32)) == 0); +    ASSERT(pc < code.size() * sizeof(u32)); +    return {code[pc / sizeof(u32)]}; +} + +u32 MacroInterpreterImpl::GetRegister(u32 register_id) const {      return registers.at(register_id);  } -void MacroInterpreter::SetRegister(u32 register_id, u32 value) { +void MacroInterpreterImpl::SetRegister(u32 register_id, u32 value) {      // Register 0 is hardwired as the zero 
register.      // Ensure no writes to it actually occur.      if (register_id == 0) { @@ -323,30 +265,24 @@ void MacroInterpreter::SetRegister(u32 register_id, u32 value) {      registers.at(register_id) = value;  } -void MacroInterpreter::SetMethodAddress(u32 address) { +void MacroInterpreterImpl::SetMethodAddress(u32 address) {      method_address.raw = address;  } -void MacroInterpreter::Send(u32 value) { +void MacroInterpreterImpl::Send(u32 value) {      maxwell3d.CallMethodFromMME(method_address.address, value);      // Increment the method address by the method increment.      method_address.address.Assign(method_address.address.Value() +                                    method_address.increment.Value());  } -u32 MacroInterpreter::Read(u32 method) const { +u32 MacroInterpreterImpl::Read(u32 method) const {      return maxwell3d.GetRegisterValue(method);  } -bool MacroInterpreter::EvaluateBranchCondition(BranchCondition cond, u32 value) const { -    switch (cond) { -    case BranchCondition::Zero: -        return value == 0; -    case BranchCondition::NotZero: -        return value != 0; -    } -    UNREACHABLE(); -    return true; +u32 MacroInterpreterImpl::FetchParameter() { +    ASSERT(next_parameter_index < num_parameters); +    return parameters[next_parameter_index++];  }  } // namespace Tegra diff --git a/src/video_core/macro_interpreter.h b/src/video_core/macro/macro_interpreter.h index 631146d89..90217fc89 100644 --- a/src/video_core/macro_interpreter.h +++ b/src/video_core/macro/macro_interpreter.h @@ -1,44 +1,37 @@ -// Copyright 2018 yuzu Emulator Project +// Copyright 2020 yuzu Emulator Project  // Licensed under GPLv2 or any later version  // Refer to the license.txt file included.  
#pragma once -  #include <array>  #include <optional> - +#include <vector>  #include "common/bit_field.h"  #include "common/common_types.h" +#include "video_core/macro/macro.h"  namespace Tegra {  namespace Engines {  class Maxwell3D;  } -class MacroInterpreter final { +class MacroInterpreter final : public MacroEngine {  public:      explicit MacroInterpreter(Engines::Maxwell3D& maxwell3d); -    /** -     * Executes the macro code with the specified input parameters. -     * @param offset Offset to start execution at. -     * @param parameters The parameters of the macro. -     */ -    void Execute(u32 offset, std::size_t num_parameters, const u32* parameters); +protected: +    std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override;  private: -    enum class ALUOperation : u32; -    enum class BranchCondition : u32; -    enum class ResultOperation : u32; - -    union Opcode; +    Engines::Maxwell3D& maxwell3d; +}; -    union MethodAddress { -        u32 raw; -        BitField<0, 12, u32> address; -        BitField<12, 6, u32> increment; -    }; +class MacroInterpreterImpl : public CachedMacro { +public: +    MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code); +    void Execute(const std::vector<u32>& parameters, u32 method) override; +private:      /// Resets the execution engine state, zeroing registers, etc.      void Reset(); @@ -49,20 +42,20 @@ private:       * @param is_delay_slot Whether the current step is being executed due to a delay slot in a       * previous instruction.       */ -    bool Step(u32 offset, bool is_delay_slot); +    bool Step(bool is_delay_slot);      /// Calculates the result of an ALU operation. src_a OP src_b; -    u32 GetALUResult(ALUOperation operation, u32 src_a, u32 src_b); +    u32 GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b);      /// Performs the result operation on the input result and stores it in the specified register      /// (if necessary). 
-    void ProcessResult(ResultOperation operation, u32 reg, u32 result); +    void ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result);      /// Evaluates the branch condition and returns whether the branch should be taken or not. -    bool EvaluateBranchCondition(BranchCondition cond, u32 value) const; +    bool EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const;      /// Reads an opcode at the current program counter location. -    Opcode GetOpcode(u32 offset) const; +    Macro::Opcode GetOpcode() const;      /// Returns the specified register's value. Register 0 is hardcoded to always return 0.      u32 GetRegister(u32 register_id) const; @@ -89,13 +82,11 @@ private:      /// Program counter to execute at after the delay slot is executed.      std::optional<u32> delayed_pc; -    static constexpr std::size_t NumMacroRegisters = 8; -      /// General purpose macro registers. -    std::array<u32, NumMacroRegisters> registers = {}; +    std::array<u32, Macro::NUM_MACRO_REGISTERS> registers = {};      /// Method address to use for the next Send instruction. -    MethodAddress method_address = {}; +    Macro::MethodAddress method_address = {};      /// Input parameters of the current macro.      std::unique_ptr<u32[]> parameters; @@ -105,5 +96,7 @@ private:      u32 next_parameter_index = 0;      bool carry_flag = false; +    const std::vector<u32>& code;  }; +  } // namespace Tegra diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp new file mode 100644 index 000000000..11c1cc3be --- /dev/null +++ b/src/video_core/macro/macro_jit_x64.cpp @@ -0,0 +1,640 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include "common/assert.h" +#include "common/logging/log.h" +#include "common/microprofile.h" +#include "common/x64/xbyak_util.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/macro/macro_interpreter.h" +#include "video_core/macro/macro_jit_x64.h" + +MICROPROFILE_DEFINE(MacroJitCompile, "GPU", "Compile macro JIT", MP_RGB(173, 255, 47)); +MICROPROFILE_DEFINE(MacroJitExecute, "GPU", "Execute macro JIT", MP_RGB(255, 255, 0)); + +namespace Tegra { +static const Xbyak::Reg64 PARAMETERS = Xbyak::util::r9; +static const Xbyak::Reg64 REGISTERS = Xbyak::util::r10; +static const Xbyak::Reg64 STATE = Xbyak::util::r11; +static const Xbyak::Reg64 NEXT_PARAMETER = Xbyak::util::r12; +static const Xbyak::Reg32 RESULT = Xbyak::util::r13d; +static const Xbyak::Reg64 RESULT_64 = Xbyak::util::r13; +static const Xbyak::Reg32 METHOD_ADDRESS = Xbyak::util::r14d; +static const Xbyak::Reg64 METHOD_ADDRESS_64 = Xbyak::util::r14; +static const Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15; + +static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({ +    PARAMETERS, +    REGISTERS, +    STATE, +    NEXT_PARAMETER, +    RESULT, +    METHOD_ADDRESS, +    BRANCH_HOLDER, +}); + +MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} + +std::unique_ptr<CachedMacro> MacroJITx64::Compile(const std::vector<u32>& code) { +    return std::make_unique<MacroJITx64Impl>(maxwell3d, code); +} + +MacroJITx64Impl::MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code) +    : Xbyak::CodeGenerator(MAX_CODE_SIZE), code(code), maxwell3d(maxwell3d) { +    Compile(); +} + +MacroJITx64Impl::~MacroJITx64Impl() = default; + +void MacroJITx64Impl::Execute(const std::vector<u32>& parameters, u32 method) { +    MICROPROFILE_SCOPE(MacroJitExecute); +    ASSERT_OR_EXECUTE(program != nullptr, { return; }); +    JITState state{}; +    state.maxwell3d = &maxwell3d; +    state.registers = {}; +    state.parameters = parameters.data(); 
+    program(&state); +} + +void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) { +    const bool is_a_zero = opcode.src_a == 0; +    const bool is_b_zero = opcode.src_b == 0; +    const bool valid_operation = !is_a_zero && !is_b_zero; +    const bool is_move_operation = !is_a_zero && is_b_zero; +    const bool has_zero_register = is_a_zero || is_b_zero; + +    Xbyak::Reg64 src_a; +    Xbyak::Reg32 src_b; + +    if (!optimizer.zero_reg_skip) { +        src_a = Compile_GetRegister(opcode.src_a, RESULT_64); +        src_b = Compile_GetRegister(opcode.src_b, ebx); +    } else { +        if (!is_a_zero) { +            src_a = Compile_GetRegister(opcode.src_a, RESULT_64); +        } +        if (!is_b_zero) { +            src_b = Compile_GetRegister(opcode.src_b, ebx); +        } +    } +    Xbyak::Label skip_carry{}; + +    bool has_emitted = false; + +    switch (opcode.alu_operation) { +    case Macro::ALUOperation::Add: +        if (optimizer.zero_reg_skip) { +            if (valid_operation) { +                add(src_a, src_b); +            } +        } else { +            add(src_a, src_b); +        } + +        if (!optimizer.can_skip_carry) { +            setc(byte[STATE + offsetof(JITState, carry_flag)]); +        } +        break; +    case Macro::ALUOperation::AddWithCarry: +        bt(dword[STATE + offsetof(JITState, carry_flag)], 0); +        adc(src_a, src_b); +        setc(byte[STATE + offsetof(JITState, carry_flag)]); +        break; +    case Macro::ALUOperation::Subtract: +        if (optimizer.zero_reg_skip) { +            if (valid_operation) { +                sub(src_a, src_b); +                has_emitted = true; +            } +        } else { +            sub(src_a, src_b); +            has_emitted = true; +        } +        if (!optimizer.can_skip_carry && has_emitted) { +            setc(byte[STATE + offsetof(JITState, carry_flag)]); +        } +        break; +    case Macro::ALUOperation::SubtractWithBorrow: +        bt(dword[STATE + 
offsetof(JITState, carry_flag)], 0); +        sbb(src_a, src_b); +        setc(byte[STATE + offsetof(JITState, carry_flag)]); +        break; +    case Macro::ALUOperation::Xor: +        if (optimizer.zero_reg_skip) { +            if (valid_operation) { +                xor_(src_a, src_b); +            } +        } else { +            xor_(src_a, src_b); +        } +        break; +    case Macro::ALUOperation::Or: +        if (optimizer.zero_reg_skip) { +            if (valid_operation) { +                or_(src_a, src_b); +            } +        } else { +            or_(src_a, src_b); +        } +        break; +    case Macro::ALUOperation::And: +        if (optimizer.zero_reg_skip) { +            if (!has_zero_register) { +                and_(src_a, src_b); +            } +        } else { +            and_(src_a, src_b); +        } +        break; +    case Macro::ALUOperation::AndNot: +        if (optimizer.zero_reg_skip) { +            if (!is_a_zero) { +                not_(src_b); +                and_(src_a, src_b); +            } +        } else { +            not_(src_b); +            and_(src_a, src_b); +        } +        break; +    case Macro::ALUOperation::Nand: +        if (optimizer.zero_reg_skip) { +            if (!is_a_zero) { +                and_(src_a, src_b); +                not_(src_a); +            } +        } else { +            and_(src_a, src_b); +            not_(src_a); +        } +        break; +    default: +        UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", +                          static_cast<std::size_t>(opcode.alu_operation.Value())); +        break; +    } +    Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void MacroJITx64Impl::Compile_AddImmediate(Macro::Opcode opcode) { +    if (optimizer.skip_dummy_addimmediate) { +        // Games tend to use this as an exit instruction placeholder. It's to encode an instruction +        // without doing anything. 
In our case we can just not emit anything. +        if (opcode.result_operation == Macro::ResultOperation::Move && opcode.dst == 0) { +            return; +        } +    } +    // Check for redundant moves +    if (optimizer.optimize_for_method_move && +        opcode.result_operation == Macro::ResultOperation::MoveAndSetMethod) { +        if (next_opcode.has_value()) { +            const auto next = *next_opcode; +            if (next.result_operation == Macro::ResultOperation::MoveAndSetMethod) { +                return; +            } +        } +    } +    if (optimizer.zero_reg_skip && opcode.src_a == 0) { +        if (opcode.immediate == 0) { +            xor_(RESULT, RESULT); +        } else { +            mov(RESULT, opcode.immediate); +        } +    } else { +        auto result = Compile_GetRegister(opcode.src_a, RESULT); +        if (opcode.immediate > 2) { +            add(result, opcode.immediate); +        } else if (opcode.immediate == 1) { +            inc(result); +        } else if (opcode.immediate < 0) { +            sub(result, opcode.immediate * -1); +        } +    } +    Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void MacroJITx64Impl::Compile_ExtractInsert(Macro::Opcode opcode) { +    auto dst = Compile_GetRegister(opcode.src_a, RESULT); +    auto src = Compile_GetRegister(opcode.src_b, eax); + +    if (opcode.bf_src_bit != 0 && opcode.bf_src_bit != 31) { +        shr(src, opcode.bf_src_bit); +    } else if (opcode.bf_src_bit == 31) { +        xor_(src, src); +    } +    // Don't bother masking the whole register since we're using a 32 bit register +    if (opcode.bf_size != 31 && opcode.bf_size != 0) { +        and_(src, opcode.GetBitfieldMask()); +    } else if (opcode.bf_size == 0) { +        xor_(src, src); +    } +    if (opcode.bf_dst_bit != 31 && opcode.bf_dst_bit != 0) { +        shl(src, opcode.bf_dst_bit); +    } else if (opcode.bf_dst_bit == 31) { +        xor_(src, src); +    } + +    const u32 mask = 
~(opcode.GetBitfieldMask() << opcode.bf_dst_bit); +    if (mask != 0xffffffff) { +        and_(dst, mask); +    } +    or_(dst, src); +    Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void MacroJITx64Impl::Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode) { +    auto dst = Compile_GetRegister(opcode.src_a, eax); +    auto src = Compile_GetRegister(opcode.src_b, RESULT); + +    shr(src, al); +    if (opcode.bf_size != 0 && opcode.bf_size != 31) { +        and_(src, opcode.GetBitfieldMask()); +    } else if (opcode.bf_size == 0) { +        xor_(src, src); +    } + +    if (opcode.bf_dst_bit != 0 && opcode.bf_dst_bit != 31) { +        shl(src, opcode.bf_dst_bit); +    } else if (opcode.bf_dst_bit == 31) { +        xor_(src, src); +    } +    Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void MacroJITx64Impl::Compile_ExtractShiftLeftRegister(Macro::Opcode opcode) { +    auto dst = Compile_GetRegister(opcode.src_a, eax); +    auto src = Compile_GetRegister(opcode.src_b, RESULT); + +    if (opcode.bf_src_bit != 0) { +        shr(src, opcode.bf_src_bit); +    } + +    if (opcode.bf_size != 31) { +        and_(src, opcode.GetBitfieldMask()); +    } +    shl(src, al); +    Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +static u32 Read(Engines::Maxwell3D* maxwell3d, u32 method) { +    return maxwell3d->GetRegisterValue(method); +} + +static void Send(Engines::Maxwell3D* maxwell3d, Macro::MethodAddress method_address, u32 value) { +    maxwell3d->CallMethodFromMME(method_address.address, value); +} + +void MacroJITx64Impl::Compile_Read(Macro::Opcode opcode) { +    if (optimizer.zero_reg_skip && opcode.src_a == 0) { +        if (opcode.immediate == 0) { +            xor_(RESULT, RESULT); +        } else { +            mov(RESULT, opcode.immediate); +        } +    } else { +        auto result = Compile_GetRegister(opcode.src_a, RESULT); +        if (opcode.immediate > 2) { +            add(result, 
opcode.immediate); +        } else if (opcode.immediate == 1) { +            inc(result); +        } else if (opcode.immediate < 0) { +            sub(result, opcode.immediate * -1); +        } +    } +    Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0); +    mov(Common::X64::ABI_PARAM1, qword[STATE]); +    mov(Common::X64::ABI_PARAM2, RESULT); +    Common::X64::CallFarFunction(*this, &Read); +    Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0); +    mov(RESULT, Common::X64::ABI_RETURN.cvt32()); +    Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void Tegra::MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) { +    Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0); +    mov(Common::X64::ABI_PARAM1, qword[STATE]); +    mov(Common::X64::ABI_PARAM2, METHOD_ADDRESS); +    mov(Common::X64::ABI_PARAM3, value); +    Common::X64::CallFarFunction(*this, &Send); +    Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0); + +    Xbyak::Label dont_process{}; +    // Get increment +    test(METHOD_ADDRESS, 0x3f000); +    // If zero, method address doesn't update +    je(dont_process); + +    mov(ecx, METHOD_ADDRESS); +    and_(METHOD_ADDRESS, 0xfff); +    shr(ecx, 12); +    and_(ecx, 0x3f); +    lea(eax, ptr[rcx + METHOD_ADDRESS_64]); +    sal(ecx, 12); +    or_(eax, ecx); + +    mov(METHOD_ADDRESS, eax); + +    L(dont_process); +} + +void Tegra::MacroJITx64Impl::Compile_Branch(Macro::Opcode opcode) { +    ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid"); +    const s32 jump_address = +        static_cast<s32>(pc) + static_cast<s32>(opcode.GetBranchTarget() / sizeof(s32)); + +    Xbyak::Label end; +    auto value = Compile_GetRegister(opcode.src_a, eax); +    test(value, value); +    if (optimizer.has_delayed_pc) { +        switch (opcode.branch_condition) { +        case 
Macro::BranchCondition::Zero: +            jne(end, T_NEAR); +            break; +        case Macro::BranchCondition::NotZero: +            je(end, T_NEAR); +            break; +        } + +        if (opcode.branch_annul) { +            xor_(BRANCH_HOLDER, BRANCH_HOLDER); +            jmp(labels[jump_address], T_NEAR); +        } else { +            Xbyak::Label handle_post_exit{}; +            Xbyak::Label skip{}; +            jmp(skip, T_NEAR); +            if (opcode.is_exit) { +                L(handle_post_exit); +                // Execute 1 instruction +                mov(BRANCH_HOLDER, end_of_code); +                // Jump to next instruction to skip delay slot check +                jmp(labels[jump_address], T_NEAR); +            } else { +                L(handle_post_exit); +                xor_(BRANCH_HOLDER, BRANCH_HOLDER); +                jmp(labels[jump_address], T_NEAR); +            } +            L(skip); +            mov(BRANCH_HOLDER, handle_post_exit); +            jmp(delay_skip[pc], T_NEAR); +        } +    } else { +        switch (opcode.branch_condition) { +        case Macro::BranchCondition::Zero: +            je(labels[jump_address], T_NEAR); +            break; +        case Macro::BranchCondition::NotZero: +            jne(labels[jump_address], T_NEAR); +            break; +        } +    } + +    L(end); +} + +void Tegra::MacroJITx64Impl::Optimizer_ScanFlags() { +    optimizer.can_skip_carry = true; +    optimizer.has_delayed_pc = false; +    for (auto raw_op : code) { +        Macro::Opcode op{}; +        op.raw = raw_op; + +        if (op.operation == Macro::Operation::ALU) { +            // Scan for any ALU operations which actually use the carry flag, if they don't exist in +            // our current code we can skip emitting the carry flag handling operations +            if (op.alu_operation == Macro::ALUOperation::AddWithCarry || +                op.alu_operation == Macro::ALUOperation::SubtractWithBorrow) { +           
     optimizer.can_skip_carry = false; +            } +        } + +        if (op.operation == Macro::Operation::Branch) { +            if (!op.branch_annul) { +                optimizer.has_delayed_pc = true; +            } +        } +    } +} + +void MacroJITx64Impl::Compile() { +    MICROPROFILE_SCOPE(MacroJitCompile); +    bool keep_executing = true; +    labels.fill(Xbyak::Label()); + +    Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8); +    // JIT state +    mov(STATE, Common::X64::ABI_PARAM1); +    mov(PARAMETERS, qword[Common::X64::ABI_PARAM1 + +                          static_cast<Xbyak::uint32>(offsetof(JITState, parameters))]); +    mov(REGISTERS, Common::X64::ABI_PARAM1); +    add(REGISTERS, static_cast<Xbyak::uint32>(offsetof(JITState, registers))); +    xor_(RESULT, RESULT); +    xor_(METHOD_ADDRESS, METHOD_ADDRESS); +    xor_(NEXT_PARAMETER, NEXT_PARAMETER); +    xor_(BRANCH_HOLDER, BRANCH_HOLDER); + +    mov(dword[REGISTERS + 4], Compile_FetchParameter()); + +    // Track get register for zero registers and mark it as no-op +    optimizer.zero_reg_skip = true; + +    // AddImmediate tends to be used as a NOP instruction, if we detect this we can +    // completely skip the entire code path and no emit anything +    optimizer.skip_dummy_addimmediate = true; + +    // SMO tends to emit a lot of unnecessary method moves, we can mitigate this by only emitting +    // one if our register isn't "dirty" +    optimizer.optimize_for_method_move = true; + +    // Check to see if we can skip emitting certain instructions +    Optimizer_ScanFlags(); + +    const u32 op_count = static_cast<u32>(code.size()); +    for (u32 i = 0; i < op_count; i++) { +        if (i < op_count - 1) { +            pc = i + 1; +            next_opcode = GetOpCode(); +        } else { +            next_opcode = {}; +        } +        pc = i; +        Compile_NextInstruction(); +    } + +    L(end_of_code); + +    
Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8); +    ret(); +    ready(); +    program = getCode<ProgramType>(); +} + +bool MacroJITx64Impl::Compile_NextInstruction() { +    const auto opcode = GetOpCode(); +    if (labels[pc].getAddress()) { +        return false; +    } + +    L(labels[pc]); + +    switch (opcode.operation) { +    case Macro::Operation::ALU: +        Compile_ALU(opcode); +        break; +    case Macro::Operation::AddImmediate: +        Compile_AddImmediate(opcode); +        break; +    case Macro::Operation::ExtractInsert: +        Compile_ExtractInsert(opcode); +        break; +    case Macro::Operation::ExtractShiftLeftImmediate: +        Compile_ExtractShiftLeftImmediate(opcode); +        break; +    case Macro::Operation::ExtractShiftLeftRegister: +        Compile_ExtractShiftLeftRegister(opcode); +        break; +    case Macro::Operation::Read: +        Compile_Read(opcode); +        break; +    case Macro::Operation::Branch: +        Compile_Branch(opcode); +        break; +    default: +        UNIMPLEMENTED_MSG("Unimplemented opcode {}", opcode.operation.Value()); +        break; +    } + +    if (optimizer.has_delayed_pc) { +        if (opcode.is_exit) { +            mov(rax, end_of_code); +            test(BRANCH_HOLDER, BRANCH_HOLDER); +            cmove(BRANCH_HOLDER, rax); +            // Jump to next instruction to skip delay slot check +            je(labels[pc + 1], T_NEAR); +        } else { +            // TODO(ogniK): Optimize delay slot branching +            Xbyak::Label no_delay_slot{}; +            test(BRANCH_HOLDER, BRANCH_HOLDER); +            je(no_delay_slot, T_NEAR); +            mov(rax, BRANCH_HOLDER); +            xor_(BRANCH_HOLDER, BRANCH_HOLDER); +            jmp(rax); +            L(no_delay_slot); +        } +        L(delay_skip[pc]); +        if (opcode.is_exit) { +            return false; +        } +    } else { +        test(BRANCH_HOLDER, BRANCH_HOLDER); +  
      jne(end_of_code, T_NEAR); +        if (opcode.is_exit) { +            inc(BRANCH_HOLDER); +            return false; +        } +    } +    return true; +} + +Xbyak::Reg32 Tegra::MacroJITx64Impl::Compile_FetchParameter() { +    mov(eax, dword[PARAMETERS + NEXT_PARAMETER * sizeof(u32)]); +    inc(NEXT_PARAMETER); +    return eax; +} + +Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) { +    if (index == 0) { +        // Register 0 is always zero +        xor_(dst, dst); +    } else { +        mov(dst, dword[REGISTERS + index * sizeof(u32)]); +    } + +    return dst; +} + +Xbyak::Reg64 Tegra::MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg64 dst) { +    if (index == 0) { +        // Register 0 is always zero +        xor_(dst, dst); +    } else { +        mov(dst, dword[REGISTERS + index * sizeof(u32)]); +    } + +    return dst; +} + +void Tegra::MacroJITx64Impl::Compile_WriteCarry(Xbyak::Reg64 dst) { +    Xbyak::Label zero{}, end{}; +    xor_(ecx, ecx); +    shr(dst, 32); +    setne(cl); +    mov(dword[STATE + offsetof(JITState, carry_flag)], ecx); +} + +void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u32 reg) { +    auto SetRegister = [=](u32 reg, Xbyak::Reg32 result) { +        // Register 0 is supposed to always return 0. NOP is implemented as a store to the zero +        // register. 
+        if (reg == 0) { +            return; +        } +        mov(dword[REGISTERS + reg * sizeof(u32)], result); +    }; +    auto SetMethodAddress = [=](Xbyak::Reg32 reg) { mov(METHOD_ADDRESS, reg); }; + +    switch (operation) { +    case Macro::ResultOperation::IgnoreAndFetch: +        SetRegister(reg, Compile_FetchParameter()); +        break; +    case Macro::ResultOperation::Move: +        SetRegister(reg, RESULT); +        break; +    case Macro::ResultOperation::MoveAndSetMethod: +        SetRegister(reg, RESULT); +        SetMethodAddress(RESULT); +        break; +    case Macro::ResultOperation::FetchAndSend: +        // Fetch parameter and send result. +        SetRegister(reg, Compile_FetchParameter()); +        Compile_Send(RESULT); +        break; +    case Macro::ResultOperation::MoveAndSend: +        // Move and send result. +        SetRegister(reg, RESULT); +        Compile_Send(RESULT); +        break; +    case Macro::ResultOperation::FetchAndSetMethod: +        // Fetch parameter and use result as Method Address. +        SetRegister(reg, Compile_FetchParameter()); +        SetMethodAddress(RESULT); +        break; +    case Macro::ResultOperation::MoveAndSetMethodFetchAndSend: +        // Move result and use as Method Address, then fetch and send parameter. +        SetRegister(reg, RESULT); +        SetMethodAddress(RESULT); +        Compile_Send(Compile_FetchParameter()); +        break; +    case Macro::ResultOperation::MoveAndSetMethodSend: +        // Move result and use as Method Address, then send bits 12:17 of result. 
+        SetRegister(reg, RESULT); +        SetMethodAddress(RESULT); +        shr(RESULT, 12); +        and_(RESULT, 0b111111); +        Compile_Send(RESULT); +        break; +    default: +        UNIMPLEMENTED_MSG("Unimplemented macro operation {}", static_cast<std::size_t>(operation)); +    } +} + +Macro::Opcode MacroJITx64Impl::GetOpCode() const { +    ASSERT(pc < code.size()); +    return {code[pc]}; +} + +std::bitset<32> MacroJITx64Impl::PersistentCallerSavedRegs() const { +    return PERSISTENT_REGISTERS & Common::X64::ABI_ALL_CALLER_SAVED; +} + +} // namespace Tegra diff --git a/src/video_core/macro/macro_jit_x64.h b/src/video_core/macro/macro_jit_x64.h new file mode 100644 index 000000000..71f738b9a --- /dev/null +++ b/src/video_core/macro/macro_jit_x64.h @@ -0,0 +1,100 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <array> +#include <bitset> +#include <xbyak.h> +#include "common/bit_field.h" +#include "common/common_types.h" +#include "common/x64/xbyak_abi.h" +#include "video_core/macro/macro.h" + +namespace Tegra { + +namespace Engines { +class Maxwell3D; +} + +/// MAX_CODE_SIZE is arbitrarily chosen based on current booting games +constexpr size_t MAX_CODE_SIZE = 0x10000; + +class MacroJITx64 final : public MacroEngine { +public: +    explicit MacroJITx64(Engines::Maxwell3D& maxwell3d); + +protected: +    std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override; + +private: +    Engines::Maxwell3D& maxwell3d; +}; + +class MacroJITx64Impl : public Xbyak::CodeGenerator, public CachedMacro { +public: +    MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code); +    ~MacroJITx64Impl(); + +    void Execute(const std::vector<u32>& parameters, u32 method) override; + +    void Compile_ALU(Macro::Opcode opcode); +    void Compile_AddImmediate(Macro::Opcode opcode); +    void 
Compile_ExtractInsert(Macro::Opcode opcode); +    void Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode); +    void Compile_ExtractShiftLeftRegister(Macro::Opcode opcode); +    void Compile_Read(Macro::Opcode opcode); +    void Compile_Branch(Macro::Opcode opcode); + +private: +    void Optimizer_ScanFlags(); + +    void Compile(); +    bool Compile_NextInstruction(); + +    Xbyak::Reg32 Compile_FetchParameter(); +    Xbyak::Reg32 Compile_GetRegister(u32 index, Xbyak::Reg32 dst); +    Xbyak::Reg64 Compile_GetRegister(u32 index, Xbyak::Reg64 dst); +    void Compile_WriteCarry(Xbyak::Reg64 dst); + +    void Compile_ProcessResult(Macro::ResultOperation operation, u32 reg); +    void Compile_Send(Xbyak::Reg32 value); + +    Macro::Opcode GetOpCode() const; +    std::bitset<32> PersistentCallerSavedRegs() const; + +    struct JITState { +        Engines::Maxwell3D* maxwell3d{}; +        std::array<u32, Macro::NUM_MACRO_REGISTERS> registers{}; +        const u32* parameters{}; +        u32 carry_flag{}; +    }; +    static_assert(offsetof(JITState, maxwell3d) == 0, "Maxwell3D is not at 0x0"); +    using ProgramType = void (*)(JITState*); + +    struct OptimizerState { +        bool can_skip_carry{}; +        bool has_delayed_pc{}; +        bool zero_reg_skip{}; +        bool skip_dummy_addimmediate{}; +        bool optimize_for_method_move{}; +    }; +    OptimizerState optimizer{}; + +    std::optional<Macro::Opcode> next_opcode{}; +    ProgramType program{nullptr}; + +    std::array<Xbyak::Label, MAX_CODE_SIZE> labels; +    std::array<Xbyak::Label, MAX_CODE_SIZE> delay_skip; +    Xbyak::Label end_of_code{}; + +    bool is_delay_slot{}; +    u32 pc{}; +    std::optional<u32> delayed_pc; + +    const std::vector<u32>& code; +    Engines::Maxwell3D& maxwell3d; +}; + +} // namespace Tegra diff --git a/src/video_core/rasterizer_cache.cpp b/src/video_core/rasterizer_cache.cpp deleted file mode 100644 index 093b2cdf4..000000000 --- a/src/video_core/rasterizer_cache.cpp 
+++ /dev/null @@ -1,7 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include "video_core/rasterizer_cache.h" - -RasterizerCacheObject::~RasterizerCacheObject() = default; diff --git a/src/video_core/rasterizer_cache.h b/src/video_core/rasterizer_cache.h deleted file mode 100644 index 096ee337c..000000000 --- a/src/video_core/rasterizer_cache.h +++ /dev/null @@ -1,253 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <mutex> -#include <set> -#include <unordered_map> - -#include <boost/icl/interval_map.hpp> -#include <boost/range/iterator_range_core.hpp> - -#include "common/common_types.h" -#include "core/settings.h" -#include "video_core/gpu.h" -#include "video_core/rasterizer_interface.h" - -class RasterizerCacheObject { -public: -    explicit RasterizerCacheObject(const VAddr cpu_addr) : cpu_addr{cpu_addr} {} - -    virtual ~RasterizerCacheObject(); - -    VAddr GetCpuAddr() const { -        return cpu_addr; -    } - -    /// Gets the size of the shader in guest memory, required for cache management -    virtual std::size_t GetSizeInBytes() const = 0; - -    /// Sets whether the cached object should be considered registered -    void SetIsRegistered(bool registered) { -        is_registered = registered; -    } - -    /// Returns true if the cached object is registered -    bool IsRegistered() const { -        return is_registered; -    } - -    /// Returns true if the cached object is dirty -    bool IsDirty() const { -        return is_dirty; -    } - -    /// Returns ticks from when this cached object was last modified -    u64 GetLastModifiedTicks() const { -        return last_modified_ticks; -    } - -    /// Marks an object as recently modified, used to specify whether it is clean or dirty -    template <class T> -    void MarkAsModified(bool 
dirty, T& cache) { -        is_dirty = dirty; -        last_modified_ticks = cache.GetModifiedTicks(); -    } - -    void SetMemoryMarked(bool is_memory_marked_) { -        is_memory_marked = is_memory_marked_; -    } - -    bool IsMemoryMarked() const { -        return is_memory_marked; -    } - -    void SetSyncPending(bool is_sync_pending_) { -        is_sync_pending = is_sync_pending_; -    } - -    bool IsSyncPending() const { -        return is_sync_pending; -    } - -private: -    bool is_registered{};      ///< Whether the object is currently registered with the cache -    bool is_dirty{};           ///< Whether the object is dirty (out of sync with guest memory) -    bool is_memory_marked{};   ///< Whether the object is marking rasterizer memory. -    bool is_sync_pending{};    ///< Whether the object is pending deletion. -    u64 last_modified_ticks{}; ///< When the object was last modified, used for in-order flushing -    VAddr cpu_addr{};          ///< Cpu address memory, unique from emulated virtual address space -}; - -template <class T> -class RasterizerCache : NonCopyable { -    friend class RasterizerCacheObject; - -public: -    explicit RasterizerCache(VideoCore::RasterizerInterface& rasterizer) : rasterizer{rasterizer} {} - -    /// Write any cached resources overlapping the specified region back to memory -    void FlushRegion(VAddr addr, std::size_t size) { -        std::lock_guard lock{mutex}; - -        const auto& objects{GetSortedObjectsFromRegion(addr, size)}; -        for (auto& object : objects) { -            FlushObject(object); -        } -    } - -    /// Mark the specified region as being invalidated -    void InvalidateRegion(VAddr addr, u64 size) { -        std::lock_guard lock{mutex}; - -        const auto& objects{GetSortedObjectsFromRegion(addr, size)}; -        for (auto& object : objects) { -            if (!object->IsRegistered()) { -                // Skip duplicates -                continue; -            } -            
Unregister(object); -        } -    } - -    void OnCPUWrite(VAddr addr, std::size_t size) { -        std::lock_guard lock{mutex}; - -        for (const auto& object : GetSortedObjectsFromRegion(addr, size)) { -            if (object->IsRegistered()) { -                UnmarkMemory(object); -                object->SetSyncPending(true); -                marked_for_unregister.emplace_back(object); -            } -        } -    } - -    void SyncGuestHost() { -        std::lock_guard lock{mutex}; - -        for (const auto& object : marked_for_unregister) { -            if (object->IsRegistered()) { -                object->SetSyncPending(false); -                Unregister(object); -            } -        } -        marked_for_unregister.clear(); -    } - -    /// Invalidates everything in the cache -    void InvalidateAll() { -        std::lock_guard lock{mutex}; - -        while (interval_cache.begin() != interval_cache.end()) { -            Unregister(*interval_cache.begin()->second.begin()); -        } -    } - -protected: -    /// Tries to get an object from the cache with the specified cache address -    T TryGet(VAddr addr) const { -        const auto iter = map_cache.find(addr); -        if (iter != map_cache.end()) -            return iter->second; -        return nullptr; -    } - -    /// Register an object into the cache -    virtual void Register(const T& object) { -        std::lock_guard lock{mutex}; - -        object->SetIsRegistered(true); -        interval_cache.add({GetInterval(object), ObjectSet{object}}); -        map_cache.insert({object->GetCpuAddr(), object}); -        rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), 1); -        object->SetMemoryMarked(true); -    } - -    /// Unregisters an object from the cache -    virtual void Unregister(const T& object) { -        std::lock_guard lock{mutex}; - -        UnmarkMemory(object); -        object->SetIsRegistered(false); -        if (object->IsSyncPending()) 
{ -            marked_for_unregister.remove(object); -            object->SetSyncPending(false); -        } -        const VAddr addr = object->GetCpuAddr(); -        interval_cache.subtract({GetInterval(object), ObjectSet{object}}); -        map_cache.erase(addr); -    } - -    void UnmarkMemory(const T& object) { -        if (!object->IsMemoryMarked()) { -            return; -        } -        rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), -1); -        object->SetMemoryMarked(false); -    } - -    /// Returns a ticks counter used for tracking when cached objects were last modified -    u64 GetModifiedTicks() { -        std::lock_guard lock{mutex}; - -        return ++modified_ticks; -    } - -    virtual void FlushObjectInner(const T& object) = 0; - -    /// Flushes the specified object, updating appropriate cache state as needed -    void FlushObject(const T& object) { -        std::lock_guard lock{mutex}; - -        if (!object->IsDirty()) { -            return; -        } -        FlushObjectInner(object); -        object->MarkAsModified(false, *this); -    } - -    std::recursive_mutex mutex; - -private: -    /// Returns a list of cached objects from the specified memory region, ordered by access time -    std::vector<T> GetSortedObjectsFromRegion(VAddr addr, u64 size) { -        if (size == 0) { -            return {}; -        } - -        std::vector<T> objects; -        const ObjectInterval interval{addr, addr + size}; -        for (auto& pair : boost::make_iterator_range(interval_cache.equal_range(interval))) { -            for (auto& cached_object : pair.second) { -                if (!cached_object) { -                    continue; -                } -                objects.push_back(cached_object); -            } -        } - -        std::sort(objects.begin(), objects.end(), [](const T& a, const T& b) -> bool { -            return a->GetLastModifiedTicks() < b->GetLastModifiedTicks(); -        }); - -        
return objects; -    } - -    using ObjectSet = std::set<T>; -    using ObjectCache = std::unordered_map<VAddr, T>; -    using IntervalCache = boost::icl::interval_map<VAddr, ObjectSet>; -    using ObjectInterval = typename IntervalCache::interval_type; - -    static auto GetInterval(const T& object) { -        return ObjectInterval::right_open(object->GetCpuAddr(), -                                          object->GetCpuAddr() + object->GetSizeInBytes()); -    } - -    ObjectCache map_cache; -    IntervalCache interval_cache; ///< Cache of objects -    u64 modified_ticks{};         ///< Counter of cache state ticks, used for in-order flushing -    VideoCore::RasterizerInterface& rasterizer; -    std::list<T> marked_for_unregister; -}; diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp index 9964ea894..ad0577a4f 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp @@ -22,13 +22,12 @@ using Maxwell = Tegra::Engines::Maxwell3D::Regs;  MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128)); -CachedBufferBlock::CachedBufferBlock(VAddr cpu_addr, const std::size_t size) -    : VideoCommon::BufferBlock{cpu_addr, size} { +Buffer::Buffer(VAddr cpu_addr, const std::size_t size) : VideoCommon::BufferBlock{cpu_addr, size} {      gl_buffer.Create();      glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW);  } -CachedBufferBlock::~CachedBufferBlock() = default; +Buffer::~Buffer() = default;  OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,                                 const Device& device, std::size_t stream_size) @@ -48,12 +47,8 @@ OGLBufferCache::~OGLBufferCache() {      glDeleteBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs));  } -Buffer OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { -    
return std::make_shared<CachedBufferBlock>(cpu_addr, size); -} - -GLuint OGLBufferCache::ToHandle(const Buffer& buffer) { -    return buffer->GetHandle(); +std::shared_ptr<Buffer> OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { +    return std::make_shared<Buffer>(cpu_addr, size);  }  GLuint OGLBufferCache::GetEmptyBuffer(std::size_t) { @@ -62,7 +57,7 @@ GLuint OGLBufferCache::GetEmptyBuffer(std::size_t) {  void OGLBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,                                       const u8* data) { -    glNamedBufferSubData(buffer->GetHandle(), static_cast<GLintptr>(offset), +    glNamedBufferSubData(buffer.Handle(), static_cast<GLintptr>(offset),                           static_cast<GLsizeiptr>(size), data);  } @@ -70,20 +65,20 @@ void OGLBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset,                                         u8* data) {      MICROPROFILE_SCOPE(OpenGL_Buffer_Download);      glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT); -    glGetNamedBufferSubData(buffer->GetHandle(), static_cast<GLintptr>(offset), +    glGetNamedBufferSubData(buffer.Handle(), static_cast<GLintptr>(offset),                              static_cast<GLsizeiptr>(size), data);  }  void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,                                 std::size_t dst_offset, std::size_t size) { -    glCopyNamedBufferSubData(src->GetHandle(), dst->GetHandle(), static_cast<GLintptr>(src_offset), +    glCopyNamedBufferSubData(src.Handle(), dst.Handle(), static_cast<GLintptr>(src_offset),                               static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size));  }  OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer,                                                               std::size_t size) {      DEBUG_ASSERT(cbuf_cursor < std::size(cbufs)); -    const GLuint& cbuf = 
cbufs[cbuf_cursor++]; +    const GLuint cbuf = cbufs[cbuf_cursor++];      glNamedBufferSubData(cbuf, 0, static_cast<GLsizeiptr>(size), raw_pointer);      return {cbuf, 0};  } diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index a9e86cfc7..a49aaf9c4 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -10,7 +10,6 @@  #include "common/common_types.h"  #include "video_core/buffer_cache/buffer_cache.h"  #include "video_core/engines/maxwell_3d.h" -#include "video_core/rasterizer_cache.h"  #include "video_core/renderer_opengl/gl_resource_manager.h"  #include "video_core/renderer_opengl/gl_stream_buffer.h" @@ -24,17 +23,12 @@ class Device;  class OGLStreamBuffer;  class RasterizerOpenGL; -class CachedBufferBlock; - -using Buffer = std::shared_ptr<CachedBufferBlock>; -using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>; - -class CachedBufferBlock : public VideoCommon::BufferBlock { +class Buffer : public VideoCommon::BufferBlock {  public: -    explicit CachedBufferBlock(VAddr cpu_addr, const std::size_t size); -    ~CachedBufferBlock(); +    explicit Buffer(VAddr cpu_addr, const std::size_t size); +    ~Buffer(); -    GLuint GetHandle() const { +    GLuint Handle() const {          return gl_buffer.handle;      } @@ -42,6 +36,7 @@ private:      OGLBuffer gl_buffer;  }; +using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>;  class OGLBufferCache final : public GenericBufferCache {  public:      explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, @@ -55,9 +50,7 @@ public:      }  protected: -    Buffer CreateBlock(VAddr cpu_addr, std::size_t size) override; - -    GLuint ToHandle(const Buffer& buffer) override; +    std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override;      void UploadBlockData(const Buffer& buffer, std::size_t 
offset, std::size_t size,                           const u8* data) override; diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index b772c37d9..890fc6c63 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -185,12 +185,20 @@ bool IsASTCSupported() {  Device::Device()      : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} {      const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR)); -    const auto renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER)); +    const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION));      const std::vector extensions = GetExtensions();      const bool is_nvidia = vendor == "NVIDIA Corporation";      const bool is_amd = vendor == "ATI Technologies Inc."; +    bool disable_fast_buffer_sub_data = false; +    if (is_nvidia && version == "4.6.0 NVIDIA 443.24") { +        LOG_WARNING( +            Render_OpenGL, +            "Beta driver 443.24 is known to have issues. 
There might be performance issues."); +        disable_fast_buffer_sub_data = true; +    } +      uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);      shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);      max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS); @@ -204,9 +212,10 @@ Device::Device()      has_variable_aoffi = TestVariableAoffi();      has_component_indexing_bug = is_amd;      has_precise_bug = TestPreciseBug(); -    has_fast_buffer_sub_data = is_nvidia; +    has_fast_buffer_sub_data = is_nvidia && !disable_fast_buffer_sub_data;      use_assembly_shaders = Settings::values.use_assembly_shaders && GLAD_GL_NV_gpu_program5 && -                           GLAD_GL_NV_compute_program5; +                           GLAD_GL_NV_compute_program5 && GLAD_GL_NV_transform_feedback && +                           GLAD_GL_NV_transform_feedback2;      LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi);      LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug); diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 55e79aaf6..2d6c11320 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -30,6 +30,7 @@  #include "video_core/renderer_opengl/gl_shader_cache.h"  #include "video_core/renderer_opengl/maxwell_to_gl.h"  #include "video_core/renderer_opengl/renderer_opengl.h" +#include "video_core/shader_cache.h"  namespace OpenGL { @@ -65,10 +66,22 @@ constexpr std::size_t NumSupportedVertexAttributes = 16;  template <typename Engine, typename Entry>  Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry,                                                 ShaderType shader_type, std::size_t index = 0) { +    if constexpr (std::is_same_v<Entry, SamplerEntry>) { +        
if (entry.is_separated) { +            const u32 buffer_1 = entry.buffer; +            const u32 buffer_2 = entry.secondary_buffer; +            const u32 offset_1 = entry.offset; +            const u32 offset_2 = entry.secondary_offset; +            const u32 handle_1 = engine.AccessConstBuffer32(shader_type, buffer_1, offset_1); +            const u32 handle_2 = engine.AccessConstBuffer32(shader_type, buffer_2, offset_2); +            return engine.GetTextureInfo(handle_1 | handle_2); +        } +    }      if (entry.is_bindless) { -        const auto tex_handle = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset); -        return engine.GetTextureInfo(tex_handle); +        const u32 handle = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset); +        return engine.GetTextureInfo(handle);      } +      const auto& gpu_profile = engine.AccessGuestDriverProfile();      const u32 offset = entry.offset + static_cast<u32>(index * gpu_profile.GetTextureHandlerSize());      if constexpr (std::is_same_v<Engine, Tegra::Engines::Maxwell3D>) { @@ -93,6 +106,34 @@ std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer,      return buffer.size;  } +/// Translates hardware transform feedback indices +/// @param location Hardware location +/// @return Pair of ARB_transform_feedback3 token stream first and third arguments +/// @note Read https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_transform_feedback3.txt +std::pair<GLint, GLint> TransformFeedbackEnum(u8 location) { +    const u8 index = location / 4; +    if (index >= 8 && index <= 39) { +        return {GL_GENERIC_ATTRIB_NV, index - 8}; +    } +    if (index >= 48 && index <= 55) { +        return {GL_TEXTURE_COORD_NV, index - 48}; +    } +    switch (index) { +    case 7: +        return {GL_POSITION, 0}; +    case 40: +        return {GL_PRIMARY_COLOR_NV, 0}; +    case 41: +        return {GL_SECONDARY_COLOR_NV, 0}; +    case 42: +        return 
{GL_BACK_PRIMARY_COLOR_NV, 0}; +    case 43: +        return {GL_BACK_SECONDARY_COLOR_NV, 0}; +    } +    UNIMPLEMENTED_MSG("index={}", static_cast<int>(index)); +    return {GL_POSITION, 0}; +} +  void oglEnable(GLenum cap, bool state) {      (state ? glEnable : glDisable)(cap);  } @@ -282,7 +323,7 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {              continue;          } -        Shader shader{shader_cache.GetStageProgram(program)}; +        Shader* const shader = shader_cache.GetStageProgram(program);          if (device.UseAssemblyShaders()) {              // Check for ARB limitation. We only have 16 SSBOs per context state. To workaround this @@ -576,7 +617,16 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {                     (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());      // Prepare the vertex array. -    buffer_cache.Map(buffer_size); +    const bool invalidated = buffer_cache.Map(buffer_size); + +    if (invalidated) { +        // When the stream buffer has been invalidated, we have to consider vertex buffers as dirty +        auto& dirty = gpu.dirty.flags; +        dirty[Dirty::VertexBuffers] = true; +        for (int index = Dirty::VertexBuffer0; index <= Dirty::VertexBuffer31; ++index) { +            dirty[index] = true; +        } +    }      // Prepare vertex array format.      
SetupVertexFormat(); @@ -842,7 +892,7 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,      return true;  } -void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader) { +void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, Shader* shader) {      static constexpr std::array PARAMETER_LUT = {          GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV,          GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV, @@ -872,7 +922,7 @@ void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shad      }  } -void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) { +void RasterizerOpenGL::SetupComputeConstBuffers(Shader* kernel) {      MICROPROFILE_SCOPE(OpenGL_UBO);      const auto& launch_desc = system.GPU().KeplerCompute().launch_description;      const auto& entries = kernel->GetEntries(); @@ -941,7 +991,7 @@ void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,      }  } -void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader) { +void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader) {      auto& gpu{system.GPU()};      auto& memory_manager{gpu.MemoryManager()};      const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]}; @@ -956,7 +1006,7 @@ void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shad      }  } -void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) { +void RasterizerOpenGL::SetupComputeGlobalMemory(Shader* kernel) {      auto& gpu{system.GPU()};      auto& memory_manager{gpu.MemoryManager()};      const auto cbufs{gpu.KeplerCompute().launch_description.const_buffer_config}; @@ -979,7 +1029,7 @@ void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& e                        static_cast<GLsizeiptr>(size));  } -void 
RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, const Shader& shader) { +void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, Shader* shader) {      MICROPROFILE_SCOPE(OpenGL_Texture);      const auto& maxwell3d = system.GPU().Maxwell3D();      u32 binding = device.GetBaseBindings(stage_index).sampler; @@ -992,7 +1042,7 @@ void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, const Shader&      }  } -void RasterizerOpenGL::SetupComputeTextures(const Shader& kernel) { +void RasterizerOpenGL::SetupComputeTextures(Shader* kernel) {      MICROPROFILE_SCOPE(OpenGL_Texture);      const auto& compute = system.GPU().KeplerCompute();      u32 binding = 0; @@ -1021,7 +1071,7 @@ void RasterizerOpenGL::SetupTexture(u32 binding, const Tegra::Texture::FullTextu      }  } -void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& shader) { +void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, Shader* shader) {      const auto& maxwell3d = system.GPU().Maxwell3D();      u32 binding = device.GetBaseBindings(stage_index).image;      for (const auto& entry : shader->GetEntries().images) { @@ -1031,7 +1081,7 @@ void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& sh      }  } -void RasterizerOpenGL::SetupComputeImages(const Shader& shader) { +void RasterizerOpenGL::SetupComputeImages(Shader* shader) {      const auto& compute = system.GPU().KeplerCompute();      u32 binding = 0;      for (const auto& entry : shader->GetEntries().images) { @@ -1547,12 +1597,70 @@ void RasterizerOpenGL::SyncFramebufferSRGB() {      oglEnable(GL_FRAMEBUFFER_SRGB, gpu.regs.framebuffer_srgb);  } +void RasterizerOpenGL::SyncTransformFeedback() { +    // TODO(Rodrigo): Inject SKIP_COMPONENTS*_NV when required. An unimplemented message will signal +    // when this is required. 
+    const auto& regs = system.GPU().Maxwell3D().regs; + +    static constexpr std::size_t STRIDE = 3; +    std::array<GLint, 128 * STRIDE * Maxwell::NumTransformFeedbackBuffers> attribs; +    std::array<GLint, Maxwell::NumTransformFeedbackBuffers> streams; + +    GLint* cursor = attribs.data(); +    GLint* current_stream = streams.data(); + +    for (std::size_t feedback = 0; feedback < Maxwell::NumTransformFeedbackBuffers; ++feedback) { +        const auto& layout = regs.tfb_layouts[feedback]; +        UNIMPLEMENTED_IF_MSG(layout.stride != layout.varying_count * 4, "Stride padding"); +        if (layout.varying_count == 0) { +            continue; +        } + +        *current_stream = static_cast<GLint>(feedback); +        if (current_stream != streams.data()) { +            // When stepping one stream, push the expected token +            cursor[0] = GL_NEXT_BUFFER_NV; +            cursor[1] = 0; +            cursor[2] = 0; +            cursor += STRIDE; +        } +        ++current_stream; + +        const auto& locations = regs.tfb_varying_locs[feedback]; +        std::optional<u8> current_index; +        for (u32 offset = 0; offset < layout.varying_count; ++offset) { +            const u8 location = locations[offset]; +            const u8 index = location / 4; + +            if (current_index == index) { +                // Increase number of components of the previous attachment +                ++cursor[-2]; +                continue; +            } +            current_index = index; + +            std::tie(cursor[0], cursor[2]) = TransformFeedbackEnum(location); +            cursor[1] = 1; +            cursor += STRIDE; +        } +    } + +    const GLsizei num_attribs = static_cast<GLsizei>((cursor - attribs.data()) / STRIDE); +    const GLsizei num_strides = static_cast<GLsizei>(current_stream - streams.data()); +    glTransformFeedbackStreamAttribsNV(num_attribs, attribs.data(), num_strides, streams.data(), +                                       
GL_INTERLEAVED_ATTRIBS); +} +  void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) {      const auto& regs = system.GPU().Maxwell3D().regs;      if (regs.tfb_enabled == 0) {          return;      } +    if (device.UseAssemblyShaders()) { +        SyncTransformFeedback(); +    } +      UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) ||                       regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) ||                       regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry)); @@ -1579,6 +1687,10 @@ void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) {                            static_cast<GLsizeiptr>(size));      } +    // We may have to call BeginTransformFeedbackNV here since they seem to call different +    // implementations on Nvidia's driver (the pointer is different) but we are using +    // ARB_transform_feedback3 features with NV_transform_feedback interactions and the ARB +    // extension doesn't define BeginTransformFeedback (without NV) interactions. It just works.      glBeginTransformFeedback(GL_POINTS);  } diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index f5dc56a0e..4f082592f 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -19,7 +19,6 @@  #include "video_core/engines/const_buffer_info.h"  #include "video_core/engines/maxwell_3d.h"  #include "video_core/rasterizer_accelerated.h" -#include "video_core/rasterizer_cache.h"  #include "video_core/rasterizer_interface.h"  #include "video_core/renderer_opengl/gl_buffer_cache.h"  #include "video_core/renderer_opengl/gl_device.h" @@ -100,10 +99,10 @@ private:      void ConfigureClearFramebuffer(bool using_color, bool using_depth_stencil);      /// Configures the current constbuffers to use for the draw command. 
-    void SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader); +    void SetupDrawConstBuffers(std::size_t stage_index, Shader* shader);      /// Configures the current constbuffers to use for the kernel invocation. -    void SetupComputeConstBuffers(const Shader& kernel); +    void SetupComputeConstBuffers(Shader* kernel);      /// Configures a constant buffer.      void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, @@ -111,30 +110,30 @@ private:                            std::size_t unified_offset);      /// Configures the current global memory entries to use for the draw command. -    void SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader); +    void SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader);      /// Configures the current global memory entries to use for the kernel invocation. -    void SetupComputeGlobalMemory(const Shader& kernel); +    void SetupComputeGlobalMemory(Shader* kernel);      /// Configures a constant buffer.      void SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr,                             std::size_t size);      /// Configures the current textures to use for the draw command. -    void SetupDrawTextures(std::size_t stage_index, const Shader& shader); +    void SetupDrawTextures(std::size_t stage_index, Shader* shader);      /// Configures the textures used in a compute shader. -    void SetupComputeTextures(const Shader& kernel); +    void SetupComputeTextures(Shader* kernel);      /// Configures a texture.      void SetupTexture(u32 binding, const Tegra::Texture::FullTextureInfo& texture,                        const SamplerEntry& entry);      /// Configures images in a graphics shader. -    void SetupDrawImages(std::size_t stage_index, const Shader& shader); +    void SetupDrawImages(std::size_t stage_index, Shader* shader);      /// Configures images in a compute shader. 
-    void SetupComputeImages(const Shader& shader); +    void SetupComputeImages(Shader* shader);      /// Configures an image.      void SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic, const ImageEntry& entry); @@ -202,6 +201,10 @@ private:      /// Syncs the framebuffer sRGB state to match the guest state      void SyncFramebufferSRGB(); +    /// Syncs transform feedback state to match guest state +    /// @note Only valid on assembly shaders +    void SyncTransformFeedback(); +      /// Begin a transform feedback      void BeginTransformFeedback(GLenum primitive_mode); diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index a991ca64a..c28486b1d 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -29,6 +29,7 @@  #include "video_core/shader/memory_util.h"  #include "video_core/shader/registry.h"  #include "video_core/shader/shader_ir.h" +#include "video_core/shader_cache.h"  namespace OpenGL { @@ -194,12 +195,9 @@ std::unordered_set<GLenum> GetSupportedFormats() {  } // Anonymous namespace -CachedShader::CachedShader(VAddr cpu_addr, std::size_t size_in_bytes, -                           std::shared_ptr<VideoCommon::Shader::Registry> registry, -                           ShaderEntries entries, ProgramSharedPtr program_) -    : RasterizerCacheObject{cpu_addr}, registry{std::move(registry)}, entries{std::move(entries)}, -      size_in_bytes{size_in_bytes}, program{std::move(program_)} { -    // Assign either the assembly program or source program. We can't have both. 
+Shader::Shader(std::shared_ptr<VideoCommon::Shader::Registry> registry_, ShaderEntries entries_, +               ProgramSharedPtr program_) +    : registry{std::move(registry_)}, entries{std::move(entries_)}, program{std::move(program_)} {      handle = program->assembly_program.handle;      if (handle == 0) {          handle = program->source_program.handle; @@ -207,16 +205,16 @@ CachedShader::CachedShader(VAddr cpu_addr, std::size_t size_in_bytes,      ASSERT(handle != 0);  } -CachedShader::~CachedShader() = default; +Shader::~Shader() = default; -GLuint CachedShader::GetHandle() const { +GLuint Shader::GetHandle() const {      DEBUG_ASSERT(registry->IsConsistent());      return handle;  } -Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params, -                                           Maxwell::ShaderProgram program_type, ProgramCode code, -                                           ProgramCode code_b) { +std::unique_ptr<Shader> Shader::CreateStageFromMemory(const ShaderParameters& params, +                                                      Maxwell::ShaderProgram program_type, +                                                      ProgramCode code, ProgramCode code_b) {      const auto shader_type = GetShaderType(program_type);      const std::size_t size_in_bytes = code.size() * sizeof(u64); @@ -241,12 +239,12 @@ Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,      entry.bindless_samplers = registry->GetBindlessSamplers();      params.disk_cache.SaveEntry(std::move(entry)); -    return std::shared_ptr<CachedShader>( -        new CachedShader(params.cpu_addr, size_in_bytes, std::move(registry), -                         MakeEntries(params.device, ir, shader_type), std::move(program))); +    return std::unique_ptr<Shader>(new Shader( +        std::move(registry), MakeEntries(params.device, ir, shader_type), std::move(program)));  } -Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, 
ProgramCode code) { +std::unique_ptr<Shader> Shader::CreateKernelFromMemory(const ShaderParameters& params, +                                                       ProgramCode code) {      const std::size_t size_in_bytes = code.size() * sizeof(u64);      auto& engine = params.system.GPU().KeplerCompute(); @@ -266,23 +264,23 @@ Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, Prog      entry.bindless_samplers = registry->GetBindlessSamplers();      params.disk_cache.SaveEntry(std::move(entry)); -    return std::shared_ptr<CachedShader>( -        new CachedShader(params.cpu_addr, size_in_bytes, std::move(registry), -                         MakeEntries(params.device, ir, ShaderType::Compute), std::move(program))); +    return std::unique_ptr<Shader>(new Shader(std::move(registry), +                                              MakeEntries(params.device, ir, ShaderType::Compute), +                                              std::move(program)));  } -Shader CachedShader::CreateFromCache(const ShaderParameters& params, -                                     const PrecompiledShader& precompiled_shader, -                                     std::size_t size_in_bytes) { -    return std::shared_ptr<CachedShader>( -        new CachedShader(params.cpu_addr, size_in_bytes, precompiled_shader.registry, -                         precompiled_shader.entries, precompiled_shader.program)); +std::unique_ptr<Shader> Shader::CreateFromCache(const ShaderParameters& params, +                                                const PrecompiledShader& precompiled_shader) { +    return std::unique_ptr<Shader>(new Shader( +        precompiled_shader.registry, precompiled_shader.entries, precompiled_shader.program));  }  ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system,                                       Core::Frontend::EmuWindow& emu_window, const Device& device) -    : RasterizerCache{rasterizer}, system{system}, 
emu_window{emu_window}, device{device}, -      disk_cache{system} {} +    : VideoCommon::ShaderCache<Shader>{rasterizer}, system{system}, +      emu_window{emu_window}, device{device}, disk_cache{system} {} + +ShaderCacheOpenGL::~ShaderCacheOpenGL() = default;  void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,                                        const VideoCore::DiskResourceLoadCallback& callback) { @@ -436,7 +434,7 @@ ProgramSharedPtr ShaderCacheOpenGL::GeneratePrecompiledProgram(      return program;  } -Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { +Shader* ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {      if (!system.GPU().Maxwell3D().dirty.flags[Dirty::Shaders]) {          return last_shaders[static_cast<std::size_t>(program)];      } @@ -446,8 +444,7 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {      // Look up shader in the cache based on address      const auto cpu_addr{memory_manager.GpuToCpuAddress(address)}; -    Shader shader{cpu_addr ? TryGet(*cpu_addr) : null_shader}; -    if (shader) { +    if (Shader* const shader{cpu_addr ? 
TryGet(*cpu_addr) : null_shader.get()}) {          return last_shaders[static_cast<std::size_t>(program)] = shader;      } @@ -468,30 +465,29 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {      const ShaderParameters params{system,    disk_cache, device,                                    *cpu_addr, host_ptr,   unique_identifier}; +    std::unique_ptr<Shader> shader;      const auto found = runtime_cache.find(unique_identifier);      if (found == runtime_cache.end()) { -        shader = CachedShader::CreateStageFromMemory(params, program, std::move(code), -                                                     std::move(code_b)); +        shader = Shader::CreateStageFromMemory(params, program, std::move(code), std::move(code_b));      } else { -        const std::size_t size_in_bytes = code.size() * sizeof(u64); -        shader = CachedShader::CreateFromCache(params, found->second, size_in_bytes); +        shader = Shader::CreateFromCache(params, found->second);      } +    Shader* const result = shader.get();      if (cpu_addr) { -        Register(shader); +        Register(std::move(shader), *cpu_addr, code.size() * sizeof(u64));      } else { -        null_shader = shader; +        null_shader = std::move(shader);      } -    return last_shaders[static_cast<std::size_t>(program)] = shader; +    return last_shaders[static_cast<std::size_t>(program)] = result;  } -Shader ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) { +Shader* ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) {      auto& memory_manager{system.GPU().MemoryManager()};      const auto cpu_addr{memory_manager.GpuToCpuAddress(code_addr)}; -    auto kernel = cpu_addr ? TryGet(*cpu_addr) : null_kernel; -    if (kernel) { +    if (Shader* const kernel = cpu_addr ? 
TryGet(*cpu_addr) : null_kernel.get()) {          return kernel;      } @@ -503,20 +499,21 @@ Shader ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) {      const ShaderParameters params{system,    disk_cache, device,                                    *cpu_addr, host_ptr,   unique_identifier}; +    std::unique_ptr<Shader> kernel;      const auto found = runtime_cache.find(unique_identifier);      if (found == runtime_cache.end()) { -        kernel = CachedShader::CreateKernelFromMemory(params, std::move(code)); +        kernel = Shader::CreateKernelFromMemory(params, std::move(code));      } else { -        const std::size_t size_in_bytes = code.size() * sizeof(u64); -        kernel = CachedShader::CreateFromCache(params, found->second, size_in_bytes); +        kernel = Shader::CreateFromCache(params, found->second);      } +    Shader* const result = kernel.get();      if (cpu_addr) { -        Register(kernel); +        Register(std::move(kernel), *cpu_addr, code.size() * sizeof(u64));      } else { -        null_kernel = kernel; +        null_kernel = std::move(kernel);      } -    return kernel; +    return result;  }  } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index b2ae8d7f9..6848f1388 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -18,12 +18,12 @@  #include "common/common_types.h"  #include "video_core/engines/shader_type.h" -#include "video_core/rasterizer_cache.h"  #include "video_core/renderer_opengl/gl_resource_manager.h"  #include "video_core/renderer_opengl/gl_shader_decompiler.h"  #include "video_core/renderer_opengl/gl_shader_disk_cache.h"  #include "video_core/shader/registry.h"  #include "video_core/shader/shader_ir.h" +#include "video_core/shader_cache.h"  namespace Core {  class System; @@ -35,12 +35,10 @@ class EmuWindow;  namespace OpenGL { -class CachedShader;  class Device; 
 class RasterizerOpenGL;  struct UnspecializedShader; -using Shader = std::shared_ptr<CachedShader>;  using Maxwell = Tegra::Engines::Maxwell3D::Regs;  struct ProgramHandle { @@ -64,62 +62,53 @@ struct ShaderParameters {      u64 unique_identifier;  }; -class CachedShader final : public RasterizerCacheObject { +class Shader final {  public: -    ~CachedShader(); +    ~Shader();      /// Gets the GL program handle for the shader      GLuint GetHandle() const; -    /// Returns the size in bytes of the shader -    std::size_t GetSizeInBytes() const override { -        return size_in_bytes; -    } -      /// Gets the shader entries for the shader      const ShaderEntries& GetEntries() const {          return entries;      } -    static Shader CreateStageFromMemory(const ShaderParameters& params, -                                        Maxwell::ShaderProgram program_type, -                                        ProgramCode program_code, ProgramCode program_code_b); -    static Shader CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code); +    static std::unique_ptr<Shader> CreateStageFromMemory(const ShaderParameters& params, +                                                         Maxwell::ShaderProgram program_type, +                                                         ProgramCode program_code, +                                                         ProgramCode program_code_b); +    static std::unique_ptr<Shader> CreateKernelFromMemory(const ShaderParameters& params, +                                                          ProgramCode code); -    static Shader CreateFromCache(const ShaderParameters& params, -                                  const PrecompiledShader& precompiled_shader, -                                  std::size_t size_in_bytes); +    static std::unique_ptr<Shader> CreateFromCache(const ShaderParameters& params, +                                                   const PrecompiledShader& precompiled_shader);  private: -  
  explicit CachedShader(VAddr cpu_addr, std::size_t size_in_bytes, -                          std::shared_ptr<VideoCommon::Shader::Registry> registry, -                          ShaderEntries entries, ProgramSharedPtr program); +    explicit Shader(std::shared_ptr<VideoCommon::Shader::Registry> registry, ShaderEntries entries, +                    ProgramSharedPtr program);      std::shared_ptr<VideoCommon::Shader::Registry> registry;      ShaderEntries entries; -    std::size_t size_in_bytes = 0;      ProgramSharedPtr program;      GLuint handle = 0;  }; -class ShaderCacheOpenGL final : public RasterizerCache<Shader> { +class ShaderCacheOpenGL final : public VideoCommon::ShaderCache<Shader> {  public:      explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system,                                 Core::Frontend::EmuWindow& emu_window, const Device& device); +    ~ShaderCacheOpenGL() override;      /// Loads disk cache for the current game      void LoadDiskCache(const std::atomic_bool& stop_loading,                         const VideoCore::DiskResourceLoadCallback& callback);      /// Gets the current specified shader stage program -    Shader GetStageProgram(Maxwell::ShaderProgram program); +    Shader* GetStageProgram(Maxwell::ShaderProgram program);      /// Gets a compute kernel in the passed address -    Shader GetComputeKernel(GPUVAddr code_addr); - -protected: -    // We do not have to flush this cache as things in it are never modified by us. 
-    void FlushObjectInner(const Shader& object) override {} +    Shader* GetComputeKernel(GPUVAddr code_addr);  private:      ProgramSharedPtr GeneratePrecompiledProgram( @@ -132,10 +121,10 @@ private:      ShaderDiskCacheOpenGL disk_cache;      std::unordered_map<u64, PrecompiledShader> runtime_cache; -    Shader null_shader{}; -    Shader null_kernel{}; +    std::unique_ptr<Shader> null_shader; +    std::unique_ptr<Shader> null_kernel; -    std::array<Shader, Maxwell::MaxShaderProgram> last_shaders; +    std::array<Shader*, Maxwell::MaxShaderProgram> last_shaders{};  };  } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index 502b95973..d6e30b321 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -626,7 +626,9 @@ private:                  break;              }          } -        if (stage != ShaderType::Vertex || device.HasVertexViewportLayer()) { + +        if (stage != ShaderType::Geometry && +            (stage != ShaderType::Vertex || device.HasVertexViewportLayer())) {              if (ir.UsesLayer()) {                  code.AddLine("int gl_Layer;");              } @@ -655,6 +657,16 @@ private:          --code.scope;          code.AddLine("}};");          code.AddNewLine(); + +        if (stage == ShaderType::Geometry) { +            if (ir.UsesLayer()) { +                code.AddLine("out int gl_Layer;"); +            } +            if (ir.UsesViewportIndex()) { +                code.AddLine("out int gl_ViewportIndex;"); +            } +        } +        code.AddNewLine();      }      void DeclareRegisters() { diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp index 9e95a122b..653c3f2f9 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp +++ 
b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp @@ -29,6 +29,8 @@ using VideoCommon::Shader::KeyMap;  namespace { +using VideoCommon::Shader::SeparateSamplerKey; +  using ShaderCacheVersionHash = std::array<u8, 64>;  struct ConstBufferKey { @@ -37,18 +39,26 @@ struct ConstBufferKey {      u32 value = 0;  }; -struct BoundSamplerKey { +struct BoundSamplerEntry {      u32 offset = 0;      Tegra::Engines::SamplerDescriptor sampler;  }; -struct BindlessSamplerKey { +struct SeparateSamplerEntry { +    u32 cbuf1 = 0; +    u32 cbuf2 = 0; +    u32 offset1 = 0; +    u32 offset2 = 0; +    Tegra::Engines::SamplerDescriptor sampler; +}; + +struct BindlessSamplerEntry {      u32 cbuf = 0;      u32 offset = 0;      Tegra::Engines::SamplerDescriptor sampler;  }; -constexpr u32 NativeVersion = 20; +constexpr u32 NativeVersion = 21;  ShaderCacheVersionHash GetShaderCacheVersionHash() {      ShaderCacheVersionHash hash{}; @@ -87,12 +97,14 @@ bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) {      u32 texture_handler_size_value;      u32 num_keys;      u32 num_bound_samplers; +    u32 num_separate_samplers;      u32 num_bindless_samplers;      if (file.ReadArray(&unique_identifier, 1) != 1 || file.ReadArray(&bound_buffer, 1) != 1 ||          file.ReadArray(&is_texture_handler_size_known, 1) != 1 ||          file.ReadArray(&texture_handler_size_value, 1) != 1 ||          file.ReadArray(&graphics_info, 1) != 1 || file.ReadArray(&compute_info, 1) != 1 ||          file.ReadArray(&num_keys, 1) != 1 || file.ReadArray(&num_bound_samplers, 1) != 1 || +        file.ReadArray(&num_separate_samplers, 1) != 1 ||          file.ReadArray(&num_bindless_samplers, 1) != 1) {          return false;      } @@ -101,23 +113,32 @@ bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) {      }      std::vector<ConstBufferKey> flat_keys(num_keys); -    std::vector<BoundSamplerKey> flat_bound_samplers(num_bound_samplers); -    std::vector<BindlessSamplerKey> 
flat_bindless_samplers(num_bindless_samplers); +    std::vector<BoundSamplerEntry> flat_bound_samplers(num_bound_samplers); +    std::vector<SeparateSamplerEntry> flat_separate_samplers(num_separate_samplers); +    std::vector<BindlessSamplerEntry> flat_bindless_samplers(num_bindless_samplers);      if (file.ReadArray(flat_keys.data(), flat_keys.size()) != flat_keys.size() ||          file.ReadArray(flat_bound_samplers.data(), flat_bound_samplers.size()) !=              flat_bound_samplers.size() || +        file.ReadArray(flat_separate_samplers.data(), flat_separate_samplers.size()) != +            flat_separate_samplers.size() ||          file.ReadArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) !=              flat_bindless_samplers.size()) {          return false;      } -    for (const auto& key : flat_keys) { -        keys.insert({{key.cbuf, key.offset}, key.value}); +    for (const auto& entry : flat_keys) { +        keys.insert({{entry.cbuf, entry.offset}, entry.value});      } -    for (const auto& key : flat_bound_samplers) { -        bound_samplers.emplace(key.offset, key.sampler); +    for (const auto& entry : flat_bound_samplers) { +        bound_samplers.emplace(entry.offset, entry.sampler);      } -    for (const auto& key : flat_bindless_samplers) { -        bindless_samplers.insert({{key.cbuf, key.offset}, key.sampler}); +    for (const auto& entry : flat_separate_samplers) { +        SeparateSamplerKey key; +        key.buffers = {entry.cbuf1, entry.cbuf2}; +        key.offsets = {entry.offset1, entry.offset2}; +        separate_samplers.emplace(key, entry.sampler); +    } +    for (const auto& entry : flat_bindless_samplers) { +        bindless_samplers.insert({{entry.cbuf, entry.offset}, entry.sampler});      }      return true; @@ -142,6 +163,7 @@ bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const {          file.WriteObject(graphics_info) != 1 || file.WriteObject(compute_info) != 1 ||          
file.WriteObject(static_cast<u32>(keys.size())) != 1 ||          file.WriteObject(static_cast<u32>(bound_samplers.size())) != 1 || +        file.WriteObject(static_cast<u32>(separate_samplers.size())) != 1 ||          file.WriteObject(static_cast<u32>(bindless_samplers.size())) != 1) {          return false;      } @@ -152,22 +174,34 @@ bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const {          flat_keys.push_back(ConstBufferKey{address.first, address.second, value});      } -    std::vector<BoundSamplerKey> flat_bound_samplers; +    std::vector<BoundSamplerEntry> flat_bound_samplers;      flat_bound_samplers.reserve(bound_samplers.size());      for (const auto& [address, sampler] : bound_samplers) { -        flat_bound_samplers.push_back(BoundSamplerKey{address, sampler}); +        flat_bound_samplers.push_back(BoundSamplerEntry{address, sampler}); +    } + +    std::vector<SeparateSamplerEntry> flat_separate_samplers; +    flat_separate_samplers.reserve(separate_samplers.size()); +    for (const auto& [key, sampler] : separate_samplers) { +        SeparateSamplerEntry entry; +        std::tie(entry.cbuf1, entry.cbuf2) = key.buffers; +        std::tie(entry.offset1, entry.offset2) = key.offsets; +        entry.sampler = sampler; +        flat_separate_samplers.push_back(entry);      } -    std::vector<BindlessSamplerKey> flat_bindless_samplers; +    std::vector<BindlessSamplerEntry> flat_bindless_samplers;      flat_bindless_samplers.reserve(bindless_samplers.size());      for (const auto& [address, sampler] : bindless_samplers) {          flat_bindless_samplers.push_back( -            BindlessSamplerKey{address.first, address.second, sampler}); +            BindlessSamplerEntry{address.first, address.second, sampler});      }      return file.WriteArray(flat_keys.data(), flat_keys.size()) == flat_keys.size() &&             file.WriteArray(flat_bound_samplers.data(), flat_bound_samplers.size()) ==                 flat_bound_samplers.size() && +       
    file.WriteArray(flat_separate_samplers.data(), flat_separate_samplers.size()) == +               flat_separate_samplers.size() &&             file.WriteArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) ==                 flat_bindless_samplers.size();  } diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h index d5be52e40..a79cef0e9 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h @@ -57,6 +57,7 @@ struct ShaderDiskCacheEntry {      VideoCommon::Shader::ComputeInfo compute_info;      VideoCommon::Shader::KeyMap keys;      VideoCommon::Shader::BoundSamplerMap bound_samplers; +    VideoCommon::Shader::SeparateSamplerMap separate_samplers;      VideoCommon::Shader::BindlessSamplerMap bindless_samplers;  }; diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp index 6ec328c53..932a2f69e 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp +++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp @@ -49,14 +49,6 @@ OGLStreamBuffer::~OGLStreamBuffer() {      gl_buffer.Release();  } -GLuint OGLStreamBuffer::GetHandle() const { -    return gl_buffer.handle; -} - -GLsizeiptr OGLStreamBuffer::GetSize() const { -    return buffer_size; -} -  std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr alignment) {      ASSERT(size <= buffer_size);      ASSERT(alignment <= buffer_size); diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h index f8383cbd4..866da3594 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.h +++ b/src/video_core/renderer_opengl/gl_stream_buffer.h @@ -17,9 +17,6 @@ public:                               bool use_persistent = true);      ~OGLStreamBuffer(); -    GLuint GetHandle() const; -    GLsizeiptr GetSize() const; -   
   /*       * Allocates a linear chunk of memory in the GPU buffer with at least "size" bytes       * and the optional alignment requirement. @@ -32,6 +29,14 @@ public:      void Unmap(GLsizeiptr size); +    GLuint Handle() const { +        return gl_buffer.handle; +    } + +    GLsizeiptr Size() const { +        return buffer_size; +    } +  private:      OGLBuffer gl_buffer; diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 57db5a08b..61505879b 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -263,9 +263,14 @@ CachedSurface::CachedSurface(const GPUVAddr gpu_addr, const SurfaceParams& param      target = GetTextureTarget(params.target);      texture = CreateTexture(params, target, internal_format, texture_buffer);      DecorateSurfaceName(); -    main_view = CreateViewInner( -        ViewParams(params.target, 0, params.is_layered ? params.depth : 1, 0, params.num_levels), -        true); + +    u32 num_layers = 1; +    if (params.is_layered || params.target == SurfaceTarget::Texture3D) { +        num_layers = params.depth; +    } + +    main_view = +        CreateViewInner(ViewParams(params.target, 0, num_layers, 0, params.num_levels), true);  }  CachedSurface::~CachedSurface() = default; @@ -413,20 +418,23 @@ CachedSurfaceView::CachedSurfaceView(CachedSurface& surface, const ViewParams& p  CachedSurfaceView::~CachedSurfaceView() = default; -void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const { +void CachedSurfaceView::Attach(GLenum attachment, GLenum fb_target) const {      ASSERT(params.num_levels == 1); +    if (params.target == SurfaceTarget::Texture3D) { +        if (params.num_layers > 1) { +            ASSERT(params.base_layer == 0); +            glFramebufferTexture(fb_target, attachment, surface.texture.handle, params.base_level); +        } else { +            
glFramebufferTexture3D(fb_target, attachment, target, surface.texture.handle, +                                   params.base_level, params.base_layer); +        } +        return; +    } +      if (params.num_layers > 1) { -        // Layered framebuffer attachments          UNIMPLEMENTED_IF(params.base_layer != 0); - -        switch (params.target) { -        case SurfaceTarget::Texture2DArray: -            glFramebufferTexture(target, attachment, GetTexture(), 0); -            break; -        default: -            UNIMPLEMENTED(); -        } +        glFramebufferTexture(fb_target, attachment, GetTexture(), 0);          return;      } @@ -434,16 +442,16 @@ void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {      const GLuint texture = surface.GetTexture();      switch (surface.GetSurfaceParams().target) {      case SurfaceTarget::Texture1D: -        glFramebufferTexture1D(target, attachment, view_target, texture, params.base_level); +        glFramebufferTexture1D(fb_target, attachment, view_target, texture, params.base_level);          break;      case SurfaceTarget::Texture2D: -        glFramebufferTexture2D(target, attachment, view_target, texture, params.base_level); +        glFramebufferTexture2D(fb_target, attachment, view_target, texture, params.base_level);          break;      case SurfaceTarget::Texture1DArray:      case SurfaceTarget::Texture2DArray:      case SurfaceTarget::TextureCubemap:      case SurfaceTarget::TextureCubeArray: -        glFramebufferTextureLayer(target, attachment, texture, params.base_level, +        glFramebufferTextureLayer(fb_target, attachment, texture, params.base_level,                                    params.base_layer);          break;      default: @@ -500,8 +508,13 @@ OGLTextureView CachedSurfaceView::CreateTextureView() const {      OGLTextureView texture_view;      texture_view.Create(); -    glTextureView(texture_view.handle, target, surface.texture.handle, format, params.base_level, -        
          params.num_levels, params.base_layer, params.num_layers); +    if (target == GL_TEXTURE_3D) { +        glTextureView(texture_view.handle, target, surface.texture.handle, format, +                      params.base_level, params.num_levels, 0, 1); +    } else { +        glTextureView(texture_view.handle, target, surface.texture.handle, format, +                      params.base_level, params.num_levels, params.base_layer, params.num_layers); +    }      ApplyTextureDefaults(surface.GetSurfaceParams(), texture_view.handle);      return texture_view; @@ -544,8 +557,8 @@ void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view,                                     const Tegra::Engines::Fermi2D::Config& copy_config) {      const auto& src_params{src_view->GetSurfaceParams()};      const auto& dst_params{dst_view->GetSurfaceParams()}; -    UNIMPLEMENTED_IF(src_params.target == SurfaceTarget::Texture3D); -    UNIMPLEMENTED_IF(dst_params.target == SurfaceTarget::Texture3D); +    UNIMPLEMENTED_IF(src_params.depth != 1); +    UNIMPLEMENTED_IF(dst_params.depth != 1);      state_tracker.NotifyScissor0();      state_tracker.NotifyFramebuffer(); diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index 8a2ac8603..bfc4ddf5d 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -80,8 +80,10 @@ public:      explicit CachedSurfaceView(CachedSurface& surface, const ViewParams& params, bool is_proxy);      ~CachedSurfaceView(); -    /// Attaches this texture view to the current bound GL_DRAW_FRAMEBUFFER -    void Attach(GLenum attachment, GLenum target) const; +    /// @brief Attaches this texture view to the currently bound fb_target framebuffer +    /// @param attachment   Attachment to bind textures to +    /// @param fb_target    Framebuffer target to attach to (e.g. 
DRAW_FRAMEBUFFER) +    void Attach(GLenum attachment, GLenum fb_target) const;      GLuint GetTexture(Tegra::Texture::SwizzleSource x_source,                        Tegra::Texture::SwizzleSource y_source, diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index e7952924a..6214fcbc3 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -751,11 +751,9 @@ void RendererOpenGL::RenderScreenshot() {  }  bool RendererOpenGL::Init() { -    if (GLAD_GL_KHR_debug) { +    if (Settings::values.renderer_debug && GLAD_GL_KHR_debug) {          glEnable(GL_DEBUG_OUTPUT); -        if (Settings::values.renderer_debug) { -            glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS); -        } +        glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS);          glDebugMessageCallback(DebugHandler, nullptr);      } diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index 5f33d9e40..1fde38328 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -37,8 +37,8 @@ std::unique_ptr<VKStreamBuffer> CreateStreamBuffer(const VKDevice& device, VKSch  } // Anonymous namespace -CachedBufferBlock::CachedBufferBlock(const VKDevice& device, VKMemoryManager& memory_manager, -                                     VAddr cpu_addr, std::size_t size) +Buffer::Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VAddr cpu_addr, +               std::size_t size)      : VideoCommon::BufferBlock{cpu_addr, size} {      VkBufferCreateInfo ci;      ci.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; @@ -54,7 +54,7 @@ CachedBufferBlock::CachedBufferBlock(const VKDevice& device, VKMemoryManager& me      buffer.commit = memory_manager.Commit(buffer.handle, false);  } -CachedBufferBlock::~CachedBufferBlock() = default; +Buffer::~Buffer() = default;  
VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,                               const VKDevice& device, VKMemoryManager& memory_manager, @@ -67,12 +67,8 @@ VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::S  VKBufferCache::~VKBufferCache() = default; -Buffer VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { -    return std::make_shared<CachedBufferBlock>(device, memory_manager, cpu_addr, size); -} - -VkBuffer VKBufferCache::ToHandle(const Buffer& buffer) { -    return buffer->GetHandle(); +std::shared_ptr<Buffer> VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { +    return std::make_shared<Buffer>(device, memory_manager, cpu_addr, size);  }  VkBuffer VKBufferCache::GetEmptyBuffer(std::size_t size) { @@ -91,7 +87,7 @@ void VKBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, st      std::memcpy(staging.commit->Map(size), data, size);      scheduler.RequestOutsideRenderPassOperationContext(); -    scheduler.Record([staging = *staging.handle, buffer = buffer->GetHandle(), offset, +    scheduler.Record([staging = *staging.handle, buffer = buffer.Handle(), offset,                        size](vk::CommandBuffer cmdbuf) {          cmdbuf.CopyBuffer(staging, buffer, VkBufferCopy{0, offset, size}); @@ -114,7 +110,7 @@ void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset,                                        u8* data) {      const auto& staging = staging_pool.GetUnusedBuffer(size, true);      scheduler.RequestOutsideRenderPassOperationContext(); -    scheduler.Record([staging = *staging.handle, buffer = buffer->GetHandle(), offset, +    scheduler.Record([staging = *staging.handle, buffer = buffer.Handle(), offset,                        size](vk::CommandBuffer cmdbuf) {          VkBufferMemoryBarrier barrier;          barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; @@ -141,8 +137,8 @@ void 
VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset,  void VKBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,                                std::size_t dst_offset, std::size_t size) {      scheduler.RequestOutsideRenderPassOperationContext(); -    scheduler.Record([src_buffer = src->GetHandle(), dst_buffer = dst->GetHandle(), src_offset, -                      dst_offset, size](vk::CommandBuffer cmdbuf) { +    scheduler.Record([src_buffer = src.Handle(), dst_buffer = dst.Handle(), src_offset, dst_offset, +                      size](vk::CommandBuffer cmdbuf) {          cmdbuf.CopyBuffer(src_buffer, dst_buffer, VkBufferCopy{src_offset, dst_offset, size});          std::array<VkBufferMemoryBarrier, 2> barriers; diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h index a54583e7d..9ebbef835 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.h +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h @@ -8,7 +8,6 @@  #include "common/common_types.h"  #include "video_core/buffer_cache/buffer_cache.h" -#include "video_core/rasterizer_cache.h"  #include "video_core/renderer_vulkan/vk_memory_manager.h"  #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"  #include "video_core/renderer_vulkan/vk_stream_buffer.h" @@ -24,13 +23,13 @@ class VKDevice;  class VKMemoryManager;  class VKScheduler; -class CachedBufferBlock final : public VideoCommon::BufferBlock { +class Buffer final : public VideoCommon::BufferBlock {  public: -    explicit CachedBufferBlock(const VKDevice& device, VKMemoryManager& memory_manager, -                               VAddr cpu_addr, std::size_t size); -    ~CachedBufferBlock(); +    explicit Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VAddr cpu_addr, +                    std::size_t size); +    ~Buffer(); -    VkBuffer GetHandle() const { +    VkBuffer Handle() const {          return 
*buffer.handle;      } @@ -38,8 +37,6 @@ private:      VKBuffer buffer;  }; -using Buffer = std::shared_ptr<CachedBufferBlock>; -  class VKBufferCache final : public VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer> {  public:      explicit VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system, @@ -50,9 +47,7 @@ public:      VkBuffer GetEmptyBuffer(std::size_t size) override;  protected: -    VkBuffer ToHandle(const Buffer& buffer) override; - -    Buffer CreateBlock(VAddr cpu_addr, std::size_t size) override; +    std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override;      void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,                           const u8* data) override; diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp index 8e1b46277..281bf9ac3 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -53,8 +53,9 @@ vk::DescriptorSetLayout VKComputePipeline::CreateDescriptorSetLayout() const {      };      add_bindings(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, entries.const_buffers.size());      add_bindings(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, entries.global_buffers.size()); -    add_bindings(VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, entries.texel_buffers.size()); +    add_bindings(VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, entries.uniform_texels.size());      add_bindings(VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, entries.samplers.size()); +    add_bindings(VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, entries.storage_texels.size());      add_bindings(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, entries.images.size());      VkDescriptorSetLayoutCreateInfo ci; diff --git a/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp b/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp index 890fd52cf..9259b618d 100644 --- 
a/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp +++ b/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp @@ -42,6 +42,7 @@ vk::DescriptorPool* VKDescriptorPool::AllocateNewPool() {          {VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, num_sets * 60},          {VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, num_sets * 64},          {VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, num_sets * 64}, +        {VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, num_sets * 64},          {VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, num_sets * 40}};      VkDescriptorPoolCreateInfo ci; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 65a1c6245..ea66e621e 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -27,6 +27,7 @@  #include "video_core/renderer_vulkan/wrapper.h"  #include "video_core/shader/compiler_settings.h"  #include "video_core/shader/memory_util.h" +#include "video_core/shader_cache.h"  namespace Vulkan { @@ -45,6 +46,7 @@ constexpr VkDescriptorType UNIFORM_BUFFER = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;  constexpr VkDescriptorType STORAGE_BUFFER = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;  constexpr VkDescriptorType UNIFORM_TEXEL_BUFFER = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER;  constexpr VkDescriptorType COMBINED_IMAGE_SAMPLER = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; +constexpr VkDescriptorType STORAGE_TEXEL_BUFFER = VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER;  constexpr VkDescriptorType STORAGE_IMAGE = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;  constexpr VideoCommon::Shader::CompilerSettings compiler_settings{ @@ -104,8 +106,9 @@ u32 FillDescriptorLayout(const ShaderEntries& entries,      u32 binding = base_binding;      AddBindings<UNIFORM_BUFFER>(bindings, binding, flags, entries.const_buffers);      AddBindings<STORAGE_BUFFER>(bindings, binding, flags, entries.global_buffers); -    AddBindings<UNIFORM_TEXEL_BUFFER>(bindings, binding, flags, 
entries.texel_buffers); +    AddBindings<UNIFORM_TEXEL_BUFFER>(bindings, binding, flags, entries.uniform_texels);      AddBindings<COMBINED_IMAGE_SAMPLER>(bindings, binding, flags, entries.samplers); +    AddBindings<STORAGE_TEXEL_BUFFER>(bindings, binding, flags, entries.storage_texels);      AddBindings<STORAGE_IMAGE>(bindings, binding, flags, entries.images);      return binding;  } @@ -130,19 +133,18 @@ bool ComputePipelineCacheKey::operator==(const ComputePipelineCacheKey& rhs) con      return std::memcmp(&rhs, this, sizeof *this) == 0;  } -CachedShader::CachedShader(Core::System& system, Tegra::Engines::ShaderType stage, -                           GPUVAddr gpu_addr, VAddr cpu_addr, ProgramCode program_code, -                           u32 main_offset) -    : RasterizerCacheObject{cpu_addr}, gpu_addr{gpu_addr}, program_code{std::move(program_code)}, +Shader::Shader(Core::System& system, Tegra::Engines::ShaderType stage, GPUVAddr gpu_addr, +               VideoCommon::Shader::ProgramCode program_code, u32 main_offset) +    : gpu_addr{gpu_addr}, program_code{std::move(program_code)},        registry{stage, GetEngine(system, stage)}, shader_ir{this->program_code, main_offset,                                                             compiler_settings, registry},        entries{GenerateShaderEntries(shader_ir)} {} -CachedShader::~CachedShader() = default; +Shader::~Shader() = default; -Tegra::Engines::ConstBufferEngineInterface& CachedShader::GetEngine( -    Core::System& system, Tegra::Engines::ShaderType stage) { -    if (stage == Tegra::Engines::ShaderType::Compute) { +Tegra::Engines::ConstBufferEngineInterface& Shader::GetEngine(Core::System& system, +                                                              Tegra::Engines::ShaderType stage) { +    if (stage == ShaderType::Compute) {          return system.GPU().KeplerCompute();      } else {          return system.GPU().Maxwell3D(); @@ -154,16 +156,16 @@ VKPipelineCache::VKPipelineCache(Core::System& 
system, RasterizerVulkan& rasteri                                   VKDescriptorPool& descriptor_pool,                                   VKUpdateDescriptorQueue& update_descriptor_queue,                                   VKRenderPassCache& renderpass_cache) -    : RasterizerCache{rasterizer}, system{system}, device{device}, scheduler{scheduler}, -      descriptor_pool{descriptor_pool}, update_descriptor_queue{update_descriptor_queue}, -      renderpass_cache{renderpass_cache} {} +    : VideoCommon::ShaderCache<Shader>{rasterizer}, system{system}, device{device}, +      scheduler{scheduler}, descriptor_pool{descriptor_pool}, +      update_descriptor_queue{update_descriptor_queue}, renderpass_cache{renderpass_cache} {}  VKPipelineCache::~VKPipelineCache() = default; -std::array<Shader, Maxwell::MaxShaderProgram> VKPipelineCache::GetShaders() { +std::array<Shader*, Maxwell::MaxShaderProgram> VKPipelineCache::GetShaders() {      const auto& gpu = system.GPU().Maxwell3D(); -    std::array<Shader, Maxwell::MaxShaderProgram> shaders; +    std::array<Shader*, Maxwell::MaxShaderProgram> shaders{};      for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {          const auto program{static_cast<Maxwell::ShaderProgram>(index)}; @@ -176,24 +178,28 @@ std::array<Shader, Maxwell::MaxShaderProgram> VKPipelineCache::GetShaders() {          const GPUVAddr program_addr{GetShaderAddress(system, program)};          const std::optional cpu_addr = memory_manager.GpuToCpuAddress(program_addr);          ASSERT(cpu_addr); -        auto shader = cpu_addr ? TryGet(*cpu_addr) : null_shader; -        if (!shader) { + +        Shader* result = cpu_addr ? 
TryGet(*cpu_addr) : null_shader.get(); +        if (!result) {              const auto host_ptr{memory_manager.GetPointer(program_addr)};              // No shader found - create a new one              constexpr u32 stage_offset = STAGE_MAIN_OFFSET; -            const auto stage = static_cast<Tegra::Engines::ShaderType>(index == 0 ? 0 : index - 1); +            const auto stage = static_cast<ShaderType>(index == 0 ? 0 : index - 1);              ProgramCode code = GetShaderCode(memory_manager, program_addr, host_ptr, false); +            const std::size_t size_in_bytes = code.size() * sizeof(u64); + +            auto shader = std::make_unique<Shader>(system, stage, program_addr, std::move(code), +                                                   stage_offset); +            result = shader.get(); -            shader = std::make_shared<CachedShader>(system, stage, program_addr, *cpu_addr, -                                                    std::move(code), stage_offset);              if (cpu_addr) { -                Register(shader); +                Register(std::move(shader), *cpu_addr, size_in_bytes);              } else { -                null_shader = shader; +                null_shader = std::move(shader);              }          } -        shaders[index] = std::move(shader); +        shaders[index] = result;      }      return last_shaders = shaders;  } @@ -234,19 +240,22 @@ VKComputePipeline& VKPipelineCache::GetComputePipeline(const ComputePipelineCach      const auto cpu_addr = memory_manager.GpuToCpuAddress(program_addr);      ASSERT(cpu_addr); -    auto shader = cpu_addr ? TryGet(*cpu_addr) : null_kernel; +    Shader* shader = cpu_addr ? 
TryGet(*cpu_addr) : null_kernel.get();      if (!shader) {          // No shader found - create a new one          const auto host_ptr = memory_manager.GetPointer(program_addr);          ProgramCode code = GetShaderCode(memory_manager, program_addr, host_ptr, true); -        shader = std::make_shared<CachedShader>(system, Tegra::Engines::ShaderType::Compute, -                                                program_addr, *cpu_addr, std::move(code), -                                                KERNEL_MAIN_OFFSET); +        const std::size_t size_in_bytes = code.size() * sizeof(u64); + +        auto shader_info = std::make_unique<Shader>(system, ShaderType::Compute, program_addr, +                                                    std::move(code), KERNEL_MAIN_OFFSET); +        shader = shader_info.get(); +          if (cpu_addr) { -            Register(shader); +            Register(std::move(shader_info), *cpu_addr, size_in_bytes);          } else { -            null_kernel = shader; +            null_kernel = std::move(shader_info);          }      } @@ -262,7 +271,7 @@ VKComputePipeline& VKPipelineCache::GetComputePipeline(const ComputePipelineCach      return *entry;  } -void VKPipelineCache::Unregister(const Shader& shader) { +void VKPipelineCache::OnShaderRemoval(Shader* shader) {      bool finished = false;      const auto Finish = [&] {          // TODO(Rodrigo): Instead of finishing here, wait for the fences that use this pipeline and @@ -294,8 +303,6 @@ void VKPipelineCache::Unregister(const Shader& shader) {          Finish();          it = compute_cache.erase(it);      } - -    RasterizerCache::Unregister(shader);  }  std::pair<SPIRVProgram, std::vector<VkDescriptorSetLayoutBinding>> @@ -330,12 +337,11 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) {          }          const GPUVAddr gpu_addr = GetShaderAddress(system, program_enum); -        const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr); -        const 
auto shader = cpu_addr ? TryGet(*cpu_addr) : null_shader; -        ASSERT(shader); +        const std::optional<VAddr> cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr); +        Shader* const shader = cpu_addr ? TryGet(*cpu_addr) : null_shader.get();          const std::size_t stage = index == 0 ? 0 : index - 1; // Stage indices are 0 - 5 -        const auto program_type = GetShaderType(program_enum); +        const ShaderType program_type = GetShaderType(program_enum);          const auto& entries = shader->GetEntries();          program[stage] = {              Decompile(device, shader->GetIR(), program_type, shader->GetRegistry(), specialization), @@ -377,16 +383,17 @@ void AddEntry(std::vector<VkDescriptorUpdateTemplateEntry>& template_entries, u3          return;      } -    if constexpr (descriptor_type == UNIFORM_TEXEL_BUFFER) { -        // Nvidia has a bug where updating multiple uniform texels at once causes the driver to -        // crash. +    if constexpr (descriptor_type == UNIFORM_TEXEL_BUFFER || +                  descriptor_type == STORAGE_TEXEL_BUFFER) { +        // Nvidia has a bug where updating multiple texels at once causes the driver to crash. 
+        // Note: Fixed in driver Windows 443.24, Linux 440.66.15          for (u32 i = 0; i < count; ++i) {              VkDescriptorUpdateTemplateEntry& entry = template_entries.emplace_back();              entry.dstBinding = binding + i;              entry.dstArrayElement = 0;              entry.descriptorCount = 1;              entry.descriptorType = descriptor_type; -            entry.offset = offset + i * entry_size; +            entry.offset = static_cast<std::size_t>(offset + i * entry_size);              entry.stride = entry_size;          }      } else if (count > 0) { @@ -407,8 +414,9 @@ void FillDescriptorUpdateTemplateEntries(      std::vector<VkDescriptorUpdateTemplateEntryKHR>& template_entries) {      AddEntry<UNIFORM_BUFFER>(template_entries, offset, binding, entries.const_buffers);      AddEntry<STORAGE_BUFFER>(template_entries, offset, binding, entries.global_buffers); -    AddEntry<UNIFORM_TEXEL_BUFFER>(template_entries, offset, binding, entries.texel_buffers); +    AddEntry<UNIFORM_TEXEL_BUFFER>(template_entries, offset, binding, entries.uniform_texels);      AddEntry<COMBINED_IMAGE_SAMPLER>(template_entries, offset, binding, entries.samplers); +    AddEntry<STORAGE_TEXEL_BUFFER>(template_entries, offset, binding, entries.storage_texels);      AddEntry<STORAGE_IMAGE>(template_entries, offset, binding, entries.images);  } diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.h b/src/video_core/renderer_vulkan/vk_pipeline_cache.h index 0b5796fef..0a36e5112 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.h +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.h @@ -17,7 +17,6 @@  #include "common/common_types.h"  #include "video_core/engines/const_buffer_engine_interface.h"  #include "video_core/engines/maxwell_3d.h" -#include "video_core/rasterizer_cache.h"  #include "video_core/renderer_vulkan/fixed_pipeline_state.h"  #include "video_core/renderer_vulkan/vk_graphics_pipeline.h"  #include 
"video_core/renderer_vulkan/vk_renderpass_cache.h" @@ -26,6 +25,7 @@  #include "video_core/shader/memory_util.h"  #include "video_core/shader/registry.h"  #include "video_core/shader/shader_ir.h" +#include "video_core/shader_cache.h"  namespace Core {  class System; @@ -41,8 +41,6 @@ class VKFence;  class VKScheduler;  class VKUpdateDescriptorQueue; -class CachedShader; -using Shader = std::shared_ptr<CachedShader>;  using Maxwell = Tegra::Engines::Maxwell3D::Regs;  struct GraphicsPipelineCacheKey { @@ -102,21 +100,16 @@ struct hash<Vulkan::ComputePipelineCacheKey> {  namespace Vulkan { -class CachedShader final : public RasterizerCacheObject { +class Shader {  public: -    explicit CachedShader(Core::System& system, Tegra::Engines::ShaderType stage, GPUVAddr gpu_addr, -                          VAddr cpu_addr, VideoCommon::Shader::ProgramCode program_code, -                          u32 main_offset); -    ~CachedShader(); +    explicit Shader(Core::System& system, Tegra::Engines::ShaderType stage, GPUVAddr gpu_addr, +                    VideoCommon::Shader::ProgramCode program_code, u32 main_offset); +    ~Shader();      GPUVAddr GetGpuAddr() const {          return gpu_addr;      } -    std::size_t GetSizeInBytes() const override { -        return program_code.size() * sizeof(u64); -    } -      VideoCommon::Shader::ShaderIR& GetIR() {          return shader_ir;      } @@ -144,25 +137,23 @@ private:      ShaderEntries entries;  }; -class VKPipelineCache final : public RasterizerCache<Shader> { +class VKPipelineCache final : public VideoCommon::ShaderCache<Shader> {  public:      explicit VKPipelineCache(Core::System& system, RasterizerVulkan& rasterizer,                               const VKDevice& device, VKScheduler& scheduler,                               VKDescriptorPool& descriptor_pool,                               VKUpdateDescriptorQueue& update_descriptor_queue,                               VKRenderPassCache& renderpass_cache); -    
~VKPipelineCache(); +    ~VKPipelineCache() override; -    std::array<Shader, Maxwell::MaxShaderProgram> GetShaders(); +    std::array<Shader*, Maxwell::MaxShaderProgram> GetShaders();      VKGraphicsPipeline& GetGraphicsPipeline(const GraphicsPipelineCacheKey& key);      VKComputePipeline& GetComputePipeline(const ComputePipelineCacheKey& key);  protected: -    void Unregister(const Shader& shader) override; - -    void FlushObjectInner(const Shader& object) override {} +    void OnShaderRemoval(Shader* shader) final;  private:      std::pair<SPIRVProgram, std::vector<VkDescriptorSetLayoutBinding>> DecompileShaders( @@ -175,10 +166,10 @@ private:      VKUpdateDescriptorQueue& update_descriptor_queue;      VKRenderPassCache& renderpass_cache; -    Shader null_shader{}; -    Shader null_kernel{}; +    std::unique_ptr<Shader> null_shader; +    std::unique_ptr<Shader> null_kernel; -    std::array<Shader, Maxwell::MaxShaderProgram> last_shaders; +    std::array<Shader*, Maxwell::MaxShaderProgram> last_shaders{};      GraphicsPipelineCacheKey last_graphics_key;      VKGraphicsPipeline* last_graphics_pipeline = nullptr; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index a3d992ed3..184b2238a 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -38,6 +38,7 @@  #include "video_core/renderer_vulkan/vk_texture_cache.h"  #include "video_core/renderer_vulkan/vk_update_descriptor.h"  #include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/shader_cache.h"  namespace Vulkan { @@ -98,7 +99,7 @@ VkRect2D GetScissorState(const Maxwell& regs, std::size_t index) {  }  std::array<GPUVAddr, Maxwell::MaxShaderProgram> GetShaderAddresses( -    const std::array<Shader, Maxwell::MaxShaderProgram>& shaders) { +    const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders) {      std::array<GPUVAddr, Maxwell::MaxShaderProgram> addresses;      
for (std::size_t i = 0; i < std::size(addresses); ++i) {          addresses[i] = shaders[i] ? shaders[i]->GetGpuAddr() : 0; @@ -117,6 +118,17 @@ template <typename Engine, typename Entry>  Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry,                                                 std::size_t stage, std::size_t index = 0) {      const auto stage_type = static_cast<Tegra::Engines::ShaderType>(stage); +    if constexpr (std::is_same_v<Entry, SamplerEntry>) { +        if (entry.is_separated) { +            const u32 buffer_1 = entry.buffer; +            const u32 buffer_2 = entry.secondary_buffer; +            const u32 offset_1 = entry.offset; +            const u32 offset_2 = entry.secondary_offset; +            const u32 handle_1 = engine.AccessConstBuffer32(stage_type, buffer_1, offset_1); +            const u32 handle_2 = engine.AccessConstBuffer32(stage_type, buffer_2, offset_2); +            return engine.GetTextureInfo(handle_1 | handle_2); +        } +    }      if (entry.is_bindless) {          const auto tex_handle = engine.AccessConstBuffer32(stage_type, entry.buffer, entry.offset);          return engine.GetTextureInfo(tex_handle); @@ -468,8 +480,9 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) {      const auto& entries = pipeline.GetEntries();      SetupComputeConstBuffers(entries);      SetupComputeGlobalBuffers(entries); -    SetupComputeTexelBuffers(entries); +    SetupComputeUniformTexels(entries);      SetupComputeTextures(entries); +    SetupComputeStorageTexels(entries);      SetupComputeImages(entries);      buffer_cache.Unmap(); @@ -715,7 +728,7 @@ std::tuple<VkFramebuffer, VkExtent2D> RasterizerVulkan::ConfigureFramebuffers(          if (!view) {              return false;          } -        key.views.push_back(view->GetHandle()); +        key.views.push_back(view->GetAttachment());          key.width = std::min(key.width, view->GetWidth());          key.height = std::min(key.height, 
view->GetHeight());          key.layers = std::min(key.layers, view->GetNumLayers()); @@ -775,20 +788,21 @@ RasterizerVulkan::DrawParameters RasterizerVulkan::SetupGeometry(FixedPipelineSt  }  void RasterizerVulkan::SetupShaderDescriptors( -    const std::array<Shader, Maxwell::MaxShaderProgram>& shaders) { +    const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders) {      texture_cache.GuardSamplers(true);      for (std::size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) {          // Skip VertexA stage -        const auto& shader = shaders[stage + 1]; +        Shader* const shader = shaders[stage + 1];          if (!shader) {              continue;          }          const auto& entries = shader->GetEntries();          SetupGraphicsConstBuffers(entries, stage);          SetupGraphicsGlobalBuffers(entries, stage); -        SetupGraphicsTexelBuffers(entries, stage); +        SetupGraphicsUniformTexels(entries, stage);          SetupGraphicsTextures(entries, stage); +        SetupGraphicsStorageTexels(entries, stage);          SetupGraphicsImages(entries, stage);      }      texture_cache.GuardSamplers(false); @@ -838,6 +852,10 @@ void RasterizerVulkan::BeginTransformFeedback() {      if (regs.tfb_enabled == 0) {          return;      } +    if (!device.IsExtTransformFeedbackSupported()) { +        LOG_ERROR(Render_Vulkan, "Transform feedbacks used but not supported"); +        return; +    }      UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) ||                       regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) || @@ -866,6 +884,9 @@ void RasterizerVulkan::EndTransformFeedback() {      if (regs.tfb_enabled == 0) {          return;      } +    if (!device.IsExtTransformFeedbackSupported()) { +        return; +    }      scheduler.Record(          [](vk::CommandBuffer cmdbuf) { cmdbuf.EndTransformFeedbackEXT(0, 0, nullptr, nullptr); }); @@ -976,12 +997,12 @@ void 
RasterizerVulkan::SetupGraphicsGlobalBuffers(const ShaderEntries& entries,      }  } -void RasterizerVulkan::SetupGraphicsTexelBuffers(const ShaderEntries& entries, std::size_t stage) { +void RasterizerVulkan::SetupGraphicsUniformTexels(const ShaderEntries& entries, std::size_t stage) {      MICROPROFILE_SCOPE(Vulkan_Textures);      const auto& gpu = system.GPU().Maxwell3D(); -    for (const auto& entry : entries.texel_buffers) { +    for (const auto& entry : entries.uniform_texels) {          const auto image = GetTextureInfo(gpu, entry, stage).tic; -        SetupTexelBuffer(image, entry); +        SetupUniformTexels(image, entry);      }  } @@ -996,6 +1017,15 @@ void RasterizerVulkan::SetupGraphicsTextures(const ShaderEntries& entries, std::      }  } +void RasterizerVulkan::SetupGraphicsStorageTexels(const ShaderEntries& entries, std::size_t stage) { +    MICROPROFILE_SCOPE(Vulkan_Textures); +    const auto& gpu = system.GPU().Maxwell3D(); +    for (const auto& entry : entries.storage_texels) { +        const auto image = GetTextureInfo(gpu, entry, stage).tic; +        SetupStorageTexel(image, entry); +    } +} +  void RasterizerVulkan::SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage) {      MICROPROFILE_SCOPE(Vulkan_Images);      const auto& gpu = system.GPU().Maxwell3D(); @@ -1028,12 +1058,12 @@ void RasterizerVulkan::SetupComputeGlobalBuffers(const ShaderEntries& entries) {      }  } -void RasterizerVulkan::SetupComputeTexelBuffers(const ShaderEntries& entries) { +void RasterizerVulkan::SetupComputeUniformTexels(const ShaderEntries& entries) {      MICROPROFILE_SCOPE(Vulkan_Textures);      const auto& gpu = system.GPU().KeplerCompute(); -    for (const auto& entry : entries.texel_buffers) { +    for (const auto& entry : entries.uniform_texels) {          const auto image = GetTextureInfo(gpu, entry, ComputeShaderIndex).tic; -        SetupTexelBuffer(image, entry); +        SetupUniformTexels(image, entry);      }  } @@ -1048,6 +1078,15 @@ 
void RasterizerVulkan::SetupComputeTextures(const ShaderEntries& entries) {      }  } +void RasterizerVulkan::SetupComputeStorageTexels(const ShaderEntries& entries) { +    MICROPROFILE_SCOPE(Vulkan_Textures); +    const auto& gpu = system.GPU().KeplerCompute(); +    for (const auto& entry : entries.storage_texels) { +        const auto image = GetTextureInfo(gpu, entry, ComputeShaderIndex).tic; +        SetupStorageTexel(image, entry); +    } +} +  void RasterizerVulkan::SetupComputeImages(const ShaderEntries& entries) {      MICROPROFILE_SCOPE(Vulkan_Images);      const auto& gpu = system.GPU().KeplerCompute(); @@ -1097,8 +1136,8 @@ void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAdd      update_descriptor_queue.AddBuffer(buffer, offset, size);  } -void RasterizerVulkan::SetupTexelBuffer(const Tegra::Texture::TICEntry& tic, -                                        const TexelBufferEntry& entry) { +void RasterizerVulkan::SetupUniformTexels(const Tegra::Texture::TICEntry& tic, +                                          const UniformTexelEntry& entry) {      const auto view = texture_cache.GetTextureSurface(tic, entry);      ASSERT(view->IsBufferView()); @@ -1110,8 +1149,8 @@ void RasterizerVulkan::SetupTexture(const Tegra::Texture::FullTextureInfo& textu      auto view = texture_cache.GetTextureSurface(texture.tic, entry);      ASSERT(!view->IsBufferView()); -    const auto image_view = view->GetHandle(texture.tic.x_source, texture.tic.y_source, -                                            texture.tic.z_source, texture.tic.w_source); +    const VkImageView image_view = view->GetImageView(texture.tic.x_source, texture.tic.y_source, +                                                      texture.tic.z_source, texture.tic.w_source);      const auto sampler = sampler_cache.GetSampler(texture.tsc);      update_descriptor_queue.AddSampledImage(sampler, image_view); @@ -1120,6 +1159,14 @@ void RasterizerVulkan::SetupTexture(const 
Tegra::Texture::FullTextureInfo& textu      sampled_views.push_back(ImageView{std::move(view), image_layout});  } +void RasterizerVulkan::SetupStorageTexel(const Tegra::Texture::TICEntry& tic, +                                         const StorageTexelEntry& entry) { +    const auto view = texture_cache.GetImageSurface(tic, entry); +    ASSERT(view->IsBufferView()); + +    update_descriptor_queue.AddTexelBuffer(view->GetBufferView()); +} +  void RasterizerVulkan::SetupImage(const Tegra::Texture::TICEntry& tic, const ImageEntry& entry) {      auto view = texture_cache.GetImageSurface(tic, entry); @@ -1129,7 +1176,8 @@ void RasterizerVulkan::SetupImage(const Tegra::Texture::TICEntry& tic, const Ima      UNIMPLEMENTED_IF(tic.IsBuffer()); -    const auto image_view = view->GetHandle(tic.x_source, tic.y_source, tic.z_source, tic.w_source); +    const VkImageView image_view = +        view->GetImageView(tic.x_source, tic.y_source, tic.z_source, tic.w_source);      update_descriptor_queue.AddImage(image_view);      const auto image_layout = update_descriptor_queue.GetLastImageLayout(); diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 0ed0e48c6..c8c187606 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -168,7 +168,7 @@ private:                                   bool is_indexed, bool is_instanced);      /// Setup descriptors in the graphics pipeline. -    void SetupShaderDescriptors(const std::array<Shader, Maxwell::MaxShaderProgram>& shaders); +    void SetupShaderDescriptors(const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders);      void SetupImageTransitions(Texceptions texceptions,                                 const std::array<View, Maxwell::NumRenderTargets>& color_attachments, @@ -193,12 +193,15 @@ private:      /// Setup global buffers in the graphics pipeline.      
void SetupGraphicsGlobalBuffers(const ShaderEntries& entries, std::size_t stage); -    /// Setup texel buffers in the graphics pipeline. -    void SetupGraphicsTexelBuffers(const ShaderEntries& entries, std::size_t stage); +    /// Setup uniform texels in the graphics pipeline. +    void SetupGraphicsUniformTexels(const ShaderEntries& entries, std::size_t stage);      /// Setup textures in the graphics pipeline.      void SetupGraphicsTextures(const ShaderEntries& entries, std::size_t stage); +    /// Setup storage texels in the graphics pipeline. +    void SetupGraphicsStorageTexels(const ShaderEntries& entries, std::size_t stage); +      /// Setup images in the graphics pipeline.      void SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage); @@ -209,11 +212,14 @@ private:      void SetupComputeGlobalBuffers(const ShaderEntries& entries);      /// Setup texel buffers in the compute pipeline. -    void SetupComputeTexelBuffers(const ShaderEntries& entries); +    void SetupComputeUniformTexels(const ShaderEntries& entries);      /// Setup textures in the compute pipeline.      void SetupComputeTextures(const ShaderEntries& entries); +    /// Setup storage texels in the compute pipeline. +    void SetupComputeStorageTexels(const ShaderEntries& entries); +      /// Setup images in the compute pipeline.      
void SetupComputeImages(const ShaderEntries& entries); @@ -222,10 +228,12 @@ private:      void SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address); -    void SetupTexelBuffer(const Tegra::Texture::TICEntry& image, const TexelBufferEntry& entry); +    void SetupUniformTexels(const Tegra::Texture::TICEntry& image, const UniformTexelEntry& entry);      void SetupTexture(const Tegra::Texture::FullTextureInfo& texture, const SamplerEntry& entry); +    void SetupStorageTexel(const Tegra::Texture::TICEntry& tic, const StorageTexelEntry& entry); +      void SetupImage(const Tegra::Texture::TICEntry& tic, const ImageEntry& entry);      void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs); diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index a13e8baa7..97429cc59 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ -400,8 +400,9 @@ private:          u32 binding = specialization.base_binding;          binding = DeclareConstantBuffers(binding);          binding = DeclareGlobalBuffers(binding); -        binding = DeclareTexelBuffers(binding); +        binding = DeclareUniformTexels(binding);          binding = DeclareSamplers(binding); +        binding = DeclareStorageTexels(binding);          binding = DeclareImages(binding);          const Id main = OpFunction(t_void, {}, TypeFunction(t_void)); @@ -889,7 +890,7 @@ private:          return binding;      } -    u32 DeclareTexelBuffers(u32 binding) { +    u32 DeclareUniformTexels(u32 binding) {          for (const auto& sampler : ir.GetSamplers()) {              if (!sampler.is_buffer) {                  continue; @@ -910,7 +911,7 @@ private:              Decorate(id, spv::Decoration::Binding, binding++);              Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET); -            texel_buffers.emplace(sampler.index, 
TexelBuffer{image_type, id}); +            uniform_texels.emplace(sampler.index, TexelBuffer{image_type, id});          }          return binding;      } @@ -945,31 +946,48 @@ private:          return binding;      } -    u32 DeclareImages(u32 binding) { +    u32 DeclareStorageTexels(u32 binding) {          for (const auto& image : ir.GetImages()) { -            const auto [dim, arrayed] = GetImageDim(image); -            constexpr int depth = 0; -            constexpr bool ms = false; -            constexpr int sampled = 2; // This won't be accessed with a sampler -            constexpr auto format = spv::ImageFormat::Unknown; -            const Id image_type = TypeImage(t_uint, dim, depth, arrayed, ms, sampled, format, {}); -            const Id pointer_type = TypePointer(spv::StorageClass::UniformConstant, image_type); -            const Id id = OpVariable(pointer_type, spv::StorageClass::UniformConstant); -            AddGlobalVariable(Name(id, fmt::format("image_{}", image.index))); - -            Decorate(id, spv::Decoration::Binding, binding++); -            Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET); -            if (image.is_read && !image.is_written) { -                Decorate(id, spv::Decoration::NonWritable); -            } else if (image.is_written && !image.is_read) { -                Decorate(id, spv::Decoration::NonReadable); +            if (image.type != Tegra::Shader::ImageType::TextureBuffer) { +                continue;              } +            DeclareImage(image, binding); +        } +        return binding; +    } -            images.emplace(image.index, StorageImage{image_type, id}); +    u32 DeclareImages(u32 binding) { +        for (const auto& image : ir.GetImages()) { +            if (image.type == Tegra::Shader::ImageType::TextureBuffer) { +                continue; +            } +            DeclareImage(image, binding);          }          return binding;      } +    void DeclareImage(const Image& image, u32& 
binding) { +        const auto [dim, arrayed] = GetImageDim(image); +        constexpr int depth = 0; +        constexpr bool ms = false; +        constexpr int sampled = 2; // This won't be accessed with a sampler +        const auto format = image.is_atomic ? spv::ImageFormat::R32ui : spv::ImageFormat::Unknown; +        const Id image_type = TypeImage(t_uint, dim, depth, arrayed, ms, sampled, format, {}); +        const Id pointer_type = TypePointer(spv::StorageClass::UniformConstant, image_type); +        const Id id = OpVariable(pointer_type, spv::StorageClass::UniformConstant); +        AddGlobalVariable(Name(id, fmt::format("image_{}", image.index))); + +        Decorate(id, spv::Decoration::Binding, binding++); +        Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET); +        if (image.is_read && !image.is_written) { +            Decorate(id, spv::Decoration::NonWritable); +        } else if (image.is_written && !image.is_read) { +            Decorate(id, spv::Decoration::NonReadable); +        } + +        images.emplace(image.index, StorageImage{image_type, id}); +    } +      bool IsRenderTargetEnabled(u32 rt) const {          for (u32 component = 0; component < 4; ++component) {              if (header.ps.IsColorComponentOutputEnabled(rt, component)) { @@ -1256,7 +1274,7 @@ private:                  } else {                      UNREACHABLE_MSG("Unmanaged offset node type");                  } -                pointer = OpAccessChain(t_cbuf_float, buffer_id, Constant(t_uint, 0), buffer_index, +                pointer = OpAccessChain(t_cbuf_float, buffer_id, v_uint_zero, buffer_index,                                          buffer_element);              }              return {OpLoad(t_float, pointer), Type::Float}; @@ -1611,7 +1629,7 @@ private:          const Id result = OpIAddCarry(TypeStruct({t_uint, t_uint}), op_a, op_b);          const Id carry = OpCompositeExtract(t_uint, result, 1); -        return {OpINotEqual(t_bool, carry, 
Constant(t_uint, 0)), Type::Bool}; +        return {OpINotEqual(t_bool, carry, v_uint_zero), Type::Bool};      }      Expression LogicalAssign(Operation operation) { @@ -1674,7 +1692,7 @@ private:          const auto& meta = std::get<MetaTexture>(operation.GetMeta());          const u32 index = meta.sampler.index;          if (meta.sampler.is_buffer) { -            const auto& entry = texel_buffers.at(index); +            const auto& entry = uniform_texels.at(index);              return OpLoad(entry.image_type, entry.image);          } else {              const auto& entry = sampled_images.at(index); @@ -1951,39 +1969,20 @@ private:          return {};      } -    Expression AtomicImageAdd(Operation operation) { -        UNIMPLEMENTED(); -        return {}; -    } - -    Expression AtomicImageMin(Operation operation) { -        UNIMPLEMENTED(); -        return {}; -    } - -    Expression AtomicImageMax(Operation operation) { -        UNIMPLEMENTED(); -        return {}; -    } - -    Expression AtomicImageAnd(Operation operation) { -        UNIMPLEMENTED(); -        return {}; -    } - -    Expression AtomicImageOr(Operation operation) { -        UNIMPLEMENTED(); -        return {}; -    } +    template <Id (Module::*func)(Id, Id, Id, Id, Id)> +    Expression AtomicImage(Operation operation) { +        const auto& meta{std::get<MetaImage>(operation.GetMeta())}; +        ASSERT(meta.values.size() == 1); -    Expression AtomicImageXor(Operation operation) { -        UNIMPLEMENTED(); -        return {}; -    } +        const Id coordinate = GetCoordinates(operation, Type::Int); +        const Id image = images.at(meta.image.index).image; +        const Id sample = v_uint_zero; +        const Id pointer = OpImageTexelPointer(t_image_uint, image, coordinate, sample); -    Expression AtomicImageExchange(Operation operation) { -        UNIMPLEMENTED(); -        return {}; +        const Id scope = Constant(t_uint, static_cast<u32>(spv::Scope::Device)); +        const Id 
semantics = v_uint_zero; +        const Id value = AsUint(Visit(meta.values[0])); +        return {(this->*func)(t_uint, pointer, scope, semantics, value), Type::Uint};      }      template <Id (Module::*func)(Id, Id, Id, Id, Id)> @@ -1998,7 +1997,7 @@ private:              return {v_float_zero, Type::Float};          }          const Id scope = Constant(t_uint, static_cast<u32>(spv::Scope::Device)); -        const Id semantics = Constant(t_uint, 0); +        const Id semantics = v_uint_zero;          const Id value = AsUint(Visit(operation[1]));          return {(this->*func)(t_uint, pointer, scope, semantics, value), Type::Uint}; @@ -2622,11 +2621,11 @@ private:          &SPIRVDecompiler::ImageLoad,          &SPIRVDecompiler::ImageStore, -        &SPIRVDecompiler::AtomicImageAdd, -        &SPIRVDecompiler::AtomicImageAnd, -        &SPIRVDecompiler::AtomicImageOr, -        &SPIRVDecompiler::AtomicImageXor, -        &SPIRVDecompiler::AtomicImageExchange, +        &SPIRVDecompiler::AtomicImage<&Module::OpAtomicIAdd>, +        &SPIRVDecompiler::AtomicImage<&Module::OpAtomicAnd>, +        &SPIRVDecompiler::AtomicImage<&Module::OpAtomicOr>, +        &SPIRVDecompiler::AtomicImage<&Module::OpAtomicXor>, +        &SPIRVDecompiler::AtomicImage<&Module::OpAtomicExchange>,          &SPIRVDecompiler::Atomic<&Module::OpAtomicExchange>,          &SPIRVDecompiler::Atomic<&Module::OpAtomicIAdd>, @@ -2768,8 +2767,11 @@ private:          Decorate(TypeStruct(t_gmem_array), spv::Decoration::Block), 0, spv::Decoration::Offset, 0);      const Id t_gmem_ssbo = TypePointer(spv::StorageClass::StorageBuffer, t_gmem_struct); +    const Id t_image_uint = TypePointer(spv::StorageClass::Image, t_uint); +      const Id v_float_zero = Constant(t_float, 0.0f);      const Id v_float_one = Constant(t_float, 1.0f); +    const Id v_uint_zero = Constant(t_uint, 0);      // Nvidia uses these defaults for varyings (e.g. 
position and generic attributes)      const Id v_varying_default = @@ -2794,15 +2796,16 @@ private:      std::unordered_map<u8, GenericVaryingDescription> output_attributes;      std::map<u32, Id> constant_buffers;      std::map<GlobalMemoryBase, Id> global_buffers; -    std::map<u32, TexelBuffer> texel_buffers; +    std::map<u32, TexelBuffer> uniform_texels;      std::map<u32, SampledImage> sampled_images; +    std::map<u32, TexelBuffer> storage_texels;      std::map<u32, StorageImage> images; +    std::array<Id, Maxwell::NumRenderTargets> frag_colors{};      Id instance_index{};      Id vertex_index{};      Id base_instance{};      Id base_vertex{}; -    std::array<Id, Maxwell::NumRenderTargets> frag_colors{};      Id frag_depth{};      Id frag_coord{};      Id front_facing{}; @@ -3058,13 +3061,17 @@ ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir) {      }      for (const auto& sampler : ir.GetSamplers()) {          if (sampler.is_buffer) { -            entries.texel_buffers.emplace_back(sampler); +            entries.uniform_texels.emplace_back(sampler);          } else {              entries.samplers.emplace_back(sampler);          }      }      for (const auto& image : ir.GetImages()) { -        entries.images.emplace_back(image); +        if (image.type == Tegra::Shader::ImageType::TextureBuffer) { +            entries.storage_texels.emplace_back(image); +        } else { +            entries.images.emplace_back(image); +        }      }      for (const auto& attribute : ir.GetInputAttributes()) {          if (IsGenericAttribute(attribute)) { diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.h b/src/video_core/renderer_vulkan/vk_shader_decompiler.h index b7af26388..2b0e90396 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.h +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.h @@ -21,8 +21,9 @@ class VKDevice;  namespace Vulkan {  using Maxwell = Tegra::Engines::Maxwell3D::Regs; -using 
TexelBufferEntry = VideoCommon::Shader::Sampler; +using UniformTexelEntry = VideoCommon::Shader::Sampler;  using SamplerEntry = VideoCommon::Shader::Sampler; +using StorageTexelEntry = VideoCommon::Shader::Image;  using ImageEntry = VideoCommon::Shader::Image;  constexpr u32 DESCRIPTOR_SET = 0; @@ -66,13 +67,15 @@ private:  struct ShaderEntries {      u32 NumBindings() const {          return static_cast<u32>(const_buffers.size() + global_buffers.size() + -                                texel_buffers.size() + samplers.size() + images.size()); +                                uniform_texels.size() + samplers.size() + storage_texels.size() + +                                images.size());      }      std::vector<ConstBufferEntry> const_buffers;      std::vector<GlobalBufferEntry> global_buffers; -    std::vector<TexelBufferEntry> texel_buffers; +    std::vector<UniformTexelEntry> uniform_texels;      std::vector<SamplerEntry> samplers; +    std::vector<StorageTexelEntry> storage_texels;      std::vector<ImageEntry> images;      std::set<u32> attributes;      std::array<bool, Maxwell::NumClipDistances> clip_distances{}; diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.h b/src/video_core/renderer_vulkan/vk_stream_buffer.h index dfddf7ad6..c765c60a0 100644 --- a/src/video_core/renderer_vulkan/vk_stream_buffer.h +++ b/src/video_core/renderer_vulkan/vk_stream_buffer.h @@ -35,7 +35,7 @@ public:      /// Ensures that "size" bytes of memory are available to the GPU, potentially recording a copy.      
void Unmap(u64 size); -    VkBuffer GetHandle() const { +    VkBuffer Handle() const {          return *buffer;      } diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 2f1d5021d..430031665 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -100,8 +100,8 @@ vk::Buffer CreateBuffer(const VKDevice& device, const SurfaceParams& params,      ci.pNext = nullptr;      ci.flags = 0;      ci.size = static_cast<VkDeviceSize>(host_memory_size); -    ci.usage = VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | -               VK_BUFFER_USAGE_TRANSFER_DST_BIT; +    ci.usage = VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT | +               VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT;      ci.sharingMode = VK_SHARING_MODE_EXCLUSIVE;      ci.queueFamilyIndexCount = 0;      ci.pQueueFamilyIndices = nullptr; @@ -167,6 +167,7 @@ VkImageCreateInfo GenerateImageCreateInfo(const VKDevice& device, const SurfaceP          ci.extent = {params.width, params.height, 1};          break;      case SurfaceTarget::Texture3D: +        ci.flags |= VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT;          ci.extent = {params.width, params.height, params.depth};          break;      case SurfaceTarget::TextureBuffer: @@ -176,6 +177,12 @@ VkImageCreateInfo GenerateImageCreateInfo(const VKDevice& device, const SurfaceP      return ci;  } +u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source, Tegra::Texture::SwizzleSource y_source, +                  Tegra::Texture::SwizzleSource z_source, Tegra::Texture::SwizzleSource w_source) { +    return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) | +           (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source); +} +  } // Anonymous namespace  CachedSurface::CachedSurface(Core::System& 
system, const VKDevice& device, @@ -203,9 +210,11 @@ CachedSurface::CachedSurface(Core::System& system, const VKDevice& device,      }      // TODO(Rodrigo): Move this to a virtual function. -    main_view = CreateViewInner( -        ViewParams(params.target, 0, static_cast<u32>(params.GetNumLayers()), 0, params.num_levels), -        true); +    u32 num_layers = 1; +    if (params.is_layered || params.target == SurfaceTarget::Texture3D) { +        num_layers = params.depth; +    } +    main_view = CreateView(ViewParams(params.target, 0, num_layers, 0, params.num_levels));  }  CachedSurface::~CachedSurface() = default; @@ -253,12 +262,8 @@ void CachedSurface::DecorateSurfaceName() {  }  View CachedSurface::CreateView(const ViewParams& params) { -    return CreateViewInner(params, false); -} - -View CachedSurface::CreateViewInner(const ViewParams& params, bool is_proxy) {      // TODO(Rodrigo): Add name decorations -    return views[params] = std::make_shared<CachedSurfaceView>(device, *this, params, is_proxy); +    return views[params] = std::make_shared<CachedSurfaceView>(device, *this, params);  }  void CachedSurface::UploadBuffer(const std::vector<u8>& staging_buffer) { @@ -342,18 +347,27 @@ VkImageSubresourceRange CachedSurface::GetImageSubresourceRange() const {  }  CachedSurfaceView::CachedSurfaceView(const VKDevice& device, CachedSurface& surface, -                                     const ViewParams& params, bool is_proxy) +                                     const ViewParams& params)      : VideoCommon::ViewBase{params}, params{surface.GetSurfaceParams()},        image{surface.GetImageHandle()}, buffer_view{surface.GetBufferViewHandle()},        aspect_mask{surface.GetAspectMask()}, device{device}, surface{surface}, -      base_layer{params.base_layer}, num_layers{params.num_layers}, base_level{params.base_level}, -      num_levels{params.num_levels}, image_view_type{image ? 
GetImageViewType(params.target) -                                                           : VK_IMAGE_VIEW_TYPE_1D} {} +      base_level{params.base_level}, num_levels{params.num_levels}, +      image_view_type{image ? GetImageViewType(params.target) : VK_IMAGE_VIEW_TYPE_1D} { +    if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) { +        base_layer = 0; +        num_layers = 1; +        base_slice = params.base_layer; +        num_slices = params.num_layers; +    } else { +        base_layer = params.base_layer; +        num_layers = params.num_layers; +    } +}  CachedSurfaceView::~CachedSurfaceView() = default; -VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y_source, -                                         SwizzleSource z_source, SwizzleSource w_source) { +VkImageView CachedSurfaceView::GetImageView(SwizzleSource x_source, SwizzleSource y_source, +                                            SwizzleSource z_source, SwizzleSource w_source) {      const u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source);      if (last_image_view && last_swizzle == new_swizzle) {          return last_image_view; @@ -399,6 +413,11 @@ VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y              });      } +    if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) { +        ASSERT(base_slice == 0); +        ASSERT(num_slices == params.depth); +    } +      VkImageViewCreateInfo ci;      ci.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO;      ci.pNext = nullptr; @@ -417,6 +436,35 @@ VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y      return last_image_view = *image_view;  } +VkImageView CachedSurfaceView::GetAttachment() { +    if (render_target) { +        return *render_target; +    } + +    VkImageViewCreateInfo ci; +    ci.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; +    ci.pNext = nullptr; +    ci.flags = 0; +    ci.image = surface.GetImageHandle(); +    
ci.format = surface.GetImage().GetFormat(); +    ci.components = {VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY, +                     VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY}; +    ci.subresourceRange.aspectMask = aspect_mask; +    ci.subresourceRange.baseMipLevel = base_level; +    ci.subresourceRange.levelCount = num_levels; +    if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) { +        ci.viewType = num_slices > 1 ? VK_IMAGE_VIEW_TYPE_2D_ARRAY : VK_IMAGE_VIEW_TYPE_2D; +        ci.subresourceRange.baseArrayLayer = base_slice; +        ci.subresourceRange.layerCount = num_slices; +    } else { +        ci.viewType = image_view_type; +        ci.subresourceRange.baseArrayLayer = base_layer; +        ci.subresourceRange.layerCount = num_layers; +    } +    render_target = device.GetLogical().CreateImageView(ci); +    return *render_target; +} +  VKTextureCache::VKTextureCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer,                                 const VKDevice& device, VKResourceManager& resource_manager,                                 VKMemoryManager& memory_manager, VKScheduler& scheduler, diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index f211ccb1e..807e26c8a 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -91,7 +91,6 @@ protected:      void DecorateSurfaceName();      View CreateView(const ViewParams& params) override; -    View CreateViewInner(const ViewParams& params, bool is_proxy);  private:      void UploadBuffer(const std::vector<u8>& staging_buffer); @@ -120,23 +119,20 @@ private:  class CachedSurfaceView final : public VideoCommon::ViewBase {  public:      explicit CachedSurfaceView(const VKDevice& device, CachedSurface& surface, -                               const ViewParams& params, bool is_proxy); +                               const 
ViewParams& params);      ~CachedSurfaceView(); -    VkImageView GetHandle(Tegra::Texture::SwizzleSource x_source, -                          Tegra::Texture::SwizzleSource y_source, -                          Tegra::Texture::SwizzleSource z_source, -                          Tegra::Texture::SwizzleSource w_source); +    VkImageView GetImageView(Tegra::Texture::SwizzleSource x_source, +                             Tegra::Texture::SwizzleSource y_source, +                             Tegra::Texture::SwizzleSource z_source, +                             Tegra::Texture::SwizzleSource w_source); + +    VkImageView GetAttachment();      bool IsSameSurface(const CachedSurfaceView& rhs) const {          return &surface == &rhs.surface;      } -    VkImageView GetHandle() { -        return GetHandle(Tegra::Texture::SwizzleSource::R, Tegra::Texture::SwizzleSource::G, -                         Tegra::Texture::SwizzleSource::B, Tegra::Texture::SwizzleSource::A); -    } -      u32 GetWidth() const {          return params.GetMipWidth(base_level);      } @@ -180,14 +176,6 @@ public:      }  private: -    static u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source, -                             Tegra::Texture::SwizzleSource y_source, -                             Tegra::Texture::SwizzleSource z_source, -                             Tegra::Texture::SwizzleSource w_source) { -        return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) | -               (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source); -    } -      // Store a copy of these values to avoid double dereference when reading them      const SurfaceParams params;      const VkImage image; @@ -196,15 +184,18 @@ private:      const VKDevice& device;      CachedSurface& surface; -    const u32 base_layer; -    const u32 num_layers;      const u32 base_level;      const u32 num_levels;      const VkImageViewType image_view_type; +    u32 base_layer = 0; +    u32 num_layers = 0; +  
  u32 base_slice = 0; +    u32 num_slices = 0;      VkImageView last_image_view = nullptr;      u32 last_swizzle = 0; +    vk::ImageView render_target;      std::unordered_map<u32, vk::ImageView> view_cache;  }; diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp index 8f0bb996e..29ebf65ba 100644 --- a/src/video_core/shader/decode/texture.cpp +++ b/src/video_core/shader/decode/texture.cpp @@ -357,13 +357,11 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {      return pc;  } -ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo(SamplerInfo info, u32 offset, -                                               std::optional<u32> buffer) { +ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo( +    SamplerInfo info, std::optional<Tegra::Engines::SamplerDescriptor> sampler) {      if (info.IsComplete()) {          return info;      } -    const auto sampler = buffer ? registry.ObtainBindlessSampler(*buffer, offset) -                                : registry.ObtainBoundSampler(offset);      if (!sampler) {          LOG_WARNING(HW_GPU, "Unknown sampler info");          info.type = info.type.value_or(Tegra::Shader::TextureType::Texture2D); @@ -381,8 +379,8 @@ ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo(SamplerInfo info, u32 offset,  std::optional<Sampler> ShaderIR::GetSampler(Tegra::Shader::Sampler sampler,                                              SamplerInfo sampler_info) { -    const auto offset = static_cast<u32>(sampler.index.Value()); -    const auto info = GetSamplerInfo(sampler_info, offset); +    const u32 offset = static_cast<u32>(sampler.index.Value()); +    const auto info = GetSamplerInfo(sampler_info, registry.ObtainBoundSampler(offset));      // If this sampler has already been used, return the existing mapping.      
const auto it = std::find_if(used_samplers.begin(), used_samplers.end(), @@ -404,20 +402,19 @@ std::optional<Sampler> ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg,      const Node sampler_register = GetRegister(reg);      const auto [base_node, tracked_sampler_info] =          TrackBindlessSampler(sampler_register, global_code, static_cast<s64>(global_code.size())); -    ASSERT(base_node != nullptr); -    if (base_node == nullptr) { +    if (!base_node) { +        UNREACHABLE();          return std::nullopt;      } -    if (const auto bindless_sampler_info = -            std::get_if<BindlessSamplerNode>(&*tracked_sampler_info)) { -        const u32 buffer = bindless_sampler_info->GetIndex(); -        const u32 offset = bindless_sampler_info->GetOffset(); -        info = GetSamplerInfo(info, offset, buffer); +    if (const auto sampler_info = std::get_if<BindlessSamplerNode>(&*tracked_sampler_info)) { +        const u32 buffer = sampler_info->index; +        const u32 offset = sampler_info->offset; +        info = GetSamplerInfo(info, registry.ObtainBindlessSampler(buffer, offset));          // If this sampler has already been used, return the existing mapping.          
const auto it = std::find_if(used_samplers.begin(), used_samplers.end(), -                                     [buffer = buffer, offset = offset](const Sampler& entry) { +                                     [buffer, offset](const Sampler& entry) {                                           return entry.buffer == buffer && entry.offset == offset;                                       });          if (it != used_samplers.end()) { @@ -431,10 +428,32 @@ std::optional<Sampler> ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg,          return used_samplers.emplace_back(next_index, offset, buffer, *info.type, *info.is_array,                                            *info.is_shadow, *info.is_buffer, false);      } -    if (const auto array_sampler_info = std::get_if<ArraySamplerNode>(&*tracked_sampler_info)) { -        const u32 base_offset = array_sampler_info->GetBaseOffset() / 4; -        index_var = GetCustomVariable(array_sampler_info->GetIndexVar()); -        info = GetSamplerInfo(info, base_offset); +    if (const auto sampler_info = std::get_if<SeparateSamplerNode>(&*tracked_sampler_info)) { +        const std::pair indices = sampler_info->indices; +        const std::pair offsets = sampler_info->offsets; +        info = GetSamplerInfo(info, registry.ObtainSeparateSampler(indices, offsets)); + +        // Try to use an already created sampler if it exists +        const auto it = std::find_if( +            used_samplers.begin(), used_samplers.end(), [indices, offsets](const Sampler& entry) { +                return offsets == std::pair{entry.offset, entry.secondary_offset} && +                       indices == std::pair{entry.buffer, entry.secondary_buffer}; +            }); +        if (it != used_samplers.end()) { +            ASSERT(it->is_separated && it->type == info.type && it->is_array == info.is_array && +                   it->is_shadow == info.is_shadow && it->is_buffer == info.is_buffer); +            return *it; +        } + +        // 
Otherwise create a new mapping for this sampler +        const u32 next_index = static_cast<u32>(used_samplers.size()); +        return used_samplers.emplace_back(next_index, offsets, indices, *info.type, *info.is_array, +                                          *info.is_shadow, *info.is_buffer); +    } +    if (const auto sampler_info = std::get_if<ArraySamplerNode>(&*tracked_sampler_info)) { +        const u32 base_offset = sampler_info->base_offset / 4; +        index_var = GetCustomVariable(sampler_info->bindless_var); +        info = GetSamplerInfo(info, registry.ObtainBoundSampler(base_offset));          // If this sampler has already been used, return the existing mapping.          const auto it = std::find_if( diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h index c5e5165ff..8f230d57a 100644 --- a/src/video_core/shader/node.h +++ b/src/video_core/shader/node.h @@ -275,10 +275,11 @@ using Node = std::shared_ptr<NodeData>;  using Node4 = std::array<Node, 4>;  using NodeBlock = std::vector<Node>; -class BindlessSamplerNode; -class ArraySamplerNode; +struct ArraySamplerNode; +struct BindlessSamplerNode; +struct SeparateSamplerNode; -using TrackSamplerData = std::variant<BindlessSamplerNode, ArraySamplerNode>; +using TrackSamplerData = std::variant<BindlessSamplerNode, SeparateSamplerNode, ArraySamplerNode>;  using TrackSampler = std::shared_ptr<TrackSamplerData>;  struct Sampler { @@ -288,63 +289,51 @@ struct Sampler {          : index{index}, offset{offset}, type{type}, is_array{is_array}, is_shadow{is_shadow},            is_buffer{is_buffer}, is_indexed{is_indexed} {} +    /// Separate sampler constructor +    constexpr explicit Sampler(u32 index, std::pair<u32, u32> offsets, std::pair<u32, u32> buffers, +                               Tegra::Shader::TextureType type, bool is_array, bool is_shadow, +                               bool is_buffer) +        : index{index}, offset{offsets.first}, secondary_offset{offsets.second}, +        
  buffer{buffers.first}, secondary_buffer{buffers.second}, type{type}, is_array{is_array}, +          is_shadow{is_shadow}, is_buffer{is_buffer}, is_separated{true} {} +      /// Bindless samplers constructor      constexpr explicit Sampler(u32 index, u32 offset, u32 buffer, Tegra::Shader::TextureType type,                                 bool is_array, bool is_shadow, bool is_buffer, bool is_indexed)          : index{index}, offset{offset}, buffer{buffer}, type{type}, is_array{is_array},            is_shadow{is_shadow}, is_buffer{is_buffer}, is_bindless{true}, is_indexed{is_indexed} {} -    u32 index = 0;  ///< Emulated index given for the this sampler. -    u32 offset = 0; ///< Offset in the const buffer from where the sampler is being read. -    u32 buffer = 0; ///< Buffer where the bindless sampler is being read (unused on bound samplers). -    u32 size = 1;   ///< Size of the sampler. +    u32 index = 0;            ///< Emulated index given for the this sampler. +    u32 offset = 0;           ///< Offset in the const buffer from where the sampler is being read. +    u32 secondary_offset = 0; ///< Secondary offset in the const buffer. +    u32 buffer = 0;           ///< Buffer where the bindless sampler is read. +    u32 secondary_buffer = 0; ///< Secondary buffer where the bindless sampler is read. +    u32 size = 1;             ///< Size of the sampler.      Tegra::Shader::TextureType type{}; ///< The type used to sample this texture (Texture2D, etc) -    bool is_array = false;    ///< Whether the texture is being sampled as an array texture or not. -    bool is_shadow = false;   ///< Whether the texture is being sampled as a depth texture or not. -    bool is_buffer = false;   ///< Whether the texture is a texture buffer without sampler. -    bool is_bindless = false; ///< Whether this sampler belongs to a bindless texture or not. -    bool is_indexed = false;  ///< Whether this sampler is an indexed array of textures. 
+    bool is_array = false;     ///< Whether the texture is being sampled as an array texture or not. +    bool is_shadow = false;    ///< Whether the texture is being sampled as a depth texture or not. +    bool is_buffer = false;    ///< Whether the texture is a texture buffer without sampler. +    bool is_bindless = false;  ///< Whether this sampler belongs to a bindless texture or not. +    bool is_indexed = false;   ///< Whether this sampler is an indexed array of textures. +    bool is_separated = false; ///< Whether the image and sampler is separated or not.  };  /// Represents a tracked bindless sampler into a direct const buffer -class ArraySamplerNode final { -public: -    explicit ArraySamplerNode(u32 index, u32 base_offset, u32 bindless_var) -        : index{index}, base_offset{base_offset}, bindless_var{bindless_var} {} - -    constexpr u32 GetIndex() const { -        return index; -    } - -    constexpr u32 GetBaseOffset() const { -        return base_offset; -    } - -    constexpr u32 GetIndexVar() const { -        return bindless_var; -    } - -private: +struct ArraySamplerNode {      u32 index;      u32 base_offset;      u32 bindless_var;  }; -/// Represents a tracked bindless sampler into a direct const buffer -class BindlessSamplerNode final { -public: -    explicit BindlessSamplerNode(u32 index, u32 offset) : index{index}, offset{offset} {} - -    constexpr u32 GetIndex() const { -        return index; -    } - -    constexpr u32 GetOffset() const { -        return offset; -    } +/// Represents a tracked separate sampler image pair that was folded statically +struct SeparateSamplerNode { +    std::pair<u32, u32> indices; +    std::pair<u32, u32> offsets; +}; -private: +/// Represents a tracked bindless sampler into a direct const buffer +struct BindlessSamplerNode {      u32 index;      u32 offset;  }; diff --git a/src/video_core/shader/node_helper.h b/src/video_core/shader/node_helper.h index 11231bbea..1e0886185 100644 --- 
a/src/video_core/shader/node_helper.h +++ b/src/video_core/shader/node_helper.h @@ -48,7 +48,7 @@ Node MakeNode(Args&&... args) {  template <typename T, typename... Args>  TrackSampler MakeTrackSampler(Args&&... args) {      static_assert(std::is_convertible_v<T, TrackSamplerData>); -    return std::make_shared<TrackSamplerData>(T(std::forward<Args>(args)...)); +    return std::make_shared<TrackSamplerData>(T{std::forward<Args>(args)...});  }  template <typename... Args> diff --git a/src/video_core/shader/registry.cpp b/src/video_core/shader/registry.cpp index af70b3f35..cdf274e54 100644 --- a/src/video_core/shader/registry.cpp +++ b/src/video_core/shader/registry.cpp @@ -93,6 +93,26 @@ std::optional<SamplerDescriptor> Registry::ObtainBoundSampler(u32 offset) {      return value;  } +std::optional<Tegra::Engines::SamplerDescriptor> Registry::ObtainSeparateSampler( +    std::pair<u32, u32> buffers, std::pair<u32, u32> offsets) { +    SeparateSamplerKey key; +    key.buffers = buffers; +    key.offsets = offsets; +    const auto iter = separate_samplers.find(key); +    if (iter != separate_samplers.end()) { +        return iter->second; +    } +    if (!engine) { +        return std::nullopt; +    } + +    const u32 handle_1 = engine->AccessConstBuffer32(stage, key.buffers.first, key.offsets.first); +    const u32 handle_2 = engine->AccessConstBuffer32(stage, key.buffers.second, key.offsets.second); +    const SamplerDescriptor value = engine->AccessSampler(handle_1 | handle_2); +    separate_samplers.emplace(key, value); +    return value; +} +  std::optional<Tegra::Engines::SamplerDescriptor> Registry::ObtainBindlessSampler(u32 buffer,                                                                                   u32 offset) {      const std::pair key = {buffer, offset}; diff --git a/src/video_core/shader/registry.h b/src/video_core/shader/registry.h index 0c80d35fd..231206765 100644 --- a/src/video_core/shader/registry.h +++ b/src/video_core/shader/registry.h 
@@ -19,8 +19,39 @@  namespace VideoCommon::Shader { +struct SeparateSamplerKey { +    std::pair<u32, u32> buffers; +    std::pair<u32, u32> offsets; +}; + +} // namespace VideoCommon::Shader + +namespace std { + +template <> +struct hash<VideoCommon::Shader::SeparateSamplerKey> { +    std::size_t operator()(const VideoCommon::Shader::SeparateSamplerKey& key) const noexcept { +        return std::hash<u32>{}(key.buffers.first ^ key.buffers.second ^ key.offsets.first ^ +                                key.offsets.second); +    } +}; + +template <> +struct equal_to<VideoCommon::Shader::SeparateSamplerKey> { +    bool operator()(const VideoCommon::Shader::SeparateSamplerKey& lhs, +                    const VideoCommon::Shader::SeparateSamplerKey& rhs) const noexcept { +        return lhs.buffers == rhs.buffers && lhs.offsets == rhs.offsets; +    } +}; + +} // namespace std + +namespace VideoCommon::Shader { +  using KeyMap = std::unordered_map<std::pair<u32, u32>, u32, Common::PairHash>;  using BoundSamplerMap = std::unordered_map<u32, Tegra::Engines::SamplerDescriptor>; +using SeparateSamplerMap = +    std::unordered_map<SeparateSamplerKey, Tegra::Engines::SamplerDescriptor>;  using BindlessSamplerMap =      std::unordered_map<std::pair<u32, u32>, Tegra::Engines::SamplerDescriptor, Common::PairHash>; @@ -73,6 +104,9 @@ public:      std::optional<Tegra::Engines::SamplerDescriptor> ObtainBoundSampler(u32 offset); +    std::optional<Tegra::Engines::SamplerDescriptor> ObtainSeparateSampler( +        std::pair<u32, u32> buffers, std::pair<u32, u32> offsets); +      std::optional<Tegra::Engines::SamplerDescriptor> ObtainBindlessSampler(u32 buffer, u32 offset);      /// Inserts a key. 
@@ -128,6 +162,7 @@ private:      Tegra::Engines::ConstBufferEngineInterface* engine = nullptr;      KeyMap keys;      BoundSamplerMap bound_samplers; +    SeparateSamplerMap separate_samplers;      BindlessSamplerMap bindless_samplers;      u32 bound_buffer;      GraphicsInfo graphics_info; diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h index 15ae152f2..3a98b2104 100644 --- a/src/video_core/shader/shader_ir.h +++ b/src/video_core/shader/shader_ir.h @@ -330,8 +330,8 @@ private:      OperationCode GetPredicateCombiner(Tegra::Shader::PredOperation operation);      /// Queries the missing sampler info from the execution context. -    SamplerInfo GetSamplerInfo(SamplerInfo info, u32 offset, -                               std::optional<u32> buffer = std::nullopt); +    SamplerInfo GetSamplerInfo(SamplerInfo info, +                               std::optional<Tegra::Engines::SamplerDescriptor> sampler);      /// Accesses a texture sampler.      std::optional<Sampler> GetSampler(Tegra::Shader::Sampler sampler, SamplerInfo info); @@ -409,8 +409,14 @@ private:      std::tuple<Node, u32, u32> TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const; -    std::tuple<Node, TrackSampler> TrackBindlessSampler(Node tracked, const NodeBlock& code, -                                                        s64 cursor); +    std::pair<Node, TrackSampler> TrackBindlessSampler(Node tracked, const NodeBlock& code, +                                                       s64 cursor); + +    std::pair<Node, TrackSampler> HandleBindlessIndirectRead(const CbufNode& cbuf, +                                                             const OperationNode& operation, +                                                             Node gpr, Node base_offset, +                                                             Node tracked, const NodeBlock& code, +                                                             s64 cursor);      std::optional<u32> 
TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) const; diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp index eb97bfd41..d5ed81442 100644 --- a/src/video_core/shader/track.cpp +++ b/src/video_core/shader/track.cpp @@ -14,6 +14,7 @@  namespace VideoCommon::Shader {  namespace { +  std::pair<Node, s64> FindOperation(const NodeBlock& code, s64 cursor,                                     OperationCode operation_code) {      for (; cursor >= 0; --cursor) { @@ -63,7 +64,8 @@ bool AmendNodeCv(std::size_t amend_index, Node node) {      if (const auto operation = std::get_if<OperationNode>(&*node)) {          operation->SetAmendIndex(amend_index);          return true; -    } else if (const auto conditional = std::get_if<ConditionalNode>(&*node)) { +    } +    if (const auto conditional = std::get_if<ConditionalNode>(&*node)) {          conditional->SetAmendIndex(amend_index);          return true;      } @@ -72,40 +74,27 @@ bool AmendNodeCv(std::size_t amend_index, Node node) {  } // Anonymous namespace -std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, const NodeBlock& code, -                                                              s64 cursor) { +std::pair<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, const NodeBlock& code, +                                                             s64 cursor) {      if (const auto cbuf = std::get_if<CbufNode>(&*tracked)) { +        const u32 cbuf_index = cbuf->GetIndex(); +          // Constant buffer found, test if it's an immediate          const auto& offset = cbuf->GetOffset();          if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) { -            auto track = -                MakeTrackSampler<BindlessSamplerNode>(cbuf->GetIndex(), immediate->GetValue()); +            auto track = MakeTrackSampler<BindlessSamplerNode>(cbuf_index, immediate->GetValue());              return {tracked, track};          }          if 
(const auto operation = std::get_if<OperationNode>(&*offset)) {              const u32 bound_buffer = registry.GetBoundBuffer(); -            if (bound_buffer != cbuf->GetIndex()) { +            if (bound_buffer != cbuf_index) {                  return {};              } -            const auto pair = DecoupleIndirectRead(*operation); -            if (!pair) { -                return {}; +            if (const std::optional pair = DecoupleIndirectRead(*operation)) { +                auto [gpr, base_offset] = *pair; +                return HandleBindlessIndirectRead(*cbuf, *operation, gpr, base_offset, tracked, +                                                  code, cursor);              } -            auto [gpr, base_offset] = *pair; -            const auto offset_inm = std::get_if<ImmediateNode>(&*base_offset); -            const auto& gpu_driver = registry.AccessGuestDriverProfile(); -            const u32 bindless_cv = NewCustomVariable(); -            Node op = -                Operation(OperationCode::UDiv, gpr, Immediate(gpu_driver.GetTextureHandlerSize())); - -            const Node cv_node = GetCustomVariable(bindless_cv); -            Node amend_op = Operation(OperationCode::Assign, cv_node, std::move(op)); -            const std::size_t amend_index = DeclareAmend(std::move(amend_op)); -            AmendNodeCv(amend_index, code[cursor]); -            // TODO Implement Bindless Index custom variable -            auto track = MakeTrackSampler<ArraySamplerNode>(cbuf->GetIndex(), -                                                            offset_inm->GetValue(), bindless_cv); -            return {tracked, track};          }          return {};      } @@ -122,10 +111,23 @@ std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, cons          return TrackBindlessSampler(source, code, new_cursor);      }      if (const auto operation = std::get_if<OperationNode>(&*tracked)) { -        for (std::size_t i = operation->GetOperandsCount(); i > 
0; --i) { -            if (auto found = TrackBindlessSampler((*operation)[i - 1], code, cursor); -                std::get<0>(found)) { -                // Cbuf found in operand. +        const OperationNode& op = *operation; + +        const OperationCode opcode = operation->GetCode(); +        if (opcode == OperationCode::IBitwiseOr || opcode == OperationCode::UBitwiseOr) { +            ASSERT(op.GetOperandsCount() == 2); +            auto [node_a, index_a, offset_a] = TrackCbuf(op[0], code, cursor); +            auto [node_b, index_b, offset_b] = TrackCbuf(op[1], code, cursor); +            if (node_a && node_b) { +                auto track = MakeTrackSampler<SeparateSamplerNode>(std::pair{index_a, index_b}, +                                                                   std::pair{offset_a, offset_b}); +                return {tracked, std::move(track)}; +            } +        } +        std::size_t i = op.GetOperandsCount(); +        while (i--) { +            if (auto found = TrackBindlessSampler(op[i - 1], code, cursor); std::get<0>(found)) { +                // Constant buffer found in operand.                  
return found;              }          } @@ -139,6 +141,26 @@ std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, cons      return {};  } +std::pair<Node, TrackSampler> ShaderIR::HandleBindlessIndirectRead( +    const CbufNode& cbuf, const OperationNode& operation, Node gpr, Node base_offset, Node tracked, +    const NodeBlock& code, s64 cursor) { +    const auto offset_imm = std::get<ImmediateNode>(*base_offset); +    const auto& gpu_driver = registry.AccessGuestDriverProfile(); +    const u32 bindless_cv = NewCustomVariable(); +    const u32 texture_handler_size = gpu_driver.GetTextureHandlerSize(); +    Node op = Operation(OperationCode::UDiv, gpr, Immediate(texture_handler_size)); + +    Node cv_node = GetCustomVariable(bindless_cv); +    Node amend_op = Operation(OperationCode::Assign, std::move(cv_node), std::move(op)); +    const std::size_t amend_index = DeclareAmend(std::move(amend_op)); +    AmendNodeCv(amend_index, code[cursor]); + +    // TODO: Implement bindless index custom variable +    auto track = +        MakeTrackSampler<ArraySamplerNode>(cbuf.GetIndex(), offset_imm.GetValue(), bindless_cv); +    return {tracked, track}; +} +  std::tuple<Node, u32, u32> ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code,                                                 s64 cursor) const {      if (const auto cbuf = std::get_if<CbufNode>(&*tracked)) { diff --git a/src/video_core/shader_cache.h b/src/video_core/shader_cache.h new file mode 100644 index 000000000..a23c23886 --- /dev/null +++ b/src/video_core/shader_cache.h @@ -0,0 +1,228 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include <algorithm> +#include <memory> +#include <mutex> +#include <unordered_map> +#include <utility> +#include <vector> + +#include "common/assert.h" +#include "common/common_types.h" +#include "video_core/rasterizer_interface.h" + +namespace VideoCommon { + +template <class T> +class ShaderCache { +    static constexpr u64 PAGE_SHIFT = 14; + +    struct Entry { +        VAddr addr_start; +        VAddr addr_end; +        T* data; + +        bool is_memory_marked = true; + +        constexpr bool Overlaps(VAddr start, VAddr end) const noexcept { +            return start < addr_end && addr_start < end; +        } +    }; + +public: +    virtual ~ShaderCache() = default; + +    /// @brief Removes shaders inside a given region +    /// @note Checks for ranges +    /// @param addr Start address of the invalidation +    /// @param size Number of bytes of the invalidation +    void InvalidateRegion(VAddr addr, std::size_t size) { +        std::scoped_lock lock{invalidation_mutex}; +        InvalidatePagesInRegion(addr, size); +        RemovePendingShaders(); +    } + +    /// @brief Unmarks a memory region as cached and marks it for removal +    /// @param addr Start address of the CPU write operation +    /// @param size Number of bytes of the CPU write operation +    void OnCPUWrite(VAddr addr, std::size_t size) { +        std::lock_guard lock{invalidation_mutex}; +        InvalidatePagesInRegion(addr, size); +    } + +    /// @brief Flushes delayed removal operations +    void SyncGuestHost() { +        std::scoped_lock lock{invalidation_mutex}; +        RemovePendingShaders(); +    } + +    /// @brief Tries to obtain a cached shader starting in a given address +    /// @note Doesn't check for ranges, the given address has to be the start of the shader +    /// @param addr Start address of the shader, this doesn't cache for region +    /// @return Pointer to a valid shader, nullptr when nothing is found +    T* TryGet(VAddr addr) const { +       
 std::scoped_lock lock{lookup_mutex}; + +        const auto it = lookup_cache.find(addr); +        if (it == lookup_cache.end()) { +            return nullptr; +        } +        return it->second->data; +    } + +protected: +    explicit ShaderCache(VideoCore::RasterizerInterface& rasterizer_) : rasterizer{rasterizer_} {} + +    /// @brief Register in the cache a given entry +    /// @param data Shader to store in the cache +    /// @param addr Start address of the shader that will be registered +    /// @param size Size in bytes of the shader +    void Register(std::unique_ptr<T> data, VAddr addr, std::size_t size) { +        std::scoped_lock lock{invalidation_mutex, lookup_mutex}; + +        const VAddr addr_end = addr + size; +        Entry* const entry = NewEntry(addr, addr_end, data.get()); + +        const u64 page_end = addr_end >> PAGE_SHIFT; +        for (u64 page = addr >> PAGE_SHIFT; page <= page_end; ++page) { +            invalidation_cache[page].push_back(entry); +        } + +        storage.push_back(std::move(data)); + +        rasterizer.UpdatePagesCachedCount(addr, size, 1); +    } + +    /// @brief Called when a shader is going to be removed +    /// @param shader Shader that will be removed +    /// @pre invalidation_cache is locked +    /// @pre lookup_mutex is locked +    virtual void OnShaderRemoval([[maybe_unused]] T* shader) {} + +private: +    /// @brief Invalidate pages in a given region +    /// @pre invalidation_mutex is locked +    void InvalidatePagesInRegion(VAddr addr, std::size_t size) { +        const VAddr addr_end = addr + size; +        const u64 page_end = addr_end >> PAGE_SHIFT; +        for (u64 page = addr >> PAGE_SHIFT; page <= page_end; ++page) { +            const auto it = invalidation_cache.find(page); +            if (it == invalidation_cache.end()) { +                continue; +            } + +            std::vector<Entry*>& entries = it->second; +            InvalidatePageEntries(entries, addr, addr_end); + +   
         // If there's nothing else in this page, remove it to avoid overpopulating the hash map. +            if (entries.empty()) { +                invalidation_cache.erase(it); +            } +        } +    } + +    /// @brief Remove shaders marked for deletion +    /// @pre invalidation_mutex is locked +    void RemovePendingShaders() { +        if (marked_for_removal.empty()) { +            return; +        } +        std::scoped_lock lock{lookup_mutex}; + +        std::vector<T*> removed_shaders; +        removed_shaders.reserve(marked_for_removal.size()); + +        for (Entry* const entry : marked_for_removal) { +            if (lookup_cache.erase(entry->addr_start) > 0) { +                removed_shaders.push_back(entry->data); +            } +        } +        marked_for_removal.clear(); + +        if (!removed_shaders.empty()) { +            RemoveShadersFromStorage(std::move(removed_shaders)); +        } +    } + +    /// @brief Invalidates entries in a given range for the passed page +    /// @param entries         Vector of entries in the page, it will be modified on overlaps +    /// @param addr            Start address of the invalidation +    /// @param addr_end        Non-inclusive end address of the invalidation +    /// @pre invalidation_mutex is locked +    void InvalidatePageEntries(std::vector<Entry*>& entries, VAddr addr, VAddr addr_end) { +        auto it = entries.begin(); +        while (it != entries.end()) { +            Entry* const entry = *it; +            if (!entry->Overlaps(addr, addr_end)) { +                ++it; +                continue; +            } +            UnmarkMemory(entry); +            marked_for_removal.push_back(entry); + +            it = entries.erase(it); +        } +    } + +    /// @brief Unmarks an entry from the rasterizer cache +    /// @param entry Entry to unmark from memory +    void UnmarkMemory(Entry* entry) { +        if (!entry->is_memory_marked) { +            return; +        } +        
entry->is_memory_marked = false; + +        const VAddr addr = entry->addr_start; +        const std::size_t size = entry->addr_end - addr; +        rasterizer.UpdatePagesCachedCount(addr, size, -1); +    } + +    /// @brief Removes a vector of shaders from a list +    /// @param removed_shaders Shaders to be removed from the storage, it can contain duplicates +    /// @pre invalidation_mutex is locked +    /// @pre lookup_mutex is locked +    void RemoveShadersFromStorage(std::vector<T*> removed_shaders) { +        // Remove duplicates +        std::sort(removed_shaders.begin(), removed_shaders.end()); +        removed_shaders.erase(std::unique(removed_shaders.begin(), removed_shaders.end()), +                              removed_shaders.end()); + +        // Now that there are no duplicates, we can notify removals +        for (T* const shader : removed_shaders) { +            OnShaderRemoval(shader); +        } + +        // Remove them from the cache +        const auto is_removed = [&removed_shaders](std::unique_ptr<T>& shader) { +            return std::find(removed_shaders.begin(), removed_shaders.end(), shader.get()) != +                   removed_shaders.end(); +        }; +        storage.erase(std::remove_if(storage.begin(), storage.end(), is_removed), storage.end()); +    } + +    /// @brief Creates a new entry in the lookup cache and returns its pointer +    /// @pre lookup_mutex is locked +    Entry* NewEntry(VAddr addr, VAddr addr_end, T* data) { +        auto entry = std::make_unique<Entry>(Entry{addr, addr_end, data}); +        Entry* const entry_pointer = entry.get(); + +        lookup_cache.emplace(addr, std::move(entry)); +        return entry_pointer; +    } + +    VideoCore::RasterizerInterface& rasterizer; + +    mutable std::mutex lookup_mutex; +    std::mutex invalidation_mutex; + +    std::unordered_map<u64, std::unique_ptr<Entry>> lookup_cache; +    std::unordered_map<u64, std::vector<Entry*>> invalidation_cache; +    
std::vector<std::unique_ptr<T>> storage; +    std::vector<Entry*> marked_for_removal; +}; + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/surface_base.cpp b/src/video_core/texture_cache/surface_base.cpp index 715f39d0d..94d3a6ae5 100644 --- a/src/video_core/texture_cache/surface_base.cpp +++ b/src/video_core/texture_cache/surface_base.cpp @@ -248,12 +248,11 @@ void SurfaceBaseImpl::FlushBuffer(Tegra::MemoryManager& memory_manager,      // Use an extra temporal buffer      auto& tmp_buffer = staging_cache.GetBuffer(1); -    // Special case for 3D Texture Segments -    const bool must_read_current_data = -        params.block_depth > 0 && params.target == VideoCore::Surface::SurfaceTarget::Texture2D;      tmp_buffer.resize(guest_memory_size);      host_ptr = tmp_buffer.data(); -    if (must_read_current_data) { + +    if (params.target == SurfaceTarget::Texture3D) { +        // Special case for 3D texture segments          memory_manager.ReadBlockUnsafe(gpu_addr, host_ptr, guest_memory_size);      } diff --git a/src/video_core/texture_cache/surface_base.h b/src/video_core/texture_cache/surface_base.h index 79e10ffbb..173f2edba 100644 --- a/src/video_core/texture_cache/surface_base.h +++ b/src/video_core/texture_cache/surface_base.h @@ -217,8 +217,8 @@ public:      }      bool IsProtected() const { -        // Only 3D Slices are to be protected -        return is_target && params.block_depth > 0; +        // Only 3D slices are to be protected +        return is_target && params.target == SurfaceTarget::Texture3D;      }      bool IsRenderTarget() const { @@ -250,6 +250,11 @@ public:          return GetView(ViewParams(overview_params.target, 0, num_layers, 0, params.num_levels));      } +    TView Emplace3DView(u32 slice, u32 depth, u32 base_level, u32 num_levels) { +        return GetView(ViewParams(VideoCore::Surface::SurfaceTarget::Texture3D, slice, depth, +                                  base_level, num_levels)); +    } +      
std::optional<TView> EmplaceIrregularView(const SurfaceParams& view_params,                                                const GPUVAddr view_addr,                                                const std::size_t candidate_size, const u32 mipmap, @@ -272,8 +277,8 @@ public:      std::optional<TView> EmplaceView(const SurfaceParams& view_params, const GPUVAddr view_addr,                                       const std::size_t candidate_size) {          if (params.target == SurfaceTarget::Texture3D || -            (params.num_levels == 1 && !params.is_layered) || -            view_params.target == SurfaceTarget::Texture3D) { +            view_params.target == SurfaceTarget::Texture3D || +            (params.num_levels == 1 && !params.is_layered)) {              return {};          }          const auto layer_mipmap{GetLayerMipmap(view_addr)}; diff --git a/src/video_core/texture_cache/surface_params.cpp b/src/video_core/texture_cache/surface_params.cpp index 884fabffe..0b2b2b8c4 100644 --- a/src/video_core/texture_cache/surface_params.cpp +++ b/src/video_core/texture_cache/surface_params.cpp @@ -215,10 +215,19 @@ SurfaceParams SurfaceParams::CreateForFramebuffer(Core::System& system, std::siz      params.num_levels = 1;      params.emulated_levels = 1; -    const bool is_layered = config.layers > 1 && params.block_depth == 0; -    params.is_layered = is_layered; -    params.depth = is_layered ? config.layers.Value() : 1; -    params.target = is_layered ? 
SurfaceTarget::Texture2DArray : SurfaceTarget::Texture2D; +    if (config.memory_layout.is_3d != 0) { +        params.depth = config.layers.Value(); +        params.is_layered = false; +        params.target = SurfaceTarget::Texture3D; +    } else if (config.layers > 1) { +        params.depth = config.layers.Value(); +        params.is_layered = true; +        params.target = SurfaceTarget::Texture2DArray; +    } else { +        params.depth = 1; +        params.is_layered = false; +        params.target = SurfaceTarget::Texture2D; +    }      return params;  } @@ -237,7 +246,7 @@ SurfaceParams SurfaceParams::CreateForFermiCopySurface(      params.width = config.width;      params.height = config.height;      params.pitch = config.pitch; -    // TODO(Rodrigo): Try to guess the surface target from depth and layer parameters +    // TODO(Rodrigo): Try to guess texture arrays from parameters      params.target = SurfaceTarget::Texture2D;      params.depth = 1;      params.num_levels = 1; diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 6f63217a2..b543fc8c0 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -298,15 +298,13 @@ public:          const GPUVAddr src_gpu_addr = src_config.Address();          const GPUVAddr dst_gpu_addr = dst_config.Address();          DeduceBestBlit(src_params, dst_params, src_gpu_addr, dst_gpu_addr); -        const std::optional<VAddr> dst_cpu_addr = -            system.GPU().MemoryManager().GpuToCpuAddress(dst_gpu_addr); -        const std::optional<VAddr> src_cpu_addr = -            system.GPU().MemoryManager().GpuToCpuAddress(src_gpu_addr); -        std::pair<TSurface, TView> dst_surface = -            GetSurface(dst_gpu_addr, *dst_cpu_addr, dst_params, true, false); -        std::pair<TSurface, TView> src_surface = -            GetSurface(src_gpu_addr, *src_cpu_addr, src_params, true, false); -        
ImageBlit(src_surface.second, dst_surface.second, copy_config); + +        const auto& memory_manager = system.GPU().MemoryManager(); +        const std::optional<VAddr> dst_cpu_addr = memory_manager.GpuToCpuAddress(dst_gpu_addr); +        const std::optional<VAddr> src_cpu_addr = memory_manager.GpuToCpuAddress(src_gpu_addr); +        std::pair dst_surface = GetSurface(dst_gpu_addr, *dst_cpu_addr, dst_params, true, false); +        TView src_surface = GetSurface(src_gpu_addr, *src_cpu_addr, src_params, true, false).second; +        ImageBlit(src_surface, dst_surface.second, copy_config);          dst_surface.first->MarkAsModified(true, Tick());      } @@ -508,12 +506,12 @@ private:              return RecycleStrategy::Flush;          }          // 3D Textures decision -        if (params.block_depth > 1 || params.target == SurfaceTarget::Texture3D) { +        if (params.target == SurfaceTarget::Texture3D) {              return RecycleStrategy::Flush;          }          for (const auto& s : overlaps) {              const auto& s_params = s->GetSurfaceParams(); -            if (s_params.block_depth > 1 || s_params.target == SurfaceTarget::Texture3D) { +            if (s_params.target == SurfaceTarget::Texture3D) {                  return RecycleStrategy::Flush;              }          } @@ -731,51 +729,9 @@ private:       */      std::optional<std::pair<TSurface, TView>> Manage3DSurfaces(VectorSurface& overlaps,                                                                 const SurfaceParams& params, -                                                               const GPUVAddr gpu_addr, -                                                               const VAddr cpu_addr, +                                                               GPUVAddr gpu_addr, VAddr cpu_addr,                                                                 bool preserve_contents) { -        if (params.target == SurfaceTarget::Texture3D) { -            bool failed = false; -            if 
(params.num_levels > 1) { -                // We can't handle mipmaps in 3D textures yet, better fallback to LLE approach -                return std::nullopt; -            } -            TSurface new_surface = GetUncachedSurface(gpu_addr, params); -            bool modified = false; -            for (auto& surface : overlaps) { -                const SurfaceParams& src_params = surface->GetSurfaceParams(); -                if (src_params.target != SurfaceTarget::Texture2D) { -                    failed = true; -                    break; -                } -                if (src_params.height != params.height) { -                    failed = true; -                    break; -                } -                if (src_params.block_depth != params.block_depth || -                    src_params.block_height != params.block_height) { -                    failed = true; -                    break; -                } -                const u32 offset = static_cast<u32>(surface->GetCpuAddr() - cpu_addr); -                const auto offsets = params.GetBlockOffsetXYZ(offset); -                const auto z = std::get<2>(offsets); -                modified |= surface->IsModified(); -                const CopyParams copy_params(0, 0, 0, 0, 0, z, 0, 0, params.width, params.height, -                                             1); -                ImageCopy(surface, new_surface, copy_params); -            } -            if (failed) { -                return std::nullopt; -            } -            for (const auto& surface : overlaps) { -                Unregister(surface); -            } -            new_surface->MarkAsModified(modified, Tick()); -            Register(new_surface); -            auto view = new_surface->GetMainView(); -            return {{std::move(new_surface), view}}; -        } else { +        if (params.target != SurfaceTarget::Texture3D) {              for (const auto& surface : overlaps) {                  if (!surface->MatchTarget(params.target)) {  
                    if (overlaps.size() == 1 && surface->GetCpuAddr() == cpu_addr) { @@ -791,11 +747,60 @@ private:                      continue;                  }                  if (surface->MatchesStructure(params) == MatchStructureResult::FullMatch) { -                    return {{surface, surface->GetMainView()}}; +                    return std::make_pair(surface, surface->GetMainView());                  }              }              return InitializeSurface(gpu_addr, params, preserve_contents);          } + +        if (params.num_levels > 1) { +            // We can't handle mipmaps in 3D textures yet, better fallback to LLE approach +            return std::nullopt; +        } + +        if (overlaps.size() == 1) { +            const auto& surface = overlaps[0]; +            const SurfaceParams& overlap_params = surface->GetSurfaceParams(); +            // Don't attempt to render to textures with more than one level for now +            // The texture has to be to the right or the sample address if we want to render to it +            if (overlap_params.num_levels == 1 && cpu_addr >= surface->GetCpuAddr()) { +                const u32 offset = static_cast<u32>(cpu_addr - surface->GetCpuAddr()); +                const u32 slice = std::get<2>(params.GetBlockOffsetXYZ(offset)); +                if (slice < overlap_params.depth) { +                    auto view = surface->Emplace3DView(slice, params.depth, 0, 1); +                    return std::make_pair(std::move(surface), std::move(view)); +                } +            } +        } + +        TSurface new_surface = GetUncachedSurface(gpu_addr, params); +        bool modified = false; + +        for (auto& surface : overlaps) { +            const SurfaceParams& src_params = surface->GetSurfaceParams(); +            if (src_params.target != SurfaceTarget::Texture2D || +                src_params.height != params.height || +                src_params.block_depth != params.block_depth || +                
src_params.block_height != params.block_height) { +                return std::nullopt; +            } +            modified |= surface->IsModified(); + +            const u32 offset = static_cast<u32>(surface->GetCpuAddr() - cpu_addr); +            const u32 slice = std::get<2>(params.GetBlockOffsetXYZ(offset)); +            const u32 width = params.width; +            const u32 height = params.height; +            const CopyParams copy_params(0, 0, 0, 0, 0, slice, 0, 0, width, height, 1); +            ImageCopy(surface, new_surface, copy_params); +        } +        for (const auto& surface : overlaps) { +            Unregister(surface); +        } +        new_surface->MarkAsModified(modified, Tick()); +        Register(new_surface); + +        TView view = new_surface->GetMainView(); +        return std::make_pair(std::move(new_surface), std::move(view));      }      /** @@ -873,7 +878,7 @@ private:              }          } -        // Check if it's a 3D texture +        // Manage 3D textures          if (params.block_depth > 0) {              auto surface =                  Manage3DSurfaces(overlaps, params, gpu_addr, cpu_addr, preserve_contents); diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp index d6c9e5013..32c81dc70 100644 --- a/src/yuzu/configuration/config.cpp +++ b/src/yuzu/configuration/config.cpp @@ -533,6 +533,8 @@ void Config::ReadDebuggingValues() {      Settings::values.quest_flag = ReadSetting(QStringLiteral("quest_flag"), false).toBool();      Settings::values.disable_cpu_opt =          ReadSetting(QStringLiteral("disable_cpu_opt"), false).toBool(); +    Settings::values.disable_macro_jit = +        ReadSetting(QStringLiteral("disable_macro_jit"), false).toBool();      qt_config->endGroup();  } @@ -1007,6 +1009,7 @@ void Config::SaveDebuggingValues() {      WriteSetting(QStringLiteral("dump_nso"), Settings::values.dump_nso, false);      WriteSetting(QStringLiteral("quest_flag"), Settings::values.quest_flag, 
false);      WriteSetting(QStringLiteral("disable_cpu_opt"), Settings::values.disable_cpu_opt, false); +    WriteSetting(QStringLiteral("disable_macro_jit"), Settings::values.disable_macro_jit, false);      qt_config->endGroup();  } diff --git a/src/yuzu/configuration/configure_debug.cpp b/src/yuzu/configuration/configure_debug.cpp index c2026763e..2c77441fd 100644 --- a/src/yuzu/configuration/configure_debug.cpp +++ b/src/yuzu/configuration/configure_debug.cpp @@ -39,6 +39,8 @@ void ConfigureDebug::SetConfiguration() {      ui->disable_cpu_opt->setChecked(Settings::values.disable_cpu_opt);      ui->enable_graphics_debugging->setEnabled(!Core::System::GetInstance().IsPoweredOn());      ui->enable_graphics_debugging->setChecked(Settings::values.renderer_debug); +    ui->disable_macro_jit->setEnabled(!Core::System::GetInstance().IsPoweredOn()); +    ui->disable_macro_jit->setChecked(Settings::values.disable_macro_jit);  }  void ConfigureDebug::ApplyConfiguration() { @@ -51,6 +53,7 @@ void ConfigureDebug::ApplyConfiguration() {      Settings::values.quest_flag = ui->quest_flag->isChecked();      Settings::values.disable_cpu_opt = ui->disable_cpu_opt->isChecked();      Settings::values.renderer_debug = ui->enable_graphics_debugging->isChecked(); +    Settings::values.disable_macro_jit = ui->disable_macro_jit->isChecked();      Debugger::ToggleConsole();      Log::Filter filter;      filter.ParseFilterString(Settings::values.log_filter); diff --git a/src/yuzu/configuration/configure_debug.ui b/src/yuzu/configuration/configure_debug.ui index e0d4c4a44..46f0208c6 100644 --- a/src/yuzu/configuration/configure_debug.ui +++ b/src/yuzu/configuration/configure_debug.ui @@ -148,6 +148,19 @@          </property>         </widget>        </item> +      <item> +       <widget class="QCheckBox" name="disable_macro_jit"> +        <property name="enabled"> +         <bool>true</bool> +        </property> +        <property name="whatsThis"> +         <string>When checked, it disables 
the macro Just In Time compiler. Enabled this makes games run slower</string> +        </property> +        <property name="text"> +         <string>Disable Macro JIT</string> +        </property> +       </widget> +      </item>       </layout>      </widget>     </item> diff --git a/src/yuzu_cmd/config.cpp b/src/yuzu_cmd/config.cpp index 5f9cc158e..659b9f701 100644 --- a/src/yuzu_cmd/config.cpp +++ b/src/yuzu_cmd/config.cpp @@ -430,6 +430,8 @@ void Config::ReadValues() {      Settings::values.quest_flag = sdl2_config->GetBoolean("Debugging", "quest_flag", false);      Settings::values.disable_cpu_opt =          sdl2_config->GetBoolean("Debugging", "disable_cpu_opt", false); +    Settings::values.disable_macro_jit = +        sdl2_config->GetBoolean("Debugging", "disable_macro_jit", false);      const auto title_list = sdl2_config->Get("AddOns", "title_ids", "");      std::stringstream ss(title_list); diff --git a/src/yuzu_cmd/default_ini.h b/src/yuzu_cmd/default_ini.h index 102502084..45c07ed5d 100644 --- a/src/yuzu_cmd/default_ini.h +++ b/src/yuzu_cmd/default_ini.h @@ -286,6 +286,8 @@ quest_flag =  # Determines whether or not JIT CPU optimizations are enabled  # false: Optimizations Enabled, true: Optimizations Disabled  disable_cpu_opt = +# Enables/Disables the macro JIT compiler +disable_macro_jit=false  [WebService]  # Whether or not to enable telemetry | 
