diff options
21 files changed, 1125 insertions, 149 deletions
| diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index db9332d00..4b0c6346f 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -37,6 +37,7 @@ add_library(video_core STATIC      memory_manager.h      morton.cpp      morton.h +    query_cache.h      rasterizer_accelerated.cpp      rasterizer_accelerated.h      rasterizer_cache.cpp @@ -74,6 +75,8 @@ add_library(video_core STATIC      renderer_opengl/gl_stream_buffer.h      renderer_opengl/gl_texture_cache.cpp      renderer_opengl/gl_texture_cache.h +    renderer_opengl/gl_query_cache.cpp +    renderer_opengl/gl_query_cache.h      renderer_opengl/maxwell_to_gl.h      renderer_opengl/renderer_opengl.cpp      renderer_opengl/renderer_opengl.h @@ -177,6 +180,8 @@ if (ENABLE_VULKAN)          renderer_vulkan/vk_memory_manager.h          renderer_vulkan/vk_pipeline_cache.cpp          renderer_vulkan/vk_pipeline_cache.h +        renderer_vulkan/vk_query_cache.cpp +        renderer_vulkan/vk_query_cache.h          renderer_vulkan/vk_rasterizer.cpp          renderer_vulkan/vk_rasterizer.h          renderer_vulkan/vk_renderpass_cache.cpp diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 0b3e8749b..b28de1092 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -4,6 +4,7 @@  #include <cinttypes>  #include <cstring> +#include <optional>  #include "common/assert.h"  #include "core/core.h"  #include "core/core_timing.h" @@ -16,6 +17,8 @@  namespace Tegra::Engines { +using VideoCore::QueryType; +  /// First register id that is actually a Macro call.  
constexpr u32 MacroRegistersStart = 0xE00; @@ -400,6 +403,10 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {          ProcessQueryCondition();          break;      } +    case MAXWELL3D_REG_INDEX(counter_reset): { +        ProcessCounterReset(); +        break; +    }      case MAXWELL3D_REG_INDEX(sync_info): {          ProcessSyncPoint();          break; @@ -482,7 +489,7 @@ void Maxwell3D::FlushMMEInlineDraw() {      const bool is_indexed = mme_draw.current_mode == MMEDrawMode::Indexed;      if (ShouldExecute()) { -        rasterizer.DrawMultiBatch(is_indexed); +        rasterizer.Draw(is_indexed, true);      }      // TODO(bunnei): Below, we reset vertex count so that we can use these registers to determine if @@ -544,40 +551,28 @@ void Maxwell3D::ProcessQueryGet() {                 "Units other than CROP are unimplemented");      switch (regs.query.query_get.operation) { -    case Regs::QueryOperation::Release: { -        const u64 result = regs.query.query_sequence; -        StampQueryResult(result, regs.query.query_get.short_query == 0); +    case Regs::QueryOperation::Release: +        StampQueryResult(regs.query.query_sequence, regs.query.query_get.short_query == 0);          break; -    } -    case Regs::QueryOperation::Acquire: { -        // Todo(Blinkhawk): Under this operation, the GPU waits for the CPU -        // to write a value that matches the current payload. +    case Regs::QueryOperation::Acquire: +        // TODO(Blinkhawk): Under this operation, the GPU waits for the CPU to write a value that +        // matches the current payload.          
UNIMPLEMENTED_MSG("Unimplemented query operation ACQUIRE");          break; -    } -    case Regs::QueryOperation::Counter: { -        u64 result{}; -        switch (regs.query.query_get.select) { -        case Regs::QuerySelect::Zero: -            result = 0; -            break; -        default: -            result = 1; -            UNIMPLEMENTED_MSG("Unimplemented query select type {}", -                              static_cast<u32>(regs.query.query_get.select.Value())); +    case Regs::QueryOperation::Counter: +        if (const std::optional<u64> result = GetQueryResult()) { +            // If the query returns an empty optional it means it's cached and deferred. +            // In this case we have a non-empty result, so we stamp it immediately. +            StampQueryResult(*result, regs.query.query_get.short_query == 0);          } -        StampQueryResult(result, regs.query.query_get.short_query == 0);          break; -    } -    case Regs::QueryOperation::Trap: { +    case Regs::QueryOperation::Trap:          UNIMPLEMENTED_MSG("Unimplemented query operation TRAP");          break; -    } -    default: { +    default:          UNIMPLEMENTED_MSG("Unknown query operation");          break;      } -    }  }  void Maxwell3D::ProcessQueryCondition() { @@ -593,20 +588,20 @@ void Maxwell3D::ProcessQueryCondition() {      }      case Regs::ConditionMode::ResNonZero: {          Regs::QueryCompare cmp; -        memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp)); +        memory_manager.ReadBlock(condition_address, &cmp, sizeof(cmp));          execute_on = cmp.initial_sequence != 0U && cmp.initial_mode != 0U;          break;      }      case Regs::ConditionMode::Equal: {          Regs::QueryCompare cmp; -        memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp)); +        memory_manager.ReadBlock(condition_address, &cmp, sizeof(cmp));          execute_on =              cmp.initial_sequence == cmp.current_sequence && 
cmp.initial_mode == cmp.current_mode;          break;      }      case Regs::ConditionMode::NotEqual: {          Regs::QueryCompare cmp; -        memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp)); +        memory_manager.ReadBlock(condition_address, &cmp, sizeof(cmp));          execute_on =              cmp.initial_sequence != cmp.current_sequence || cmp.initial_mode != cmp.current_mode;          break; @@ -619,6 +614,18 @@ void Maxwell3D::ProcessQueryCondition() {      }  } +void Maxwell3D::ProcessCounterReset() { +    switch (regs.counter_reset) { +    case Regs::CounterReset::SampleCnt: +        rasterizer.ResetCounter(QueryType::SamplesPassed); +        break; +    default: +        LOG_WARNING(Render_OpenGL, "Unimplemented counter reset={}", +                    static_cast<int>(regs.counter_reset)); +        break; +    } +} +  void Maxwell3D::ProcessSyncPoint() {      const u32 sync_point = regs.sync_info.sync_point.Value();      const u32 increment = regs.sync_info.increment.Value(); @@ -647,7 +654,7 @@ void Maxwell3D::DrawArrays() {      const bool is_indexed{regs.index_array.count && !regs.vertex_buffer.count};      if (ShouldExecute()) { -        rasterizer.DrawBatch(is_indexed); +        rasterizer.Draw(is_indexed, false);      }      // TODO(bunnei): Below, we reset vertex count so that we can use these registers to determine if @@ -661,6 +668,22 @@ void Maxwell3D::DrawArrays() {      }  } +std::optional<u64> Maxwell3D::GetQueryResult() { +    switch (regs.query.query_get.select) { +    case Regs::QuerySelect::Zero: +        return 0; +    case Regs::QuerySelect::SamplesPassed: +        // Deferred. 
+        rasterizer.Query(regs.query.QueryAddress(), VideoCore::QueryType::SamplesPassed, +                         system.GPU().GetTicks()); +        return {}; +    default: +        UNIMPLEMENTED_MSG("Unimplemented query select type {}", +                          static_cast<u32>(regs.query.query_get.select.Value())); +        return 1; +    } +} +  void Maxwell3D::ProcessCBBind(std::size_t stage_index) {      // Bind the buffer currently in CB_ADDRESS to the specified index in the desired shader stage.      auto& shader = state.shader_stages[stage_index]; diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 0a2af54e5..26939be3f 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -6,6 +6,7 @@  #include <array>  #include <bitset> +#include <optional>  #include <type_traits>  #include <unordered_map>  #include <vector> @@ -409,6 +410,27 @@ public:              Linear = 1,          }; +        enum class CounterReset : u32 { +            SampleCnt = 0x01, +            Unk02 = 0x02, +            Unk03 = 0x03, +            Unk04 = 0x04, +            EmittedPrimitives = 0x10, // Not tested +            Unk11 = 0x11, +            Unk12 = 0x12, +            Unk13 = 0x13, +            Unk15 = 0x15, +            Unk16 = 0x16, +            Unk17 = 0x17, +            Unk18 = 0x18, +            Unk1A = 0x1A, +            Unk1B = 0x1B, +            Unk1C = 0x1C, +            Unk1D = 0x1D, +            Unk1E = 0x1E, +            GeneratedPrimitives = 0x1F, +        }; +          struct Cull {              enum class FrontFace : u32 {                  ClockWise = 0x0900, @@ -857,7 +879,7 @@ public:                      BitField<7, 1, u32> c7;                  } clip_distance_enabled; -                INSERT_UNION_PADDING_WORDS(0x1); +                u32 samplecnt_enable;                  float point_size; @@ -865,7 +887,11 @@ public:                  u32 point_sprite_enable; -                
INSERT_UNION_PADDING_WORDS(0x5); +                INSERT_UNION_PADDING_WORDS(0x3); + +                CounterReset counter_reset; + +                INSERT_UNION_PADDING_WORDS(0x1);                  u32 zeta_enable; @@ -1412,12 +1438,15 @@ private:      /// Handles a write to the QUERY_GET register.      void ProcessQueryGet(); -    // Writes the query result accordingly +    /// Writes the query result accordingly.      void StampQueryResult(u64 payload, bool long_query); -    // Handles Conditional Rendering +    /// Handles conditional rendering.      void ProcessQueryCondition(); +    /// Handles counter resets. +    void ProcessCounterReset(); +      /// Handles writes to syncing register.      void ProcessSyncPoint(); @@ -1434,6 +1463,9 @@ private:      // Handles a instance drawcall from MME      void StepInstance(MMEDrawMode expected_mode, u32 count); + +    /// Returns a query's value or an empty object if the value will be deferred through a cache. +    std::optional<u64> GetQueryResult();  };  #define ASSERT_REG_POSITION(field_name, position)                                                  \ @@ -1499,8 +1531,10 @@ ASSERT_REG_POSITION(screen_y_control, 0x4EB);  ASSERT_REG_POSITION(vb_element_base, 0x50D);  ASSERT_REG_POSITION(vb_base_instance, 0x50E);  ASSERT_REG_POSITION(clip_distance_enabled, 0x544); +ASSERT_REG_POSITION(samplecnt_enable, 0x545);  ASSERT_REG_POSITION(point_size, 0x546);  ASSERT_REG_POSITION(point_sprite_enable, 0x548); +ASSERT_REG_POSITION(counter_reset, 0x54C);  ASSERT_REG_POSITION(zeta_enable, 0x54E);  ASSERT_REG_POSITION(multisample_control, 0x54F);  ASSERT_REG_POSITION(condition, 0x554); diff --git a/src/video_core/query_cache.h b/src/video_core/query_cache.h new file mode 100644 index 000000000..e66054ed0 --- /dev/null +++ b/src/video_core/query_cache.h @@ -0,0 +1,359 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include <algorithm> +#include <array> +#include <cstring> +#include <iterator> +#include <memory> +#include <mutex> +#include <optional> +#include <unordered_map> +#include <vector> + +#include "common/assert.h" +#include "core/core.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" +#include "video_core/rasterizer_interface.h" + +namespace VideoCommon { + +template <class QueryCache, class HostCounter> +class CounterStreamBase { +public: +    explicit CounterStreamBase(QueryCache& cache, VideoCore::QueryType type) +        : cache{cache}, type{type} {} + +    /// Updates the state of the stream, enabling or disabling as needed. +    void Update(bool enabled) { +        if (enabled) { +            Enable(); +        } else { +            Disable(); +        } +    } + +    /// Resets the stream to zero. It doesn't disable the query after resetting. +    void Reset() { +        if (current) { +            current->EndQuery(); + +            // Immediately start a new query to avoid disabling its state. +            current = cache.Counter(nullptr, type); +        } +        last = nullptr; +    } + +    /// Returns the current counter slicing as needed. +    std::shared_ptr<HostCounter> Current() { +        if (!current) { +            return nullptr; +        } +        current->EndQuery(); +        last = std::move(current); +        current = cache.Counter(last, type); +        return last; +    } + +    /// Returns true when the counter stream is enabled. +    bool IsEnabled() const { +        return current != nullptr; +    } + +private: +    /// Enables the stream. +    void Enable() { +        if (current) { +            return; +        } +        current = cache.Counter(last, type); +    } + +    // Disables the stream. 
+    void Disable() { +        if (current) { +            current->EndQuery(); +        } +        last = std::exchange(current, nullptr); +    } + +    QueryCache& cache; +    const VideoCore::QueryType type; + +    std::shared_ptr<HostCounter> current; +    std::shared_ptr<HostCounter> last; +}; + +template <class QueryCache, class CachedQuery, class CounterStream, class HostCounter, +          class QueryPool> +class QueryCacheBase { +public: +    explicit QueryCacheBase(Core::System& system, VideoCore::RasterizerInterface& rasterizer) +        : system{system}, rasterizer{rasterizer}, streams{{CounterStream{ +                                                      static_cast<QueryCache&>(*this), +                                                      VideoCore::QueryType::SamplesPassed}}} {} + +    void InvalidateRegion(CacheAddr addr, std::size_t size) { +        std::unique_lock lock{mutex}; +        FlushAndRemoveRegion(addr, size); +    } + +    void FlushRegion(CacheAddr addr, std::size_t size) { +        std::unique_lock lock{mutex}; +        FlushAndRemoveRegion(addr, size); +    } + +    /** +     * Records a query in GPU mapped memory, potentially marked with a timestamp. +     * @param gpu_addr  GPU address to flush to when the mapped memory is read. +     * @param type      Query type, e.g. SamplesPassed. +     * @param timestamp Timestamp, when empty the flushed query is assumed to be short. 
+     */ +    void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) { +        std::unique_lock lock{mutex}; +        auto& memory_manager = system.GPU().MemoryManager(); +        const auto host_ptr = memory_manager.GetPointer(gpu_addr); + +        CachedQuery* query = TryGet(ToCacheAddr(host_ptr)); +        if (!query) { +            const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr); +            ASSERT_OR_EXECUTE(cpu_addr, return;); + +            query = Register(type, *cpu_addr, host_ptr, timestamp.has_value()); +        } + +        query->BindCounter(Stream(type).Current(), timestamp); +    } + +    /// Updates counters from GPU state. Expected to be called once per draw, clear or dispatch. +    void UpdateCounters() { +        std::unique_lock lock{mutex}; +        const auto& regs = system.GPU().Maxwell3D().regs; +        Stream(VideoCore::QueryType::SamplesPassed).Update(regs.samplecnt_enable); +    } + +    /// Resets a counter to zero. It doesn't disable the query after resetting. +    void ResetCounter(VideoCore::QueryType type) { +        std::unique_lock lock{mutex}; +        Stream(type).Reset(); +    } + +    /// Disable all active streams. Expected to be called at the end of a command buffer. +    void DisableStreams() { +        std::unique_lock lock{mutex}; +        for (auto& stream : streams) { +            stream.Update(false); +        } +    } + +    /// Returns a new host counter. +    std::shared_ptr<HostCounter> Counter(std::shared_ptr<HostCounter> dependency, +                                         VideoCore::QueryType type) { +        return std::make_shared<HostCounter>(static_cast<QueryCache&>(*this), std::move(dependency), +                                             type); +    } + +    /// Returns the counter stream of the specified type. 
+    CounterStream& Stream(VideoCore::QueryType type) { +        return streams[static_cast<std::size_t>(type)]; +    } + +    /// Returns the counter stream of the specified type. +    const CounterStream& Stream(VideoCore::QueryType type) const { +        return streams[static_cast<std::size_t>(type)]; +    } + +protected: +    std::array<QueryPool, VideoCore::NumQueryTypes> query_pools; + +private: +    /// Flushes a memory range to guest memory and removes it from the cache. +    void FlushAndRemoveRegion(CacheAddr addr, std::size_t size) { +        const u64 addr_begin = static_cast<u64>(addr); +        const u64 addr_end = addr_begin + static_cast<u64>(size); +        const auto in_range = [addr_begin, addr_end](CachedQuery& query) { +            const u64 cache_begin = query.GetCacheAddr(); +            const u64 cache_end = cache_begin + query.SizeInBytes(); +            return cache_begin < addr_end && addr_begin < cache_end; +        }; + +        const u64 page_end = addr_end >> PAGE_SHIFT; +        for (u64 page = addr_begin >> PAGE_SHIFT; page <= page_end; ++page) { +            const auto& it = cached_queries.find(page); +            if (it == std::end(cached_queries)) { +                continue; +            } +            auto& contents = it->second; +            for (auto& query : contents) { +                if (!in_range(query)) { +                    continue; +                } +                rasterizer.UpdatePagesCachedCount(query.CpuAddr(), query.SizeInBytes(), -1); +                query.Flush(); +            } +            contents.erase(std::remove_if(std::begin(contents), std::end(contents), in_range), +                           std::end(contents)); +        } +    } + +    /// Registers the passed parameters as cached and returns a pointer to the stored cached query. 
+    CachedQuery* Register(VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr, bool timestamp) { +        rasterizer.UpdatePagesCachedCount(cpu_addr, CachedQuery::SizeInBytes(timestamp), 1); +        const u64 page = static_cast<u64>(ToCacheAddr(host_ptr)) >> PAGE_SHIFT; +        return &cached_queries[page].emplace_back(static_cast<QueryCache&>(*this), type, cpu_addr, +                                                  host_ptr); +    } + +    /// Tries to a get a cached query. Returns nullptr on failure. +    CachedQuery* TryGet(CacheAddr addr) { +        const u64 page = static_cast<u64>(addr) >> PAGE_SHIFT; +        const auto it = cached_queries.find(page); +        if (it == std::end(cached_queries)) { +            return nullptr; +        } +        auto& contents = it->second; +        const auto found = +            std::find_if(std::begin(contents), std::end(contents), +                         [addr](auto& query) { return query.GetCacheAddr() == addr; }); +        return found != std::end(contents) ? &*found : nullptr; +    } + +    static constexpr std::uintptr_t PAGE_SIZE = 4096; +    static constexpr unsigned PAGE_SHIFT = 12; + +    Core::System& system; +    VideoCore::RasterizerInterface& rasterizer; + +    std::recursive_mutex mutex; + +    std::unordered_map<u64, std::vector<CachedQuery>> cached_queries; + +    std::array<CounterStream, VideoCore::NumQueryTypes> streams; +}; + +template <class QueryCache, class HostCounter> +class HostCounterBase { +public: +    explicit HostCounterBase(std::shared_ptr<HostCounter> dependency_) +        : dependency{std::move(dependency_)}, depth{dependency ? (dependency->Depth() + 1) : 0} { +        // Avoid nesting too many dependencies to avoid a stack overflow when these are deleted. 
+        constexpr u64 depth_threshold = 96; +        if (depth > depth_threshold) { +            depth = 0; +            base_result = dependency->Query(); +            dependency = nullptr; +        } +    } +    virtual ~HostCounterBase() = default; + +    /// Returns the current value of the query. +    u64 Query() { +        if (result) { +            return *result; +        } + +        u64 value = BlockingQuery() + base_result; +        if (dependency) { +            value += dependency->Query(); +            dependency = nullptr; +        } + +        result = value; +        return *result; +    } + +    /// Returns true when flushing this query will potentially wait. +    bool WaitPending() const noexcept { +        return result.has_value(); +    } + +    u64 Depth() const noexcept { +        return depth; +    } + +protected: +    /// Returns the value of query from the backend API blocking as needed. +    virtual u64 BlockingQuery() const = 0; + +private: +    std::shared_ptr<HostCounter> dependency; ///< Counter to add to this value. +    std::optional<u64> result;               ///< Filled with the already returned value. +    u64 depth;                               ///< Number of nested dependencies. +    u64 base_result = 0;                     ///< Equivalent to nested dependencies value. +}; + +template <class HostCounter> +class CachedQueryBase { +public: +    explicit CachedQueryBase(VAddr cpu_addr, u8* host_ptr) +        : cpu_addr{cpu_addr}, host_ptr{host_ptr} {} +    virtual ~CachedQueryBase() = default; + +    CachedQueryBase(CachedQueryBase&&) noexcept = default; +    CachedQueryBase(const CachedQueryBase&) = delete; + +    CachedQueryBase& operator=(CachedQueryBase&&) noexcept = default; +    CachedQueryBase& operator=(const CachedQueryBase&) = delete; + +    /// Flushes the query to guest memory. +    virtual void Flush() { +        // When counter is nullptr it means that it's just been reseted. 
We are supposed to write a +        // zero in these cases. +        const u64 value = counter ? counter->Query() : 0; +        std::memcpy(host_ptr, &value, sizeof(u64)); + +        if (timestamp) { +            std::memcpy(host_ptr + TIMESTAMP_OFFSET, &*timestamp, sizeof(u64)); +        } +    } + +    /// Binds a counter to this query. +    void BindCounter(std::shared_ptr<HostCounter> counter_, std::optional<u64> timestamp_) { +        if (counter) { +            // If there's an old counter set it means the query is being rewritten by the game. +            // To avoid losing the data forever, flush here. +            Flush(); +        } +        counter = std::move(counter_); +        timestamp = timestamp_; +    } + +    VAddr CpuAddr() const noexcept { +        return cpu_addr; +    } + +    CacheAddr GetCacheAddr() const noexcept { +        return ToCacheAddr(host_ptr); +    } + +    u64 SizeInBytes() const noexcept { +        return SizeInBytes(timestamp.has_value()); +    } + +    static constexpr u64 SizeInBytes(bool with_timestamp) noexcept { +        return with_timestamp ? LARGE_QUERY_SIZE : SMALL_QUERY_SIZE; +    } + +protected: +    /// Returns true when querying the counter may potentially block. +    bool WaitPending() const noexcept { +        return counter && counter->WaitPending(); +    } + +private: +    static constexpr std::size_t SMALL_QUERY_SIZE = 8;   // Query size without timestamp. +    static constexpr std::size_t LARGE_QUERY_SIZE = 16;  // Query size with timestamp. +    static constexpr std::intptr_t TIMESTAMP_OFFSET = 8; // Timestamp offset in a large query. + +    VAddr cpu_addr;                       ///< Guest CPU address. +    u8* host_ptr;                         ///< Writable host pointer. +    std::shared_ptr<HostCounter> counter; ///< Host counter to query, owns the dependency tree. +    std::optional<u64> timestamp;         ///< Timestamp to flush to guest memory. 
+}; + +} // namespace VideoCommon diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index c586cd6fe..f18eaf4bc 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -6,6 +6,7 @@  #include <atomic>  #include <functional> +#include <optional>  #include "common/common_types.h"  #include "video_core/engines/fermi_2d.h"  #include "video_core/gpu.h" @@ -17,6 +18,11 @@ class MemoryManager;  namespace VideoCore { +enum class QueryType { +    SamplesPassed, +}; +constexpr std::size_t NumQueryTypes = 1; +  enum class LoadCallbackStage {      Prepare,      Decompile, @@ -29,11 +35,8 @@ class RasterizerInterface {  public:      virtual ~RasterizerInterface() {} -    /// Draw the current batch of vertex arrays -    virtual bool DrawBatch(bool is_indexed) = 0; - -    /// Draw the current batch of multiple instances of vertex arrays -    virtual bool DrawMultiBatch(bool is_indexed) = 0; +    /// Dispatches a draw invocation +    virtual void Draw(bool is_indexed, bool is_instanced) = 0;      /// Clear the current framebuffer      virtual void Clear() = 0; @@ -41,6 +44,12 @@ public:      /// Dispatches a compute shader invocation      virtual void DispatchCompute(GPUVAddr code_addr) = 0; +    /// Resets the counter of a query +    virtual void ResetCounter(QueryType type) = 0; + +    /// Records a GPU query and caches it +    virtual void Query(GPUVAddr gpu_addr, QueryType type, std::optional<u64> timestamp) = 0; +      /// Notify rasterizer that all caches should be flushed to Switch memory      virtual void FlushAll() = 0; diff --git a/src/video_core/renderer_opengl/gl_query_cache.cpp b/src/video_core/renderer_opengl/gl_query_cache.cpp new file mode 100644 index 000000000..f12e9f55f --- /dev/null +++ b/src/video_core/renderer_opengl/gl_query_cache.cpp @@ -0,0 +1,120 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file 
included. + +#include <algorithm> +#include <cstring> +#include <memory> +#include <unordered_map> +#include <utility> +#include <vector> + +#include <glad/glad.h> + +#include "common/assert.h" +#include "core/core.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/memory_manager.h" +#include "video_core/renderer_opengl/gl_query_cache.h" +#include "video_core/renderer_opengl/gl_rasterizer.h" + +namespace OpenGL { + +namespace { + +constexpr std::array<GLenum, VideoCore::NumQueryTypes> QueryTargets = {GL_SAMPLES_PASSED}; + +constexpr GLenum GetTarget(VideoCore::QueryType type) { +    return QueryTargets[static_cast<std::size_t>(type)]; +} + +} // Anonymous namespace + +QueryCache::QueryCache(Core::System& system, RasterizerOpenGL& gl_rasterizer) +    : VideoCommon::QueryCacheBase< +          QueryCache, CachedQuery, CounterStream, HostCounter, +          std::vector<OGLQuery>>{system, +                                 static_cast<VideoCore::RasterizerInterface&>(gl_rasterizer)}, +      gl_rasterizer{gl_rasterizer} {} + +QueryCache::~QueryCache() = default; + +OGLQuery QueryCache::AllocateQuery(VideoCore::QueryType type) { +    auto& reserve = query_pools[static_cast<std::size_t>(type)]; +    OGLQuery query; +    if (reserve.empty()) { +        query.Create(GetTarget(type)); +        return query; +    } + +    query = std::move(reserve.back()); +    reserve.pop_back(); +    return query; +} + +void QueryCache::Reserve(VideoCore::QueryType type, OGLQuery&& query) { +    query_pools[static_cast<std::size_t>(type)].push_back(std::move(query)); +} + +bool QueryCache::AnyCommandQueued() const noexcept { +    return gl_rasterizer.AnyCommandQueued(); +} + +HostCounter::HostCounter(QueryCache& cache, std::shared_ptr<HostCounter> dependency, +                         VideoCore::QueryType type) +    : VideoCommon::HostCounterBase<QueryCache, HostCounter>{std::move(dependency)}, cache{cache}, +      type{type}, query{cache.AllocateQuery(type)} { +    
glBeginQuery(GetTarget(type), query.handle); +} + +HostCounter::~HostCounter() { +    cache.Reserve(type, std::move(query)); +} + +void HostCounter::EndQuery() { +    if (!cache.AnyCommandQueued()) { +        // There are chances a query waited on without commands (glDraw, glClear, glDispatch). Not +        // having any of these causes a lock. glFlush is considered a command, so we can safely wait +        // for this. Insert to the OpenGL command stream a flush. +        glFlush(); +    } +    glEndQuery(GetTarget(type)); +} + +u64 HostCounter::BlockingQuery() const { +    GLint64 value; +    glGetQueryObjecti64v(query.handle, GL_QUERY_RESULT, &value); +    return static_cast<u64>(value); +} + +CachedQuery::CachedQuery(QueryCache& cache, VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr) +    : VideoCommon::CachedQueryBase<HostCounter>{cpu_addr, host_ptr}, cache{&cache}, type{type} {} + +CachedQuery::CachedQuery(CachedQuery&& rhs) noexcept +    : VideoCommon::CachedQueryBase<HostCounter>(std::move(rhs)), cache{rhs.cache}, type{rhs.type} {} + +CachedQuery& CachedQuery::operator=(CachedQuery&& rhs) noexcept { +    VideoCommon::CachedQueryBase<HostCounter>::operator=(std::move(rhs)); +    cache = rhs.cache; +    type = rhs.type; +    return *this; +} + +void CachedQuery::Flush() { +    // Waiting for a query while another query of the same target is enabled locks Nvidia's driver. +    // To avoid this disable and re-enable keeping the dependency stream. +    // But we only have to do this if we have pending waits to be done. 
+    auto& stream = cache->Stream(type); +    const bool slice_counter = WaitPending() && stream.IsEnabled(); +    if (slice_counter) { +        stream.Update(false); +    } + +    VideoCommon::CachedQueryBase<HostCounter>::Flush(); + +    if (slice_counter) { +        stream.Update(true); +    } +} + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_query_cache.h b/src/video_core/renderer_opengl/gl_query_cache.h new file mode 100644 index 000000000..d8e7052a1 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_query_cache.h @@ -0,0 +1,78 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <array> +#include <memory> +#include <vector> + +#include "common/common_types.h" +#include "video_core/query_cache.h" +#include "video_core/rasterizer_interface.h" +#include "video_core/renderer_opengl/gl_resource_manager.h" + +namespace Core { +class System; +} + +namespace OpenGL { + +class CachedQuery; +class HostCounter; +class QueryCache; +class RasterizerOpenGL; + +using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>; + +class QueryCache final : public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, +                                                            HostCounter, std::vector<OGLQuery>> { +public: +    explicit QueryCache(Core::System& system, RasterizerOpenGL& rasterizer); +    ~QueryCache(); + +    OGLQuery AllocateQuery(VideoCore::QueryType type); + +    void Reserve(VideoCore::QueryType type, OGLQuery&& query); + +    bool AnyCommandQueued() const noexcept; + +private: +    RasterizerOpenGL& gl_rasterizer; +}; + +class HostCounter final : public VideoCommon::HostCounterBase<QueryCache, HostCounter> { +public: +    explicit HostCounter(QueryCache& cache, std::shared_ptr<HostCounter> dependency, +                         VideoCore::QueryType type); +    ~HostCounter(); + +    void 
EndQuery(); + +private: +    u64 BlockingQuery() const override; + +    QueryCache& cache; +    const VideoCore::QueryType type; +    OGLQuery query; +}; + +class CachedQuery final : public VideoCommon::CachedQueryBase<HostCounter> { +public: +    explicit CachedQuery(QueryCache& cache, VideoCore::QueryType type, VAddr cpu_addr, +                         u8* host_ptr); +    CachedQuery(CachedQuery&& rhs) noexcept; +    CachedQuery(const CachedQuery&) = delete; + +    CachedQuery& operator=(CachedQuery&& rhs) noexcept; +    CachedQuery& operator=(const CachedQuery&) = delete; + +    void Flush() override; + +private: +    QueryCache* cache; +    VideoCore::QueryType type; +}; + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index b0eb14c8b..e1965fb21 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -25,6 +25,7 @@  #include "video_core/engines/maxwell_3d.h"  #include "video_core/engines/shader_type.h"  #include "video_core/memory_manager.h" +#include "video_core/renderer_opengl/gl_query_cache.h"  #include "video_core/renderer_opengl/gl_rasterizer.h"  #include "video_core/renderer_opengl/gl_shader_cache.h"  #include "video_core/renderer_opengl/gl_shader_gen.h" @@ -92,8 +93,8 @@ std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer,  RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,                                     ScreenInfo& info)      : RasterizerAccelerated{system.Memory()}, texture_cache{system, *this, device}, -      shader_cache{*this, system, emu_window, device}, system{system}, screen_info{info}, -      buffer_cache{*this, system, device, STREAM_BUFFER_SIZE} { +      shader_cache{*this, system, emu_window, device}, query_cache{system, *this}, system{system}, +      screen_info{info}, buffer_cache{*this, system, device, 
STREAM_BUFFER_SIZE} {      shader_program_manager = std::make_unique<GLShader::ProgramManager>();      state.draw.shader_program = 0;      state.Apply(); @@ -541,11 +542,16 @@ void RasterizerOpenGL::Clear() {      } else if (use_stencil) {          glClearBufferiv(GL_STENCIL, 0, &regs.clear_stencil);      } + +    ++num_queued_commands;  }  void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {      MICROPROFILE_SCOPE(OpenGL_Drawing);      auto& gpu = system.GPU().Maxwell3D(); +    const auto& regs = gpu.regs; + +    query_cache.UpdateCounters();      SyncRasterizeEnable(state);      SyncColorMask(); @@ -611,7 +617,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {      // Setup shaders and their used resources.      texture_cache.GuardSamplers(true); -    const auto primitive_mode = MaxwellToGL::PrimitiveTopology(gpu.regs.draw.topology); +    const GLenum primitive_mode = MaxwellToGL::PrimitiveTopology(gpu.regs.draw.topology);      SetupShaders(primitive_mode);      texture_cache.GuardSamplers(false); @@ -638,35 +644,47 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {          glTextureBarrier();      } +    ++num_queued_commands; +      const GLuint base_instance = static_cast<GLuint>(gpu.regs.vb_base_instance);      const GLsizei num_instances =          static_cast<GLsizei>(is_instanced ? 
gpu.mme_draw.instance_count : 1);      if (is_indexed) { -        const GLenum index_format = MaxwellToGL::IndexFormat(gpu.regs.index_array.format);          const GLint base_vertex = static_cast<GLint>(gpu.regs.vb_element_base);          const GLsizei num_vertices = static_cast<GLsizei>(gpu.regs.index_array.count); -        glDrawElementsInstancedBaseVertexBaseInstance( -            primitive_mode, num_vertices, index_format, -            reinterpret_cast<const void*>(index_buffer_offset), num_instances, base_vertex, -            base_instance); +        const GLvoid* offset = reinterpret_cast<const GLvoid*>(index_buffer_offset); +        const GLenum format = MaxwellToGL::IndexFormat(gpu.regs.index_array.format); +        if (num_instances == 1 && base_instance == 0 && base_vertex == 0) { +            glDrawElements(primitive_mode, num_vertices, format, offset); +        } else if (num_instances == 1 && base_instance == 0) { +            glDrawElementsBaseVertex(primitive_mode, num_vertices, format, offset, base_vertex); +        } else if (base_vertex == 0 && base_instance == 0) { +            glDrawElementsInstanced(primitive_mode, num_vertices, format, offset, num_instances); +        } else if (base_vertex == 0) { +            glDrawElementsInstancedBaseInstance(primitive_mode, num_vertices, format, offset, +                                                num_instances, base_instance); +        } else if (base_instance == 0) { +            glDrawElementsInstancedBaseVertex(primitive_mode, num_vertices, format, offset, +                                              num_instances, base_vertex); +        } else { +            glDrawElementsInstancedBaseVertexBaseInstance(primitive_mode, num_vertices, format, +                                                          offset, num_instances, base_vertex, +                                                          base_instance); +        }      } else {          const GLint base_vertex = 
static_cast<GLint>(gpu.regs.vertex_buffer.first);          const GLsizei num_vertices = static_cast<GLsizei>(gpu.regs.vertex_buffer.count); -        glDrawArraysInstancedBaseInstance(primitive_mode, base_vertex, num_vertices, num_instances, -                                          base_instance); +        if (num_instances == 1 && base_instance == 0) { +            glDrawArrays(primitive_mode, base_vertex, num_vertices); +        } else if (base_instance == 0) { +            glDrawArraysInstanced(primitive_mode, base_vertex, num_vertices, num_instances); +        } else { +            glDrawArraysInstancedBaseInstance(primitive_mode, base_vertex, num_vertices, +                                              num_instances, base_instance); +        }      }  } -bool RasterizerOpenGL::DrawBatch(bool is_indexed) { -    Draw(is_indexed, false); -    return true; -} - -bool RasterizerOpenGL::DrawMultiBatch(bool is_indexed) { -    Draw(is_indexed, true); -    return true; -} -  void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {      if (device.HasBrokenCompute()) {          return; @@ -707,6 +725,16 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {      state.ApplyProgramPipeline();      glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z); +    ++num_queued_commands; +} + +void RasterizerOpenGL::ResetCounter(VideoCore::QueryType type) { +    query_cache.ResetCounter(type); +} + +void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, +                             std::optional<u64> timestamp) { +    query_cache.Query(gpu_addr, type, timestamp);  }  void RasterizerOpenGL::FlushAll() {} @@ -718,6 +746,7 @@ void RasterizerOpenGL::FlushRegion(CacheAddr addr, u64 size) {      }      texture_cache.FlushRegion(addr, size);      buffer_cache.FlushRegion(addr, size); +    query_cache.FlushRegion(addr, size);  }  void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) { @@ -728,6 +757,7 @@ 
void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) {      texture_cache.InvalidateRegion(addr, size);      shader_cache.InvalidateRegion(addr, size);      buffer_cache.InvalidateRegion(addr, size); +    query_cache.InvalidateRegion(addr, size);  }  void RasterizerOpenGL::FlushAndInvalidateRegion(CacheAddr addr, u64 size) { @@ -738,10 +768,18 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {  }  void RasterizerOpenGL::FlushCommands() { +    // Only flush when we have commands queued to OpenGL. +    if (num_queued_commands == 0) { +        return; +    } +    num_queued_commands = 0;      glFlush();  }  void RasterizerOpenGL::TickFrame() { +    // Ticking a frame means that buffers will be swapped, calling glFlush implicitly. +    num_queued_commands = 0; +      buffer_cache.TickFrame();  } diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 0501f3828..68abe9a21 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -24,6 +24,7 @@  #include "video_core/renderer_opengl/gl_buffer_cache.h"  #include "video_core/renderer_opengl/gl_device.h"  #include "video_core/renderer_opengl/gl_framebuffer_cache.h" +#include "video_core/renderer_opengl/gl_query_cache.h"  #include "video_core/renderer_opengl/gl_resource_manager.h"  #include "video_core/renderer_opengl/gl_sampler_cache.h"  #include "video_core/renderer_opengl/gl_shader_cache.h" @@ -57,10 +58,11 @@ public:                                ScreenInfo& info);      ~RasterizerOpenGL() override; -    bool DrawBatch(bool is_indexed) override; -    bool DrawMultiBatch(bool is_indexed) override; +    void Draw(bool is_indexed, bool is_instanced) override;      void Clear() override;      void DispatchCompute(GPUVAddr code_addr) override; +    void ResetCounter(VideoCore::QueryType type) override; +    void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, 
std::optional<u64> timestamp) override;      void FlushAll() override;      void FlushRegion(CacheAddr addr, u64 size) override;      void InvalidateRegion(CacheAddr addr, u64 size) override; @@ -75,6 +77,11 @@ public:      void LoadDiskResources(const std::atomic_bool& stop_loading,                             const VideoCore::DiskResourceLoadCallback& callback) override; +    /// Returns true when there are commands queued to the OpenGL server. +    bool AnyCommandQueued() const { +        return num_queued_commands > 0; +    } +  private:      /// Configures the color and depth framebuffer states.      void ConfigureFramebuffers(); @@ -102,9 +109,6 @@ private:      void SetupGlobalMemory(u32 binding, const GLShader::GlobalMemoryEntry& entry, GPUVAddr gpu_addr,                             std::size_t size); -    /// Syncs all the state, shaders, render targets and textures setting before a draw call. -    void Draw(bool is_indexed, bool is_instanced); -      /// Configures the current textures to use for the draw command.      
void SetupDrawTextures(std::size_t stage_index, const Shader& shader); @@ -180,10 +184,23 @@ private:      /// Syncs the alpha test state to match the guest state      void SyncAlphaTest(); -    /// Check for extension that are not strictly required -    /// but are needed for correct emulation +    /// Check for extensions that are not strictly required but are needed for correct emulation      void CheckExtensions(); +    std::size_t CalculateVertexArraysSize() const; + +    std::size_t CalculateIndexBufferSize() const; + +    /// Updates and returns a vertex array object representing current vertex format +    GLuint SetupVertexFormat(); + +    void SetupVertexBuffer(GLuint vao); +    void SetupVertexInstances(GLuint vao); + +    GLintptr SetupIndexBuffer(); + +    void SetupShaders(GLenum primitive_mode); +      const Device device;      OpenGLState state; @@ -191,6 +208,7 @@ private:      ShaderCacheOpenGL shader_cache;      SamplerCacheOpenGL sampler_cache;      FramebufferCacheOpenGL framebuffer_cache; +    QueryCache query_cache;      Core::System& system;      ScreenInfo& screen_info; @@ -208,19 +226,8 @@ private:      BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER};      BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER}; -    std::size_t CalculateVertexArraysSize() const; - -    std::size_t CalculateIndexBufferSize() const; - -    /// Updates and returns a vertex array object representing current vertex format -    GLuint SetupVertexFormat(); - -    void SetupVertexBuffer(GLuint vao); -    void SetupVertexInstances(GLuint vao); - -    GLintptr SetupIndexBuffer(); - -    void SetupShaders(GLenum primitive_mode); +    /// Number of commands queued to the OpenGL driver. Reset on flush. 
+    std::size_t num_queued_commands = 0;  };  } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp index 5c96c1d46..f0ddfb276 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.cpp +++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp @@ -207,4 +207,21 @@ void OGLFramebuffer::Release() {      handle = 0;  } +void OGLQuery::Create(GLenum target) { +    if (handle != 0) +        return; + +    MICROPROFILE_SCOPE(OpenGL_ResourceCreation); +    glCreateQueries(target, 1, &handle); +} + +void OGLQuery::Release() { +    if (handle == 0) +        return; + +    MICROPROFILE_SCOPE(OpenGL_ResourceDeletion); +    glDeleteQueries(1, &handle); +    handle = 0; +} +  } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h index 3a85a1d4c..514d1d165 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.h +++ b/src/video_core/renderer_opengl/gl_resource_manager.h @@ -266,4 +266,29 @@ public:      GLuint handle = 0;  }; +class OGLQuery : private NonCopyable { +public: +    OGLQuery() = default; + +    OGLQuery(OGLQuery&& o) noexcept : handle(std::exchange(o.handle, 0)) {} + +    ~OGLQuery() { +        Release(); +    } + +    OGLQuery& operator=(OGLQuery&& o) noexcept { +        Release(); +        handle = std::exchange(o.handle, 0); +        return *this; +    } + +    /// Creates a new internal OpenGL resource and stores the handle +    void Create(GLenum target); + +    /// Deletes the internal OpenGL resource +    void Release(); + +    GLuint handle = 0; +}; +  } // namespace OpenGL diff --git a/src/video_core/renderer_vulkan/vk_device.cpp b/src/video_core/renderer_vulkan/vk_device.cpp index 9840f26e5..588a6835f 100644 --- a/src/video_core/renderer_vulkan/vk_device.cpp +++ b/src/video_core/renderer_vulkan/vk_device.cpp @@ -104,6 +104,7 @@ bool VKDevice::Create(const 
vk::DispatchLoaderDynamic& dldi, vk::Instance instan      features.depthBiasClamp = true;      features.geometryShader = true;      features.tessellationShader = true; +    features.occlusionQueryPrecise = true;      features.fragmentStoresAndAtomics = true;      features.shaderImageGatherExtended = true;      features.shaderStorageImageWriteWithoutFormat = true; @@ -117,6 +118,10 @@ bool VKDevice::Create(const vk::DispatchLoaderDynamic& dldi, vk::Instance instan      bit8_storage.uniformAndStorageBuffer8BitAccess = true;      SetNext(next, bit8_storage); +    vk::PhysicalDeviceHostQueryResetFeaturesEXT host_query_reset; +    host_query_reset.hostQueryReset = true; +    SetNext(next, host_query_reset); +      vk::PhysicalDeviceFloat16Int8FeaturesKHR float16_int8;      if (is_float16_supported) {          float16_int8.shaderFloat16 = true; @@ -273,6 +278,7 @@ bool VKDevice::IsSuitable(const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDev          VK_EXT_VERTEX_ATTRIBUTE_DIVISOR_EXTENSION_NAME,          VK_EXT_SHADER_SUBGROUP_BALLOT_EXTENSION_NAME,          VK_EXT_SHADER_SUBGROUP_VOTE_EXTENSION_NAME, +        VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME,      };      std::bitset<required_extensions.size()> available_extensions{}; @@ -340,6 +346,7 @@ bool VKDevice::IsSuitable(const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDev          std::make_pair(features.depthBiasClamp, "depthBiasClamp"),          std::make_pair(features.geometryShader, "geometryShader"),          std::make_pair(features.tessellationShader, "tessellationShader"), +        std::make_pair(features.occlusionQueryPrecise, "occlusionQueryPrecise"),          std::make_pair(features.fragmentStoresAndAtomics, "fragmentStoresAndAtomics"),          std::make_pair(features.shaderImageGatherExtended, "shaderImageGatherExtended"),          std::make_pair(features.shaderStorageImageWriteWithoutFormat, @@ -376,7 +383,7 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami          }  
    }; -    extensions.reserve(13); +    extensions.reserve(14);      extensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME);      extensions.push_back(VK_KHR_16BIT_STORAGE_EXTENSION_NAME);      extensions.push_back(VK_KHR_8BIT_STORAGE_EXTENSION_NAME); @@ -384,6 +391,7 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami      extensions.push_back(VK_EXT_VERTEX_ATTRIBUTE_DIVISOR_EXTENSION_NAME);      extensions.push_back(VK_EXT_SHADER_SUBGROUP_BALLOT_EXTENSION_NAME);      extensions.push_back(VK_EXT_SHADER_SUBGROUP_VOTE_EXTENSION_NAME); +    extensions.push_back(VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME);      [[maybe_unused]] const bool nsight =          std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED"); diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp new file mode 100644 index 000000000..ffbf60dda --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp @@ -0,0 +1,122 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include <algorithm> +#include <cstddef> +#include <cstdint> +#include <utility> +#include <vector> + +#include "video_core/renderer_vulkan/declarations.h" +#include "video_core/renderer_vulkan/vk_device.h" +#include "video_core/renderer_vulkan/vk_query_cache.h" +#include "video_core/renderer_vulkan/vk_resource_manager.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" + +namespace Vulkan { + +namespace { + +constexpr std::array QUERY_TARGETS = {vk::QueryType::eOcclusion}; + +constexpr vk::QueryType GetTarget(VideoCore::QueryType type) { +    return QUERY_TARGETS[static_cast<std::size_t>(type)]; +} + +} // Anonymous namespace + +QueryPool::QueryPool() : VKFencedPool{GROW_STEP} {} + +QueryPool::~QueryPool() = default; + +void QueryPool::Initialize(const VKDevice& device_, VideoCore::QueryType type_) { +    device = &device_; +    type = type_; +} + +std::pair<vk::QueryPool, std::uint32_t> QueryPool::Commit(VKFence& fence) { +    std::size_t index; +    do { +        index = CommitResource(fence); +    } while (usage[index]); +    usage[index] = true; + +    return {*pools[index / GROW_STEP], static_cast<std::uint32_t>(index % GROW_STEP)}; +} + +void QueryPool::Allocate(std::size_t begin, std::size_t end) { +    usage.resize(end); + +    const auto dev = device->GetLogical(); +    const u32 size = static_cast<u32>(end - begin); +    const vk::QueryPoolCreateInfo query_pool_ci({}, GetTarget(type), size, {}); +    pools.push_back(dev.createQueryPoolUnique(query_pool_ci, nullptr, device->GetDispatchLoader())); +} + +void QueryPool::Reserve(std::pair<vk::QueryPool, std::uint32_t> query) { +    const auto it = +        std::find_if(std::begin(pools), std::end(pools), +                     [query_pool = query.first](auto& pool) { return query_pool == *pool; }); +    ASSERT(it != std::end(pools)); + +    const std::ptrdiff_t pool_index = std::distance(std::begin(pools), it); +    usage[pool_index * GROW_STEP + static_cast<std::ptrdiff_t>(query.second)] = false; +} 
+ +VKQueryCache::VKQueryCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer, +                           const VKDevice& device, VKScheduler& scheduler) +    : VideoCommon::QueryCacheBase<VKQueryCache, CachedQuery, CounterStream, HostCounter, +                                  QueryPool>{system, rasterizer}, +      device{device}, scheduler{scheduler} { +    for (std::size_t i = 0; i < static_cast<std::size_t>(VideoCore::NumQueryTypes); ++i) { +        query_pools[i].Initialize(device, static_cast<VideoCore::QueryType>(i)); +    } +} + +VKQueryCache::~VKQueryCache() = default; + +std::pair<vk::QueryPool, std::uint32_t> VKQueryCache::AllocateQuery(VideoCore::QueryType type) { +    return query_pools[static_cast<std::size_t>(type)].Commit(scheduler.GetFence()); +} + +void VKQueryCache::Reserve(VideoCore::QueryType type, +                           std::pair<vk::QueryPool, std::uint32_t> query) { +    query_pools[static_cast<std::size_t>(type)].Reserve(query); +} + +HostCounter::HostCounter(VKQueryCache& cache, std::shared_ptr<HostCounter> dependency, +                         VideoCore::QueryType type) +    : VideoCommon::HostCounterBase<VKQueryCache, HostCounter>{std::move(dependency)}, cache{cache}, +      type{type}, query{cache.AllocateQuery(type)}, ticks{cache.Scheduler().Ticks()} { +    const auto dev = cache.Device().GetLogical(); +    cache.Scheduler().Record([dev, query = query](vk::CommandBuffer cmdbuf, auto& dld) { +        dev.resetQueryPoolEXT(query.first, query.second, 1, dld); +        cmdbuf.beginQuery(query.first, query.second, vk::QueryControlFlagBits::ePrecise, dld); +    }); +} + +HostCounter::~HostCounter() { +    cache.Reserve(type, query); +} + +void HostCounter::EndQuery() { +    cache.Scheduler().Record([query = query](auto cmdbuf, auto& dld) { +        cmdbuf.endQuery(query.first, query.second, dld); +    }); +} + +u64 HostCounter::BlockingQuery() const { +    if (ticks >= cache.Scheduler().Ticks()) { +        
cache.Scheduler().Flush(); +    } + +    const auto dev = cache.Device().GetLogical(); +    const auto& dld = cache.Device().GetDispatchLoader(); +    u64 value; +    dev.getQueryPoolResults(query.first, query.second, 1, sizeof(value), &value, sizeof(value), +                            vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait, dld); +    return value; +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_query_cache.h b/src/video_core/renderer_vulkan/vk_query_cache.h new file mode 100644 index 000000000..c3092ee96 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_query_cache.h @@ -0,0 +1,104 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <cstddef> +#include <cstdint> +#include <memory> +#include <utility> +#include <vector> + +#include "common/common_types.h" +#include "video_core/query_cache.h" +#include "video_core/renderer_vulkan/declarations.h" +#include "video_core/renderer_vulkan/vk_resource_manager.h" + +namespace VideoCore { +class RasterizerInterface; +} + +namespace Vulkan { + +class CachedQuery; +class HostCounter; +class VKDevice; +class VKQueryCache; +class VKScheduler; + +using CounterStream = VideoCommon::CounterStreamBase<VKQueryCache, HostCounter>; + +class QueryPool final : public VKFencedPool { +public: +    explicit QueryPool(); +    ~QueryPool() override; + +    void Initialize(const VKDevice& device, VideoCore::QueryType type); + +    std::pair<vk::QueryPool, std::uint32_t> Commit(VKFence& fence); + +    void Reserve(std::pair<vk::QueryPool, std::uint32_t> query); + +protected: +    void Allocate(std::size_t begin, std::size_t end) override; + +private: +    static constexpr std::size_t GROW_STEP = 512; + +    const VKDevice* device = nullptr; +    VideoCore::QueryType type = {}; + +    std::vector<UniqueQueryPool> pools; +    std::vector<bool> usage; +}; + +class VKQueryCache final + 
   : public VideoCommon::QueryCacheBase<VKQueryCache, CachedQuery, CounterStream, HostCounter, +                                         QueryPool> { +public: +    explicit VKQueryCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer, +                          const VKDevice& device, VKScheduler& scheduler); +    ~VKQueryCache(); + +    std::pair<vk::QueryPool, std::uint32_t> AllocateQuery(VideoCore::QueryType type); + +    void Reserve(VideoCore::QueryType type, std::pair<vk::QueryPool, std::uint32_t> query); + +    const VKDevice& Device() const noexcept { +        return device; +    } + +    VKScheduler& Scheduler() const noexcept { +        return scheduler; +    } + +private: +    const VKDevice& device; +    VKScheduler& scheduler; +}; + +class HostCounter final : public VideoCommon::HostCounterBase<VKQueryCache, HostCounter> { +public: +    explicit HostCounter(VKQueryCache& cache, std::shared_ptr<HostCounter> dependency, +                         VideoCore::QueryType type); +    ~HostCounter(); + +    void EndQuery(); + +private: +    u64 BlockingQuery() const override; + +    VKQueryCache& cache; +    const VideoCore::QueryType type; +    const std::pair<vk::QueryPool, std::uint32_t> query; +    const u64 ticks; +}; + +class CachedQuery : public VideoCommon::CachedQueryBase<HostCounter> { +public: +    explicit CachedQuery(VKQueryCache&, VideoCore::QueryType, VAddr cpu_addr, u8* host_ptr) +        : VideoCommon::CachedQueryBase<HostCounter>{cpu_addr, host_ptr} {} +}; + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index aada38702..31c078f6a 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -289,25 +289,19 @@ RasterizerVulkan::RasterizerVulkan(Core::System& system, Core::Frontend::EmuWind                      staging_pool),        pipeline_cache(system, *this, device, scheduler, 
descriptor_pool, update_descriptor_queue),        buffer_cache(*this, system, device, memory_manager, scheduler, staging_pool), -      sampler_cache(device) {} - -RasterizerVulkan::~RasterizerVulkan() = default; - -bool RasterizerVulkan::DrawBatch(bool is_indexed) { -    Draw(is_indexed, false); -    return true; +      sampler_cache(device), query_cache(system, *this, device, scheduler) { +    scheduler.SetQueryCache(query_cache);  } -bool RasterizerVulkan::DrawMultiBatch(bool is_indexed) { -    Draw(is_indexed, true); -    return true; -} +RasterizerVulkan::~RasterizerVulkan() = default;  void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) {      MICROPROFILE_SCOPE(Vulkan_Drawing);      FlushWork(); +    query_cache.UpdateCounters(); +      const auto& gpu = system.GPU().Maxwell3D();      GraphicsPipelineCacheKey key{GetFixedPipelineState(gpu.regs)}; @@ -362,6 +356,8 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) {  void RasterizerVulkan::Clear() {      MICROPROFILE_SCOPE(Vulkan_Clearing); +    query_cache.UpdateCounters(); +      const auto& gpu = system.GPU().Maxwell3D();      if (!system.GPU().Maxwell3D().ShouldExecute()) {          return; @@ -429,6 +425,8 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) {      sampled_views.clear();      image_views.clear(); +    query_cache.UpdateCounters(); +      const auto& launch_desc = system.GPU().KeplerCompute().launch_description;      const ComputePipelineCacheKey key{          code_addr, @@ -471,17 +469,28 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) {      });  } +void RasterizerVulkan::ResetCounter(VideoCore::QueryType type) { +    query_cache.ResetCounter(type); +} + +void RasterizerVulkan::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, +                             std::optional<u64> timestamp) { +    query_cache.Query(gpu_addr, type, timestamp); +} +  void RasterizerVulkan::FlushAll() {}  void RasterizerVulkan::FlushRegion(CacheAddr addr, u64 
size) {      texture_cache.FlushRegion(addr, size);      buffer_cache.FlushRegion(addr, size); +    query_cache.FlushRegion(addr, size);  }  void RasterizerVulkan::InvalidateRegion(CacheAddr addr, u64 size) {      texture_cache.InvalidateRegion(addr, size);      pipeline_cache.InvalidateRegion(addr, size);      buffer_cache.InvalidateRegion(addr, size); +    query_cache.InvalidateRegion(addr, size);  }  void RasterizerVulkan::FlushAndInvalidateRegion(CacheAddr addr, u64 size) { diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 7be71e734..138903d60 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -24,6 +24,7 @@  #include "video_core/renderer_vulkan/vk_descriptor_pool.h"  #include "video_core/renderer_vulkan/vk_memory_manager.h"  #include "video_core/renderer_vulkan/vk_pipeline_cache.h" +#include "video_core/renderer_vulkan/vk_query_cache.h"  #include "video_core/renderer_vulkan/vk_renderpass_cache.h"  #include "video_core/renderer_vulkan/vk_resource_manager.h"  #include "video_core/renderer_vulkan/vk_sampler_cache.h" @@ -96,7 +97,7 @@ struct ImageView {      vk::ImageLayout* layout = nullptr;  }; -class RasterizerVulkan : public VideoCore::RasterizerAccelerated { +class RasterizerVulkan final : public VideoCore::RasterizerAccelerated {  public:      explicit RasterizerVulkan(Core::System& system, Core::Frontend::EmuWindow& render_window,                                VKScreenInfo& screen_info, const VKDevice& device, @@ -104,10 +105,11 @@ public:                                VKScheduler& scheduler);      ~RasterizerVulkan() override; -    bool DrawBatch(bool is_indexed) override; -    bool DrawMultiBatch(bool is_indexed) override; +    void Draw(bool is_indexed, bool is_instanced) override;      void Clear() override;      void DispatchCompute(GPUVAddr code_addr) override; +    void ResetCounter(VideoCore::QueryType type) override; + 
   void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override;      void FlushAll() override;      void FlushRegion(CacheAddr addr, u64 size) override;      void InvalidateRegion(CacheAddr addr, u64 size) override; @@ -140,8 +142,6 @@ private:      static constexpr std::size_t ZETA_TEXCEPTION_INDEX = 8; -    void Draw(bool is_indexed, bool is_instanced); -      void FlushWork();      Texceptions UpdateAttachments(); @@ -247,6 +247,7 @@ private:      VKPipelineCache pipeline_cache;      VKBufferCache buffer_cache;      VKSamplerCache sampler_cache; +    VKQueryCache query_cache;      std::array<View, Maxwell::NumRenderTargets> color_attachments;      View zeta_attachment; diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index d66133ad1..92bd6c344 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -6,6 +6,7 @@  #include "common/microprofile.h"  #include "video_core/renderer_vulkan/declarations.h"  #include "video_core/renderer_vulkan/vk_device.h" +#include "video_core/renderer_vulkan/vk_query_cache.h"  #include "video_core/renderer_vulkan/vk_resource_manager.h"  #include "video_core/renderer_vulkan/vk_scheduler.h" @@ -139,6 +140,8 @@ void VKScheduler::SubmitExecution(vk::Semaphore semaphore) {  }  void VKScheduler::AllocateNewContext() { +    ++ticks; +      std::unique_lock lock{mutex};      current_fence = next_fence;      next_fence = &resource_manager.CommitFence(); @@ -146,6 +149,10 @@ void VKScheduler::AllocateNewContext() {      current_cmdbuf = resource_manager.CommitCommandBuffer(*current_fence);      current_cmdbuf.begin({vk::CommandBufferUsageFlagBits::eOneTimeSubmit},                           device.GetDispatchLoader()); +    // Enable counters once again. These are disabled when a command buffer is finished. 
+    if (query_cache) { +        query_cache->UpdateCounters(); +    }  }  void VKScheduler::InvalidateState() { @@ -159,6 +166,7 @@ void VKScheduler::InvalidateState() {  }  void VKScheduler::EndPendingOperations() { +    query_cache->DisableStreams();      EndRenderPass();  } diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index bcdffbba0..62fd7858b 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -4,6 +4,7 @@  #pragma once +#include <atomic>  #include <condition_variable>  #include <memory>  #include <optional> @@ -18,6 +19,7 @@ namespace Vulkan {  class VKDevice;  class VKFence; +class VKQueryCache;  class VKResourceManager;  class VKFenceView { @@ -67,6 +69,11 @@ public:      /// Binds a pipeline to the current execution context.      void BindGraphicsPipeline(vk::Pipeline pipeline); +    /// Assigns the query cache. +    void SetQueryCache(VKQueryCache& query_cache_) { +        query_cache = &query_cache_; +    } +      /// Returns true when viewports have been set in the current command buffer.      bool TouchViewports() {          return std::exchange(state.viewports, true); @@ -112,6 +119,11 @@ public:          return current_fence;      } +    /// Returns the current command buffer tick. 
+    u64 Ticks() const { +        return ticks; +    } +  private:      class Command {      public: @@ -205,6 +217,8 @@ private:      const VKDevice& device;      VKResourceManager& resource_manager; +    VKQueryCache* query_cache = nullptr; +      vk::CommandBuffer current_cmdbuf;      VKFence* current_fence = nullptr;      VKFence* next_fence = nullptr; @@ -227,6 +241,7 @@ private:      Common::SPSCQueue<std::unique_ptr<CommandChunk>> chunk_reserve;      std::mutex mutex;      std::condition_variable cv; +    std::atomic<u64> ticks = 0;      bool quit = false;  }; diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index 24a658dce..f64f5da28 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ -275,12 +275,14 @@ public:          AddCapability(spv::Capability::ImageGatherExtended);          AddCapability(spv::Capability::SampledBuffer);          AddCapability(spv::Capability::StorageImageWriteWithoutFormat); +        AddCapability(spv::Capability::DrawParameters);          AddCapability(spv::Capability::SubgroupBallotKHR);          AddCapability(spv::Capability::SubgroupVoteKHR);          AddExtension("SPV_KHR_shader_ballot");          AddExtension("SPV_KHR_subgroup_vote");          AddExtension("SPV_KHR_storage_buffer_storage_class");          AddExtension("SPV_KHR_variable_pointers"); +        AddExtension("SPV_KHR_shader_draw_parameters");          if (ir.UsesViewportIndex()) {              AddCapability(spv::Capability::MultiViewport); @@ -492,9 +494,11 @@ private:          interfaces.push_back(AddGlobalVariable(Name(out_vertex, "out_vertex")));          // Declare input attributes -        vertex_index = DeclareInputBuiltIn(spv::BuiltIn::VertexIndex, t_in_uint, "vertex_index"); +        vertex_index = DeclareInputBuiltIn(spv::BuiltIn::VertexIndex, t_in_int, "vertex_index");          instance_index = -     
       DeclareInputBuiltIn(spv::BuiltIn::InstanceIndex, t_in_uint, "instance_index"); +            DeclareInputBuiltIn(spv::BuiltIn::InstanceIndex, t_in_int, "instance_index"); +        base_vertex = DeclareInputBuiltIn(spv::BuiltIn::BaseVertex, t_in_int, "base_vertex"); +        base_instance = DeclareInputBuiltIn(spv::BuiltIn::BaseInstance, t_in_int, "base_instance");      }      void DeclareTessControl() { @@ -1068,9 +1072,12 @@ private:                      return {OpLoad(t_float, AccessElement(t_in_float, tess_coord, element)),                              Type::Float};                  case 2: -                    return {OpLoad(t_uint, instance_index), Type::Uint}; +                    return { +                        OpISub(t_int, OpLoad(t_int, instance_index), OpLoad(t_int, base_instance)), +                        Type::Int};                  case 3: -                    return {OpLoad(t_uint, vertex_index), Type::Uint}; +                    return {OpISub(t_int, OpLoad(t_int, vertex_index), OpLoad(t_int, base_vertex)), +                            Type::Int};                  }                  UNIMPLEMENTED_MSG("Unmanaged TessCoordInstanceIDVertexID element={}", element);                  return {Constant(t_uint, 0U), Type::Uint}; @@ -2542,6 +2549,8 @@ private:      Id instance_index{};      Id vertex_index{}; +    Id base_instance{}; +    Id base_vertex{};      std::array<Id, Maxwell::NumRenderTargets> frag_colors{};      Id frag_depth{};      Id frag_coord{}; diff --git a/src/video_core/shader/decode/conversion.cpp b/src/video_core/shader/decode/conversion.cpp index 0eeb75559..6ead42070 100644 --- a/src/video_core/shader/decode/conversion.cpp +++ b/src/video_core/shader/decode/conversion.cpp @@ -83,14 +83,14 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {          const bool input_signed = instr.conversion.is_input_signed; -        if (instr.conversion.src_size == Register::Size::Byte) { -            const u32 offset = 
static_cast<u32>(instr.conversion.int_src.selector) * 8; -            if (offset > 0) { -                value = SignedOperation(OperationCode::ILogicalShiftRight, input_signed, -                                        std::move(value), Immediate(offset)); +        if (const u32 offset = static_cast<u32>(instr.conversion.int_src.selector); offset > 0) { +            ASSERT(instr.conversion.src_size == Register::Size::Byte || +                   instr.conversion.src_size == Register::Size::Short); +            if (instr.conversion.src_size == Register::Size::Short) { +                ASSERT(offset == 0 || offset == 2);              } -        } else { -            UNIMPLEMENTED_IF(instr.conversion.int_src.selector != 0); +            value = SignedOperation(OperationCode::ILogicalShiftRight, input_signed, +                                    std::move(value), Immediate(offset * 8));          }          value = ConvertIntegerSize(value, instr.conversion.src_size, input_signed); diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp index 351c8c2f1..542636430 100644 --- a/src/video_core/shader/decode/texture.cpp +++ b/src/video_core/shader/decode/texture.cpp @@ -522,68 +522,53 @@ Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type,                                 Node array, Node depth_compare, u32 bias_offset,                                 std::vector<Node> aoffi,                                 std::optional<Tegra::Shader::Register> bindless_reg) { -    const auto is_array = static_cast<bool>(array); -    const auto is_shadow = static_cast<bool>(depth_compare); +    const bool is_array = array != nullptr; +    const bool is_shadow = depth_compare != nullptr;      const bool is_bindless = bindless_reg.has_value(); -    UNIMPLEMENTED_IF_MSG((texture_type == TextureType::Texture3D && (is_array || is_shadow)) || -                             (texture_type == TextureType::TextureCube && is_array && 
is_shadow), -                         "This method is not supported."); +    UNIMPLEMENTED_IF(texture_type == TextureType::TextureCube && is_array && is_shadow); +    ASSERT_MSG(texture_type != TextureType::Texture3D || is_array || is_shadow, +               "Illegal texture type");      const SamplerInfo info{texture_type, is_array, is_shadow, false}; -    Node index_var{}; +    Node index_var;      const Sampler* sampler = is_bindless ? GetBindlessSampler(*bindless_reg, index_var, info)                                           : GetSampler(instr.sampler, info); -    Node4 values; -    if (sampler == nullptr) { -        for (u32 element = 0; element < values.size(); ++element) { -            values[element] = Immediate(0); -        } -        return values; +    if (!sampler) { +        return {Immediate(0), Immediate(0), Immediate(0), Immediate(0)};      }      const bool lod_needed = process_mode == TextureProcessMode::LZ ||                              process_mode == TextureProcessMode::LL ||                              process_mode == TextureProcessMode::LLA; - -    // LOD selection (either via bias or explicit textureLod) not supported in GL for -    // sampler2DArrayShadow and samplerCubeArrayShadow. -    const bool gl_lod_supported = -        !((texture_type == Tegra::Shader::TextureType::Texture2D && is_array && is_shadow) || -          (texture_type == Tegra::Shader::TextureType::TextureCube && is_array && is_shadow)); - -    const OperationCode read_method = -        (lod_needed && gl_lod_supported) ? OperationCode::TextureLod : OperationCode::Texture; - -    UNIMPLEMENTED_IF(process_mode != TextureProcessMode::None && !gl_lod_supported); +    const OperationCode opcode = lod_needed ? 
OperationCode::TextureLod : OperationCode::Texture;      Node bias;      Node lod; -    if (process_mode != TextureProcessMode::None && gl_lod_supported) { -        switch (process_mode) { -        case TextureProcessMode::LZ: -            lod = Immediate(0.0f); -            break; -        case TextureProcessMode::LB: -            // If present, lod or bias are always stored in the register -            // indexed by the gpr20 field with an offset depending on the -            // usage of the other registers -            bias = GetRegister(instr.gpr20.Value() + bias_offset); -            break; -        case TextureProcessMode::LL: -            lod = GetRegister(instr.gpr20.Value() + bias_offset); -            break; -        default: -            UNIMPLEMENTED_MSG("Unimplemented process mode={}", static_cast<u32>(process_mode)); -            break; -        } +    switch (process_mode) { +    case TextureProcessMode::None: +        break; +    case TextureProcessMode::LZ: +        lod = Immediate(0.0f); +        break; +    case TextureProcessMode::LB: +        // If present, lod or bias are always stored in the register indexed by the gpr20 field with +        // an offset depending on the usage of the other registers. 
+        bias = GetRegister(instr.gpr20.Value() + bias_offset); +        break; +    case TextureProcessMode::LL: +        lod = GetRegister(instr.gpr20.Value() + bias_offset); +        break; +    default: +        UNIMPLEMENTED_MSG("Unimplemented process mode={}", static_cast<u32>(process_mode)); +        break;      } +    Node4 values;      for (u32 element = 0; element < values.size(); ++element) { -        auto copy_coords = coords;          MetaTexture meta{*sampler, array, depth_compare, aoffi,    {}, {}, bias,                           lod,      {},    element,       index_var}; -        values[element] = Operation(read_method, meta, std::move(copy_coords)); +        values[element] = Operation(opcode, meta, coords);      }      return values; | 
