diff options
Diffstat (limited to 'src/video_core')
 -rw-r--r--  src/video_core/cdma_pusher.cpp                        |    1
 -rw-r--r--  src/video_core/cdma_pusher.h                          |    2
 -rw-r--r--  src/video_core/engines/maxwell_3d.h                   |    1
 -rw-r--r--  src/video_core/framebuffer_config.h                   |   20
 -rw-r--r--  src/video_core/gpu.cpp                                | 1220
 -rw-r--r--  src/video_core/gpu.h                                  |  217
 -rw-r--r--  src/video_core/gpu_thread.h                           |    3
 -rw-r--r--  src/video_core/query_cache.h                          |    1
 -rw-r--r--  src/video_core/renderer_opengl/gl_texture_cache.cpp   |    1
 -rw-r--r--  src/video_core/renderer_opengl/gl_texture_cache.h     |    1
 -rw-r--r--  src/video_core/renderer_vulkan/vk_texture_cache.cpp   |    1
 -rw-r--r--  src/video_core/renderer_vulkan/vk_texture_cache.h     |    2
 -rw-r--r--  src/video_core/shader_environment.cpp                 |    1
 -rw-r--r--  src/video_core/shader_environment.h                   |    4
 -rw-r--r--  src/video_core/texture_cache/image_view_info.cpp      |    1
 -rw-r--r--  src/video_core/texture_cache/texture_cache.h          |    5
 -rw-r--r--  src/video_core/texture_cache/texture_cache_base.h     |    8
17 files changed, 862 insertions, 627 deletions
| diff --git a/src/video_core/cdma_pusher.cpp b/src/video_core/cdma_pusher.cpp index 8b86ad050..a8c4b4415 100644 --- a/src/video_core/cdma_pusher.cpp +++ b/src/video_core/cdma_pusher.cpp @@ -24,6 +24,7 @@  #include "command_classes/vic.h"  #include "video_core/cdma_pusher.h"  #include "video_core/command_classes/nvdec_common.h" +#include "video_core/command_classes/sync_manager.h"  #include "video_core/engines/maxwell_3d.h"  #include "video_core/gpu.h"  #include "video_core/memory_manager.h" diff --git a/src/video_core/cdma_pusher.h b/src/video_core/cdma_pusher.h index 1bada44dd..87b49d6ea 100644 --- a/src/video_core/cdma_pusher.h +++ b/src/video_core/cdma_pusher.h @@ -9,13 +9,13 @@  #include "common/bit_field.h"  #include "common/common_types.h" -#include "video_core/command_classes/sync_manager.h"  namespace Tegra {  class GPU;  class Host1x;  class Nvdec; +class SyncptIncrManager;  class Vic;  enum class ChSubmissionMode : u32 { diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 7f4ca6282..f22342dfb 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -6,6 +6,7 @@  #include <array>  #include <bitset> +#include <cmath>  #include <limits>  #include <optional>  #include <type_traits> diff --git a/src/video_core/framebuffer_config.h b/src/video_core/framebuffer_config.h index b86c3a757..b1d455e30 100644 --- a/src/video_core/framebuffer_config.h +++ b/src/video_core/framebuffer_config.h @@ -4,8 +4,10 @@  #pragma once -namespace Tegra { +#include "common/common_types.h" +#include "common/math_util.h" +namespace Tegra {  /**   * Struct describing framebuffer configuration   */ @@ -16,6 +18,21 @@ struct FramebufferConfig {          B8G8R8A8_UNORM = 5,      }; +    enum class TransformFlags : u32 { +        /// No transform flags are set +        Unset = 0x00, +        /// Flip source image horizontally (around the vertical axis) +        FlipH = 0x01, +        /// Flip source image 
vertically (around the horizontal axis) +        FlipV = 0x02, +        /// Rotate source image 90 degrees clockwise +        Rotate90 = 0x04, +        /// Rotate source image 180 degrees +        Rotate180 = 0x03, +        /// Rotate source image 270 degrees clockwise +        Rotate270 = 0x07, +    }; +      VAddr address{};      u32 offset{};      u32 width{}; @@ -23,7 +40,6 @@ struct FramebufferConfig {      u32 stride{};      PixelFormat pixel_format{}; -    using TransformFlags = Service::NVFlinger::BufferQueue::BufferTransformFlags;      TransformFlags transform_flags{};      Common::Rectangle<int> crop_rect;  }; diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 2ae3639b5..520675873 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -2,540 +2,920 @@  // Licensed under GPLv2 or any later version  // Refer to the license.txt file included. +#include <array> +#include <atomic>  #include <chrono> +#include <condition_variable> +#include <list> +#include <memory>  #include "common/assert.h"  #include "common/microprofile.h"  #include "common/settings.h"  #include "core/core.h"  #include "core/core_timing.h" -#include "core/core_timing_util.h"  #include "core/frontend/emu_window.h"  #include "core/hardware_interrupt_manager.h" -#include "core/memory.h" +#include "core/hle/service/nvdrv/nvdata.h" +#include "core/hle/service/nvflinger/buffer_queue.h"  #include "core/perf_stats.h" +#include "video_core/cdma_pusher.h" +#include "video_core/dma_pusher.h"  #include "video_core/engines/fermi_2d.h"  #include "video_core/engines/kepler_compute.h"  #include "video_core/engines/kepler_memory.h"  #include "video_core/engines/maxwell_3d.h"  #include "video_core/engines/maxwell_dma.h"  #include "video_core/gpu.h" +#include "video_core/gpu_thread.h"  #include "video_core/memory_manager.h"  #include "video_core/renderer_base.h"  #include "video_core/shader_notify.h" -#include "video_core/video_core.h"  namespace Tegra {  
MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192)); -GPU::GPU(Core::System& system_, bool is_async_, bool use_nvdec_) -    : system{system_}, memory_manager{std::make_unique<Tegra::MemoryManager>(system)}, -      dma_pusher{std::make_unique<Tegra::DmaPusher>(system, *this)}, use_nvdec{use_nvdec_}, -      maxwell_3d{std::make_unique<Engines::Maxwell3D>(system, *memory_manager)}, -      fermi_2d{std::make_unique<Engines::Fermi2D>()}, -      kepler_compute{std::make_unique<Engines::KeplerCompute>(system, *memory_manager)}, -      maxwell_dma{std::make_unique<Engines::MaxwellDMA>(system, *memory_manager)}, -      kepler_memory{std::make_unique<Engines::KeplerMemory>(system, *memory_manager)}, -      shader_notify{std::make_unique<VideoCore::ShaderNotify>()}, is_async{is_async_}, -      gpu_thread{system_, is_async_} {} +struct GPU::Impl { +    explicit Impl(GPU& gpu_, Core::System& system_, bool is_async_, bool use_nvdec_) +        : gpu{gpu_}, system{system_}, memory_manager{std::make_unique<Tegra::MemoryManager>( +                                          system)}, +          dma_pusher{std::make_unique<Tegra::DmaPusher>(system, gpu)}, use_nvdec{use_nvdec_}, +          maxwell_3d{std::make_unique<Engines::Maxwell3D>(system, *memory_manager)}, +          fermi_2d{std::make_unique<Engines::Fermi2D>()}, +          kepler_compute{std::make_unique<Engines::KeplerCompute>(system, *memory_manager)}, +          maxwell_dma{std::make_unique<Engines::MaxwellDMA>(system, *memory_manager)}, +          kepler_memory{std::make_unique<Engines::KeplerMemory>(system, *memory_manager)}, +          shader_notify{std::make_unique<VideoCore::ShaderNotify>()}, is_async{is_async_}, +          gpu_thread{system_, is_async_} {} + +    ~Impl() = default; + +    /// Binds a renderer to the GPU. 
+    void BindRenderer(std::unique_ptr<VideoCore::RendererBase> renderer_) { +        renderer = std::move(renderer_); +        rasterizer = renderer->ReadRasterizer(); + +        memory_manager->BindRasterizer(rasterizer); +        maxwell_3d->BindRasterizer(rasterizer); +        fermi_2d->BindRasterizer(rasterizer); +        kepler_compute->BindRasterizer(rasterizer); +        maxwell_dma->BindRasterizer(rasterizer); +    } + +    /// Calls a GPU method. +    void CallMethod(const GPU::MethodCall& method_call) { +        LOG_TRACE(HW_GPU, "Processing method {:08X} on subchannel {}", method_call.method, +                  method_call.subchannel); + +        ASSERT(method_call.subchannel < bound_engines.size()); + +        if (ExecuteMethodOnEngine(method_call.method)) { +            CallEngineMethod(method_call); +        } else { +            CallPullerMethod(method_call); +        } +    } + +    /// Calls a GPU multivalue method. +    void CallMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount, +                         u32 methods_pending) { +        LOG_TRACE(HW_GPU, "Processing method {:08X} on subchannel {}", method, subchannel); + +        ASSERT(subchannel < bound_engines.size()); + +        if (ExecuteMethodOnEngine(method)) { +            CallEngineMultiMethod(method, subchannel, base_start, amount, methods_pending); +        } else { +            for (std::size_t i = 0; i < amount; i++) { +                CallPullerMethod(GPU::MethodCall{ +                    method, +                    base_start[i], +                    subchannel, +                    methods_pending - static_cast<u32>(i), +                }); +            } +        } +    } + +    /// Flush all current written commands into the host GPU for execution. +    void FlushCommands() { +        rasterizer->FlushCommands(); +    } + +    /// Synchronizes CPU writes with Host GPU memory. 
+    void SyncGuestHost() { +        rasterizer->SyncGuestHost(); +    } + +    /// Signal the ending of command list. +    void OnCommandListEnd() { +        if (is_async) { +            // This command only applies to asynchronous GPU mode +            gpu_thread.OnCommandListEnd(); +        } +    } + +    /// Request a host GPU memory flush from the CPU. +    [[nodiscard]] u64 RequestFlush(VAddr addr, std::size_t size) { +        std::unique_lock lck{flush_request_mutex}; +        const u64 fence = ++last_flush_fence; +        flush_requests.emplace_back(fence, addr, size); +        return fence; +    } + +    /// Obtains current flush request fence id. +    [[nodiscard]] u64 CurrentFlushRequestFence() const { +        return current_flush_fence.load(std::memory_order_relaxed); +    } + +    /// Tick pending requests within the GPU. +    void TickWork() { +        std::unique_lock lck{flush_request_mutex}; +        while (!flush_requests.empty()) { +            auto& request = flush_requests.front(); +            const u64 fence = request.fence; +            const VAddr addr = request.addr; +            const std::size_t size = request.size; +            flush_requests.pop_front(); +            flush_request_mutex.unlock(); +            rasterizer->FlushRegion(addr, size); +            current_flush_fence.store(fence); +            flush_request_mutex.lock(); +        } +    } + +    /// Returns a reference to the Maxwell3D GPU engine. +    [[nodiscard]] Engines::Maxwell3D& Maxwell3D() { +        return *maxwell_3d; +    } + +    /// Returns a const reference to the Maxwell3D GPU engine. +    [[nodiscard]] const Engines::Maxwell3D& Maxwell3D() const { +        return *maxwell_3d; +    } + +    /// Returns a reference to the KeplerCompute GPU engine. +    [[nodiscard]] Engines::KeplerCompute& KeplerCompute() { +        return *kepler_compute; +    } + +    /// Returns a reference to the KeplerCompute GPU engine. 
+    [[nodiscard]] const Engines::KeplerCompute& KeplerCompute() const { +        return *kepler_compute; +    } + +    /// Returns a reference to the GPU memory manager. +    [[nodiscard]] Tegra::MemoryManager& MemoryManager() { +        return *memory_manager; +    } + +    /// Returns a const reference to the GPU memory manager. +    [[nodiscard]] const Tegra::MemoryManager& MemoryManager() const { +        return *memory_manager; +    } + +    /// Returns a reference to the GPU DMA pusher. +    [[nodiscard]] Tegra::DmaPusher& DmaPusher() { +        return *dma_pusher; +    } + +    /// Returns a const reference to the GPU DMA pusher. +    [[nodiscard]] const Tegra::DmaPusher& DmaPusher() const { +        return *dma_pusher; +    } + +    /// Returns a reference to the GPU CDMA pusher. +    [[nodiscard]] Tegra::CDmaPusher& CDmaPusher() { +        return *cdma_pusher; +    } + +    /// Returns a const reference to the GPU CDMA pusher. +    [[nodiscard]] const Tegra::CDmaPusher& CDmaPusher() const { +        return *cdma_pusher; +    } + +    /// Returns a reference to the underlying renderer. +    [[nodiscard]] VideoCore::RendererBase& Renderer() { +        return *renderer; +    } + +    /// Returns a const reference to the underlying renderer. +    [[nodiscard]] const VideoCore::RendererBase& Renderer() const { +        return *renderer; +    } + +    /// Returns a reference to the shader notifier. +    [[nodiscard]] VideoCore::ShaderNotify& ShaderNotify() { +        return *shader_notify; +    } + +    /// Returns a const reference to the shader notifier. +    [[nodiscard]] const VideoCore::ShaderNotify& ShaderNotify() const { +        return *shader_notify; +    } + +    /// Allows the CPU/NvFlinger to wait on the GPU before presenting a frame. 
+    void WaitFence(u32 syncpoint_id, u32 value) { +        // Synced GPU, is always in sync +        if (!is_async) { +            return; +        } +        if (syncpoint_id == UINT32_MAX) { +            // TODO: Research what this does. +            LOG_ERROR(HW_GPU, "Waiting for syncpoint -1 not implemented"); +            return; +        } +        MICROPROFILE_SCOPE(GPU_wait); +        std::unique_lock lock{sync_mutex}; +        sync_cv.wait(lock, [=, this] { +            if (shutting_down.load(std::memory_order_relaxed)) { +                // We're shutting down, ensure no threads continue to wait for the next syncpoint +                return true; +            } +            return syncpoints.at(syncpoint_id).load() >= value; +        }); +    } + +    void IncrementSyncPoint(u32 syncpoint_id) { +        auto& syncpoint = syncpoints.at(syncpoint_id); +        syncpoint++; +        std::lock_guard lock{sync_mutex}; +        sync_cv.notify_all(); +        auto& interrupt = syncpt_interrupts.at(syncpoint_id); +        if (!interrupt.empty()) { +            u32 value = syncpoint.load(); +            auto it = interrupt.begin(); +            while (it != interrupt.end()) { +                if (value >= *it) { +                    TriggerCpuInterrupt(syncpoint_id, *it); +                    it = interrupt.erase(it); +                    continue; +                } +                it++; +            } +        } +    } + +    [[nodiscard]] u32 GetSyncpointValue(u32 syncpoint_id) const { +        return syncpoints.at(syncpoint_id).load(); +    } + +    void RegisterSyncptInterrupt(u32 syncpoint_id, u32 value) { +        auto& interrupt = syncpt_interrupts.at(syncpoint_id); +        bool contains = std::any_of(interrupt.begin(), interrupt.end(), +                                    [value](u32 in_value) { return in_value == value; }); +        if (contains) { +            return; +        } +        interrupt.emplace_back(value); +    } + +    [[nodiscard]] 
bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value) { +        std::lock_guard lock{sync_mutex}; +        auto& interrupt = syncpt_interrupts.at(syncpoint_id); +        const auto iter = +            std::find_if(interrupt.begin(), interrupt.end(), +                         [value](u32 interrupt_value) { return value == interrupt_value; }); + +        if (iter == interrupt.end()) { +            return false; +        } +        interrupt.erase(iter); +        return true; +    } + +    [[nodiscard]] u64 GetTicks() const { +        // This values were reversed engineered by fincs from NVN +        // The gpu clock is reported in units of 385/625 nanoseconds +        constexpr u64 gpu_ticks_num = 384; +        constexpr u64 gpu_ticks_den = 625; + +        u64 nanoseconds = system.CoreTiming().GetGlobalTimeNs().count(); +        if (Settings::values.use_fast_gpu_time.GetValue()) { +            nanoseconds /= 256; +        } +        const u64 nanoseconds_num = nanoseconds / gpu_ticks_den; +        const u64 nanoseconds_rem = nanoseconds % gpu_ticks_den; +        return nanoseconds_num * gpu_ticks_num + (nanoseconds_rem * gpu_ticks_num) / gpu_ticks_den; +    } + +    [[nodiscard]] std::unique_lock<std::mutex> LockSync() { +        return std::unique_lock{sync_mutex}; +    } + +    [[nodiscard]] bool IsAsync() const { +        return is_async; +    } + +    [[nodiscard]] bool UseNvdec() const { +        return use_nvdec; +    } + +    void RendererFrameEndNotify() { +        system.GetPerfStats().EndGameFrame(); +    } + +    /// Performs any additional setup necessary in order to begin GPU emulation. +    /// This can be used to launch any necessary threads and register any necessary +    /// core timing events. 
+    void Start() { +        gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher); +        cpu_context = renderer->GetRenderWindow().CreateSharedContext(); +        cpu_context->MakeCurrent(); +    } + +    /// Obtain the CPU Context +    void ObtainContext() { +        cpu_context->MakeCurrent(); +    } + +    /// Release the CPU Context +    void ReleaseContext() { +        cpu_context->DoneCurrent(); +    } + +    /// Push GPU command entries to be processed +    void PushGPUEntries(Tegra::CommandList&& entries) { +        gpu_thread.SubmitList(std::move(entries)); +    } + +    /// Push GPU command buffer entries to be processed +    void PushCommandBuffer(Tegra::ChCommandHeaderList& entries) { +        if (!use_nvdec) { +            return; +        } + +        if (!cdma_pusher) { +            cdma_pusher = std::make_unique<Tegra::CDmaPusher>(gpu); +        } + +        // SubmitCommandBuffer would make the nvdec operations async, this is not currently working +        // TODO(ameerj): RE proper async nvdec operation +        // gpu_thread.SubmitCommandBuffer(std::move(entries)); + +        cdma_pusher->ProcessEntries(std::move(entries)); +    } + +    /// Frees the CDMAPusher instance to free up resources +    void ClearCdmaInstance() { +        cdma_pusher.reset(); +    } + +    /// Swap buffers (render frame) +    void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { +        gpu_thread.SwapBuffers(framebuffer); +    } + +    /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory +    void FlushRegion(VAddr addr, u64 size) { +        gpu_thread.FlushRegion(addr, size); +    } + +    /// Notify rasterizer that any caches of the specified region should be invalidated +    void InvalidateRegion(VAddr addr, u64 size) { +        gpu_thread.InvalidateRegion(addr, size); +    } + +    /// Notify rasterizer that any caches of the specified region should be flushed and invalidated +    void 
FlushAndInvalidateRegion(VAddr addr, u64 size) { +        gpu_thread.FlushAndInvalidateRegion(addr, size); +    } + +    void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const { +        auto& interrupt_manager = system.InterruptManager(); +        interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value); +    } + +    void ProcessBindMethod(const GPU::MethodCall& method_call) { +        // Bind the current subchannel to the desired engine id. +        LOG_DEBUG(HW_GPU, "Binding subchannel {} to engine {}", method_call.subchannel, +                  method_call.argument); +        const auto engine_id = static_cast<EngineID>(method_call.argument); +        bound_engines[method_call.subchannel] = static_cast<EngineID>(engine_id); +        switch (engine_id) { +        case EngineID::FERMI_TWOD_A: +            dma_pusher->BindSubchannel(fermi_2d.get(), method_call.subchannel); +            break; +        case EngineID::MAXWELL_B: +            dma_pusher->BindSubchannel(maxwell_3d.get(), method_call.subchannel); +            break; +        case EngineID::KEPLER_COMPUTE_B: +            dma_pusher->BindSubchannel(kepler_compute.get(), method_call.subchannel); +            break; +        case EngineID::MAXWELL_DMA_COPY_A: +            dma_pusher->BindSubchannel(maxwell_dma.get(), method_call.subchannel); +            break; +        case EngineID::KEPLER_INLINE_TO_MEMORY_B: +            dma_pusher->BindSubchannel(kepler_memory.get(), method_call.subchannel); +            break; +        default: +            UNIMPLEMENTED_MSG("Unimplemented engine {:04X}", engine_id); +        } +    } + +    void ProcessFenceActionMethod() { +        switch (regs.fence_action.op) { +        case GPU::FenceOperation::Acquire: +            WaitFence(regs.fence_action.syncpoint_id, regs.fence_value); +            break; +        case GPU::FenceOperation::Increment: +            IncrementSyncPoint(regs.fence_action.syncpoint_id); +            break; +        default: +            
UNIMPLEMENTED_MSG("Unimplemented operation {}", regs.fence_action.op.Value()); +        } +    } + +    void ProcessWaitForInterruptMethod() { +        // TODO(bunnei) ImplementMe +        LOG_WARNING(HW_GPU, "(STUBBED) called"); +    } + +    void ProcessSemaphoreTriggerMethod() { +        const auto semaphoreOperationMask = 0xF; +        const auto op = +            static_cast<GpuSemaphoreOperation>(regs.semaphore_trigger & semaphoreOperationMask); +        if (op == GpuSemaphoreOperation::WriteLong) { +            struct Block { +                u32 sequence; +                u32 zeros = 0; +                u64 timestamp; +            }; + +            Block block{}; +            block.sequence = regs.semaphore_sequence; +            // TODO(Kmather73): Generate a real GPU timestamp and write it here instead of +            // CoreTiming +            block.timestamp = GetTicks(); +            memory_manager->WriteBlock(regs.semaphore_address.SemaphoreAddress(), &block, +                                       sizeof(block)); +        } else { +            const u32 word{memory_manager->Read<u32>(regs.semaphore_address.SemaphoreAddress())}; +            if ((op == GpuSemaphoreOperation::AcquireEqual && word == regs.semaphore_sequence) || +                (op == GpuSemaphoreOperation::AcquireGequal && +                 static_cast<s32>(word - regs.semaphore_sequence) > 0) || +                (op == GpuSemaphoreOperation::AcquireMask && (word & regs.semaphore_sequence))) { +                // Nothing to do in this case +            } else { +                regs.acquire_source = true; +                regs.acquire_value = regs.semaphore_sequence; +                if (op == GpuSemaphoreOperation::AcquireEqual) { +                    regs.acquire_active = true; +                    regs.acquire_mode = false; +                } else if (op == GpuSemaphoreOperation::AcquireGequal) { +                    regs.acquire_active = true; +                    regs.acquire_mode 
= true; +                } else if (op == GpuSemaphoreOperation::AcquireMask) { +                    // TODO(kemathe) The acquire mask operation waits for a value that, ANDed with +                    // semaphore_sequence, gives a non-0 result +                    LOG_ERROR(HW_GPU, "Invalid semaphore operation AcquireMask not implemented"); +                } else { +                    LOG_ERROR(HW_GPU, "Invalid semaphore operation"); +                } +            } +        } +    } + +    void ProcessSemaphoreRelease() { +        memory_manager->Write<u32>(regs.semaphore_address.SemaphoreAddress(), +                                   regs.semaphore_release); +    } + +    void ProcessSemaphoreAcquire() { +        const u32 word = memory_manager->Read<u32>(regs.semaphore_address.SemaphoreAddress()); +        const auto value = regs.semaphore_acquire; +        if (word != value) { +            regs.acquire_active = true; +            regs.acquire_value = value; +            // TODO(kemathe73) figure out how to do the acquire_timeout +            regs.acquire_mode = false; +            regs.acquire_source = false; +        } +    } + +    /// Calls a GPU puller method. 
+    void CallPullerMethod(const GPU::MethodCall& method_call) { +        regs.reg_array[method_call.method] = method_call.argument; +        const auto method = static_cast<BufferMethods>(method_call.method); + +        switch (method) { +        case BufferMethods::BindObject: { +            ProcessBindMethod(method_call); +            break; +        } +        case BufferMethods::Nop: +        case BufferMethods::SemaphoreAddressHigh: +        case BufferMethods::SemaphoreAddressLow: +        case BufferMethods::SemaphoreSequence: +        case BufferMethods::UnkCacheFlush: +        case BufferMethods::WrcacheFlush: +        case BufferMethods::FenceValue: +            break; +        case BufferMethods::RefCnt: +            rasterizer->SignalReference(); +            break; +        case BufferMethods::FenceAction: +            ProcessFenceActionMethod(); +            break; +        case BufferMethods::WaitForInterrupt: +            ProcessWaitForInterruptMethod(); +            break; +        case BufferMethods::SemaphoreTrigger: { +            ProcessSemaphoreTriggerMethod(); +            break; +        } +        case BufferMethods::NotifyIntr: { +            // TODO(Kmather73): Research and implement this method. +            LOG_ERROR(HW_GPU, "Special puller engine method NotifyIntr not implemented"); +            break; +        } +        case BufferMethods::Unk28: { +            // TODO(Kmather73): Research and implement this method. +            LOG_ERROR(HW_GPU, "Special puller engine method Unk28 not implemented"); +            break; +        } +        case BufferMethods::SemaphoreAcquire: { +            ProcessSemaphoreAcquire(); +            break; +        } +        case BufferMethods::SemaphoreRelease: { +            ProcessSemaphoreRelease(); +            break; +        } +        case BufferMethods::Yield: { +            // TODO(Kmather73): Research and implement this method. 
+            LOG_ERROR(HW_GPU, "Special puller engine method Yield not implemented"); +            break; +        } +        default: +            LOG_ERROR(HW_GPU, "Special puller engine method {:X} not implemented", method); +            break; +        } +    } + +    /// Calls a GPU engine method. +    void CallEngineMethod(const GPU::MethodCall& method_call) { +        const EngineID engine = bound_engines[method_call.subchannel]; + +        switch (engine) { +        case EngineID::FERMI_TWOD_A: +            fermi_2d->CallMethod(method_call.method, method_call.argument, +                                 method_call.IsLastCall()); +            break; +        case EngineID::MAXWELL_B: +            maxwell_3d->CallMethod(method_call.method, method_call.argument, +                                   method_call.IsLastCall()); +            break; +        case EngineID::KEPLER_COMPUTE_B: +            kepler_compute->CallMethod(method_call.method, method_call.argument, +                                       method_call.IsLastCall()); +            break; +        case EngineID::MAXWELL_DMA_COPY_A: +            maxwell_dma->CallMethod(method_call.method, method_call.argument, +                                    method_call.IsLastCall()); +            break; +        case EngineID::KEPLER_INLINE_TO_MEMORY_B: +            kepler_memory->CallMethod(method_call.method, method_call.argument, +                                      method_call.IsLastCall()); +            break; +        default: +            UNIMPLEMENTED_MSG("Unimplemented engine"); +        } +    } + +    /// Calls a GPU engine multivalue method. 
+    void CallEngineMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount, +                               u32 methods_pending) { +        const EngineID engine = bound_engines[subchannel]; + +        switch (engine) { +        case EngineID::FERMI_TWOD_A: +            fermi_2d->CallMultiMethod(method, base_start, amount, methods_pending); +            break; +        case EngineID::MAXWELL_B: +            maxwell_3d->CallMultiMethod(method, base_start, amount, methods_pending); +            break; +        case EngineID::KEPLER_COMPUTE_B: +            kepler_compute->CallMultiMethod(method, base_start, amount, methods_pending); +            break; +        case EngineID::MAXWELL_DMA_COPY_A: +            maxwell_dma->CallMultiMethod(method, base_start, amount, methods_pending); +            break; +        case EngineID::KEPLER_INLINE_TO_MEMORY_B: +            kepler_memory->CallMultiMethod(method, base_start, amount, methods_pending); +            break; +        default: +            UNIMPLEMENTED_MSG("Unimplemented engine"); +        } +    } + +    /// Determines where the method should be executed. 
+    [[nodiscard]] bool ExecuteMethodOnEngine(u32 method) { +        const auto buffer_method = static_cast<BufferMethods>(method); +        return buffer_method >= BufferMethods::NonPullerMethods; +    } + +    struct Regs { +        static constexpr size_t NUM_REGS = 0x40; + +        union { +            struct { +                INSERT_PADDING_WORDS_NOINIT(0x4); +                struct { +                    u32 address_high; +                    u32 address_low; + +                    [[nodiscard]] GPUVAddr SemaphoreAddress() const { +                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | +                                                     address_low); +                    } +                } semaphore_address; + +                u32 semaphore_sequence; +                u32 semaphore_trigger; +                INSERT_PADDING_WORDS_NOINIT(0xC); + +                // The pusher and the puller share the reference counter, the pusher only has read +                // access +                u32 reference_count; +                INSERT_PADDING_WORDS_NOINIT(0x5); + +                u32 semaphore_acquire; +                u32 semaphore_release; +                u32 fence_value; +                GPU::FenceAction fence_action; +                INSERT_PADDING_WORDS_NOINIT(0xE2); + +                // Puller state +                u32 acquire_mode; +                u32 acquire_source; +                u32 acquire_active; +                u32 acquire_timeout; +                u32 acquire_value; +            }; +            std::array<u32, NUM_REGS> reg_array; +        }; +    } regs{}; + +    GPU& gpu; +    Core::System& system; +    std::unique_ptr<Tegra::MemoryManager> memory_manager; +    std::unique_ptr<Tegra::DmaPusher> dma_pusher; +    std::unique_ptr<Tegra::CDmaPusher> cdma_pusher; +    std::unique_ptr<VideoCore::RendererBase> renderer; +    VideoCore::RasterizerInterface* rasterizer = nullptr; +    const bool 
use_nvdec; + +    /// Mapping of command subchannels to their bound engine ids +    std::array<EngineID, 8> bound_engines{}; +    /// 3D engine +    std::unique_ptr<Engines::Maxwell3D> maxwell_3d; +    /// 2D engine +    std::unique_ptr<Engines::Fermi2D> fermi_2d; +    /// Compute engine +    std::unique_ptr<Engines::KeplerCompute> kepler_compute; +    /// DMA engine +    std::unique_ptr<Engines::MaxwellDMA> maxwell_dma; +    /// Inline memory engine +    std::unique_ptr<Engines::KeplerMemory> kepler_memory; +    /// Shader build notifier +    std::unique_ptr<VideoCore::ShaderNotify> shader_notify; +    /// When true, we are about to shut down emulation session, so terminate outstanding tasks +    std::atomic_bool shutting_down{}; + +    std::array<std::atomic<u32>, Service::Nvidia::MaxSyncPoints> syncpoints{}; + +    std::array<std::list<u32>, Service::Nvidia::MaxSyncPoints> syncpt_interrupts; + +    std::mutex sync_mutex; +    std::mutex device_mutex; + +    std::condition_variable sync_cv; + +    struct FlushRequest { +        explicit FlushRequest(u64 fence_, VAddr addr_, std::size_t size_) +            : fence{fence_}, addr{addr_}, size{size_} {} +        u64 fence; +        VAddr addr; +        std::size_t size; +    }; + +    std::list<FlushRequest> flush_requests; +    std::atomic<u64> current_flush_fence{}; +    u64 last_flush_fence{}; +    std::mutex flush_request_mutex; + +    const bool is_async; + +    VideoCommon::GPUThread::ThreadManager gpu_thread; +    std::unique_ptr<Core::Frontend::GraphicsContext> cpu_context; + +#define ASSERT_REG_POSITION(field_name, position)                                                  \ +    static_assert(offsetof(Regs, field_name) == position * 4,                                      \ +                  "Field " #field_name " has invalid position") + +    ASSERT_REG_POSITION(semaphore_address, 0x4); +    ASSERT_REG_POSITION(semaphore_sequence, 0x6); +    ASSERT_REG_POSITION(semaphore_trigger, 0x7); +    
ASSERT_REG_POSITION(reference_count, 0x14); +    ASSERT_REG_POSITION(semaphore_acquire, 0x1A); +    ASSERT_REG_POSITION(semaphore_release, 0x1B); +    ASSERT_REG_POSITION(fence_value, 0x1C); +    ASSERT_REG_POSITION(fence_action, 0x1D); + +    ASSERT_REG_POSITION(acquire_mode, 0x100); +    ASSERT_REG_POSITION(acquire_source, 0x101); +    ASSERT_REG_POSITION(acquire_active, 0x102); +    ASSERT_REG_POSITION(acquire_timeout, 0x103); +    ASSERT_REG_POSITION(acquire_value, 0x104); + +#undef ASSERT_REG_POSITION + +    enum class GpuSemaphoreOperation { +        AcquireEqual = 0x1, +        WriteLong = 0x2, +        AcquireGequal = 0x4, +        AcquireMask = 0x8, +    }; +}; + +GPU::GPU(Core::System& system, bool is_async, bool use_nvdec) +    : impl{std::make_unique<Impl>(*this, system, is_async, use_nvdec)} {}  GPU::~GPU() = default; -void GPU::BindRenderer(std::unique_ptr<VideoCore::RendererBase> renderer_) { -    renderer = std::move(renderer_); -    rasterizer = renderer->ReadRasterizer(); +void GPU::BindRenderer(std::unique_ptr<VideoCore::RendererBase> renderer) { +    impl->BindRenderer(std::move(renderer)); +} -    memory_manager->BindRasterizer(rasterizer); -    maxwell_3d->BindRasterizer(rasterizer); -    fermi_2d->BindRasterizer(rasterizer); -    kepler_compute->BindRasterizer(rasterizer); -    maxwell_dma->BindRasterizer(rasterizer); +void GPU::CallMethod(const MethodCall& method_call) { +    impl->CallMethod(method_call);  } -Engines::Maxwell3D& GPU::Maxwell3D() { -    return *maxwell_3d; +void GPU::CallMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount, +                          u32 methods_pending) { +    impl->CallMultiMethod(method, subchannel, base_start, amount, methods_pending);  } -const Engines::Maxwell3D& GPU::Maxwell3D() const { -    return *maxwell_3d; +void GPU::FlushCommands() { +    impl->FlushCommands();  } -Engines::KeplerCompute& GPU::KeplerCompute() { -    return *kepler_compute; +void GPU::SyncGuestHost() { +    
impl->SyncGuestHost();  } -const Engines::KeplerCompute& GPU::KeplerCompute() const { -    return *kepler_compute; +void GPU::OnCommandListEnd() { +    impl->OnCommandListEnd();  } -MemoryManager& GPU::MemoryManager() { -    return *memory_manager; +u64 GPU::RequestFlush(VAddr addr, std::size_t size) { +    return impl->RequestFlush(addr, size);  } -const MemoryManager& GPU::MemoryManager() const { -    return *memory_manager; +u64 GPU::CurrentFlushRequestFence() const { +    return impl->CurrentFlushRequestFence();  } -DmaPusher& GPU::DmaPusher() { -    return *dma_pusher; +void GPU::TickWork() { +    impl->TickWork();  } -Tegra::CDmaPusher& GPU::CDmaPusher() { -    return *cdma_pusher; +Engines::Maxwell3D& GPU::Maxwell3D() { +    return impl->Maxwell3D();  } -const DmaPusher& GPU::DmaPusher() const { -    return *dma_pusher; +const Engines::Maxwell3D& GPU::Maxwell3D() const { +    return impl->Maxwell3D();  } -const Tegra::CDmaPusher& GPU::CDmaPusher() const { -    return *cdma_pusher; +Engines::KeplerCompute& GPU::KeplerCompute() { +    return impl->KeplerCompute();  } -void GPU::WaitFence(u32 syncpoint_id, u32 value) { -    // Synced GPU, is always in sync -    if (!is_async) { -        return; -    } -    if (syncpoint_id == UINT32_MAX) { -        // TODO: Research what this does. 
-        LOG_ERROR(HW_GPU, "Waiting for syncpoint -1 not implemented"); -        return; -    } -    MICROPROFILE_SCOPE(GPU_wait); -    std::unique_lock lock{sync_mutex}; -    sync_cv.wait(lock, [=, this] { -        if (shutting_down.load(std::memory_order_relaxed)) { -            // We're shutting down, ensure no threads continue to wait for the next syncpoint -            return true; -        } -        return syncpoints.at(syncpoint_id).load() >= value; -    }); -} - -void GPU::IncrementSyncPoint(const u32 syncpoint_id) { -    auto& syncpoint = syncpoints.at(syncpoint_id); -    syncpoint++; -    std::lock_guard lock{sync_mutex}; -    sync_cv.notify_all(); -    auto& interrupt = syncpt_interrupts.at(syncpoint_id); -    if (!interrupt.empty()) { -        u32 value = syncpoint.load(); -        auto it = interrupt.begin(); -        while (it != interrupt.end()) { -            if (value >= *it) { -                TriggerCpuInterrupt(syncpoint_id, *it); -                it = interrupt.erase(it); -                continue; -            } -            it++; -        } -    } +const Engines::KeplerCompute& GPU::KeplerCompute() const { +    return impl->KeplerCompute();  } -u32 GPU::GetSyncpointValue(const u32 syncpoint_id) const { -    return syncpoints.at(syncpoint_id).load(); +Tegra::MemoryManager& GPU::MemoryManager() { +    return impl->MemoryManager();  } -void GPU::RegisterSyncptInterrupt(const u32 syncpoint_id, const u32 value) { -    auto& interrupt = syncpt_interrupts.at(syncpoint_id); -    bool contains = std::any_of(interrupt.begin(), interrupt.end(), -                                [value](u32 in_value) { return in_value == value; }); -    if (contains) { -        return; -    } -    interrupt.emplace_back(value); +const Tegra::MemoryManager& GPU::MemoryManager() const { +    return impl->MemoryManager();  } -bool GPU::CancelSyncptInterrupt(const u32 syncpoint_id, const u32 value) { -    std::lock_guard lock{sync_mutex}; -    auto& interrupt = 
syncpt_interrupts.at(syncpoint_id); -    const auto iter = -        std::find_if(interrupt.begin(), interrupt.end(), -                     [value](u32 interrupt_value) { return value == interrupt_value; }); +Tegra::DmaPusher& GPU::DmaPusher() { +    return impl->DmaPusher(); +} -    if (iter == interrupt.end()) { -        return false; -    } -    interrupt.erase(iter); -    return true; +const Tegra::DmaPusher& GPU::DmaPusher() const { +    return impl->DmaPusher();  } -u64 GPU::RequestFlush(VAddr addr, std::size_t size) { -    std::unique_lock lck{flush_request_mutex}; -    const u64 fence = ++last_flush_fence; -    flush_requests.emplace_back(fence, addr, size); -    return fence; +Tegra::CDmaPusher& GPU::CDmaPusher() { +    return impl->CDmaPusher();  } -void GPU::TickWork() { -    std::unique_lock lck{flush_request_mutex}; -    while (!flush_requests.empty()) { -        auto& request = flush_requests.front(); -        const u64 fence = request.fence; -        const VAddr addr = request.addr; -        const std::size_t size = request.size; -        flush_requests.pop_front(); -        flush_request_mutex.unlock(); -        rasterizer->FlushRegion(addr, size); -        current_flush_fence.store(fence); -        flush_request_mutex.lock(); -    } +const Tegra::CDmaPusher& GPU::CDmaPusher() const { +    return impl->CDmaPusher();  } -u64 GPU::GetTicks() const { -    // This values were reversed engineered by fincs from NVN -    // The gpu clock is reported in units of 385/625 nanoseconds -    constexpr u64 gpu_ticks_num = 384; -    constexpr u64 gpu_ticks_den = 625; +VideoCore::RendererBase& GPU::Renderer() { +    return impl->Renderer(); +} -    u64 nanoseconds = system.CoreTiming().GetGlobalTimeNs().count(); -    if (Settings::values.use_fast_gpu_time.GetValue()) { -        nanoseconds /= 256; -    } -    const u64 nanoseconds_num = nanoseconds / gpu_ticks_den; -    const u64 nanoseconds_rem = nanoseconds % gpu_ticks_den; -    return nanoseconds_num * 
gpu_ticks_num + (nanoseconds_rem * gpu_ticks_num) / gpu_ticks_den; +const VideoCore::RendererBase& GPU::Renderer() const { +    return impl->Renderer();  } -void GPU::RendererFrameEndNotify() { -    system.GetPerfStats().EndGameFrame(); +VideoCore::ShaderNotify& GPU::ShaderNotify() { +    return impl->ShaderNotify();  } -void GPU::FlushCommands() { -    rasterizer->FlushCommands(); +const VideoCore::ShaderNotify& GPU::ShaderNotify() const { +    return impl->ShaderNotify();  } -void GPU::SyncGuestHost() { -    rasterizer->SyncGuestHost(); +void GPU::WaitFence(u32 syncpoint_id, u32 value) { +    impl->WaitFence(syncpoint_id, value);  } -enum class GpuSemaphoreOperation { -    AcquireEqual = 0x1, -    WriteLong = 0x2, -    AcquireGequal = 0x4, -    AcquireMask = 0x8, -}; +void GPU::IncrementSyncPoint(u32 syncpoint_id) { +    impl->IncrementSyncPoint(syncpoint_id); +} -void GPU::CallMethod(const MethodCall& method_call) { -    LOG_TRACE(HW_GPU, "Processing method {:08X} on subchannel {}", method_call.method, -              method_call.subchannel); +u32 GPU::GetSyncpointValue(u32 syncpoint_id) const { +    return impl->GetSyncpointValue(syncpoint_id); +} -    ASSERT(method_call.subchannel < bound_engines.size()); +void GPU::RegisterSyncptInterrupt(u32 syncpoint_id, u32 value) { +    impl->RegisterSyncptInterrupt(syncpoint_id, value); +} -    if (ExecuteMethodOnEngine(method_call.method)) { -        CallEngineMethod(method_call); -    } else { -        CallPullerMethod(method_call); -    } +bool GPU::CancelSyncptInterrupt(u32 syncpoint_id, u32 value) { +    return impl->CancelSyncptInterrupt(syncpoint_id, value);  } -void GPU::CallMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount, -                          u32 methods_pending) { -    LOG_TRACE(HW_GPU, "Processing method {:08X} on subchannel {}", method, subchannel); - -    ASSERT(subchannel < bound_engines.size()); - -    if (ExecuteMethodOnEngine(method)) { -        
CallEngineMultiMethod(method, subchannel, base_start, amount, methods_pending); -    } else { -        for (std::size_t i = 0; i < amount; i++) { -            CallPullerMethod(MethodCall{ -                method, -                base_start[i], -                subchannel, -                methods_pending - static_cast<u32>(i), -            }); -        } -    } +u64 GPU::GetTicks() const { +    return impl->GetTicks();  } -bool GPU::ExecuteMethodOnEngine(u32 method) { -    const auto buffer_method = static_cast<BufferMethods>(method); -    return buffer_method >= BufferMethods::NonPullerMethods; -} - -void GPU::CallPullerMethod(const MethodCall& method_call) { -    regs.reg_array[method_call.method] = method_call.argument; -    const auto method = static_cast<BufferMethods>(method_call.method); - -    switch (method) { -    case BufferMethods::BindObject: { -        ProcessBindMethod(method_call); -        break; -    } -    case BufferMethods::Nop: -    case BufferMethods::SemaphoreAddressHigh: -    case BufferMethods::SemaphoreAddressLow: -    case BufferMethods::SemaphoreSequence: -    case BufferMethods::UnkCacheFlush: -    case BufferMethods::WrcacheFlush: -    case BufferMethods::FenceValue: -        break; -    case BufferMethods::RefCnt: -        rasterizer->SignalReference(); -        break; -    case BufferMethods::FenceAction: -        ProcessFenceActionMethod(); -        break; -    case BufferMethods::WaitForInterrupt: -        ProcessWaitForInterruptMethod(); -        break; -    case BufferMethods::SemaphoreTrigger: { -        ProcessSemaphoreTriggerMethod(); -        break; -    } -    case BufferMethods::NotifyIntr: { -        // TODO(Kmather73): Research and implement this method. -        LOG_ERROR(HW_GPU, "Special puller engine method NotifyIntr not implemented"); -        break; -    } -    case BufferMethods::Unk28: { -        // TODO(Kmather73): Research and implement this method. 
-        LOG_ERROR(HW_GPU, "Special puller engine method Unk28 not implemented"); -        break; -    } -    case BufferMethods::SemaphoreAcquire: { -        ProcessSemaphoreAcquire(); -        break; -    } -    case BufferMethods::SemaphoreRelease: { -        ProcessSemaphoreRelease(); -        break; -    } -    case BufferMethods::Yield: { -        // TODO(Kmather73): Research and implement this method. -        LOG_ERROR(HW_GPU, "Special puller engine method Yield not implemented"); -        break; -    } -    default: -        LOG_ERROR(HW_GPU, "Special puller engine method {:X} not implemented", method); -        break; -    } -} - -void GPU::CallEngineMethod(const MethodCall& method_call) { -    const EngineID engine = bound_engines[method_call.subchannel]; - -    switch (engine) { -    case EngineID::FERMI_TWOD_A: -        fermi_2d->CallMethod(method_call.method, method_call.argument, method_call.IsLastCall()); -        break; -    case EngineID::MAXWELL_B: -        maxwell_3d->CallMethod(method_call.method, method_call.argument, method_call.IsLastCall()); -        break; -    case EngineID::KEPLER_COMPUTE_B: -        kepler_compute->CallMethod(method_call.method, method_call.argument, -                                   method_call.IsLastCall()); -        break; -    case EngineID::MAXWELL_DMA_COPY_A: -        maxwell_dma->CallMethod(method_call.method, method_call.argument, method_call.IsLastCall()); -        break; -    case EngineID::KEPLER_INLINE_TO_MEMORY_B: -        kepler_memory->CallMethod(method_call.method, method_call.argument, -                                  method_call.IsLastCall()); -        break; -    default: -        UNIMPLEMENTED_MSG("Unimplemented engine"); -    } -} - -void GPU::CallEngineMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount, -                                u32 methods_pending) { -    const EngineID engine = bound_engines[subchannel]; - -    switch (engine) { -    case EngineID::FERMI_TWOD_A: 
-        fermi_2d->CallMultiMethod(method, base_start, amount, methods_pending); -        break; -    case EngineID::MAXWELL_B: -        maxwell_3d->CallMultiMethod(method, base_start, amount, methods_pending); -        break; -    case EngineID::KEPLER_COMPUTE_B: -        kepler_compute->CallMultiMethod(method, base_start, amount, methods_pending); -        break; -    case EngineID::MAXWELL_DMA_COPY_A: -        maxwell_dma->CallMultiMethod(method, base_start, amount, methods_pending); -        break; -    case EngineID::KEPLER_INLINE_TO_MEMORY_B: -        kepler_memory->CallMultiMethod(method, base_start, amount, methods_pending); -        break; -    default: -        UNIMPLEMENTED_MSG("Unimplemented engine"); -    } -} - -void GPU::ProcessBindMethod(const MethodCall& method_call) { -    // Bind the current subchannel to the desired engine id. -    LOG_DEBUG(HW_GPU, "Binding subchannel {} to engine {}", method_call.subchannel, -              method_call.argument); -    const auto engine_id = static_cast<EngineID>(method_call.argument); -    bound_engines[method_call.subchannel] = static_cast<EngineID>(engine_id); -    switch (engine_id) { -    case EngineID::FERMI_TWOD_A: -        dma_pusher->BindSubchannel(fermi_2d.get(), method_call.subchannel); -        break; -    case EngineID::MAXWELL_B: -        dma_pusher->BindSubchannel(maxwell_3d.get(), method_call.subchannel); -        break; -    case EngineID::KEPLER_COMPUTE_B: -        dma_pusher->BindSubchannel(kepler_compute.get(), method_call.subchannel); -        break; -    case EngineID::MAXWELL_DMA_COPY_A: -        dma_pusher->BindSubchannel(maxwell_dma.get(), method_call.subchannel); -        break; -    case EngineID::KEPLER_INLINE_TO_MEMORY_B: -        dma_pusher->BindSubchannel(kepler_memory.get(), method_call.subchannel); -        break; -    default: -        UNIMPLEMENTED_MSG("Unimplemented engine {:04X}", engine_id); -    } -} - -void GPU::ProcessFenceActionMethod() { -    switch 
(regs.fence_action.op) { -    case FenceOperation::Acquire: -        WaitFence(regs.fence_action.syncpoint_id, regs.fence_value); -        break; -    case FenceOperation::Increment: -        IncrementSyncPoint(regs.fence_action.syncpoint_id); -        break; -    default: -        UNIMPLEMENTED_MSG("Unimplemented operation {}", regs.fence_action.op.Value()); -    } -} - -void GPU::ProcessWaitForInterruptMethod() { -    // TODO(bunnei) ImplementMe -    LOG_WARNING(HW_GPU, "(STUBBED) called"); -} - -void GPU::ProcessSemaphoreTriggerMethod() { -    const auto semaphoreOperationMask = 0xF; -    const auto op = -        static_cast<GpuSemaphoreOperation>(regs.semaphore_trigger & semaphoreOperationMask); -    if (op == GpuSemaphoreOperation::WriteLong) { -        struct Block { -            u32 sequence; -            u32 zeros = 0; -            u64 timestamp; -        }; +std::unique_lock<std::mutex> GPU::LockSync() { +    return impl->LockSync(); +} -        Block block{}; -        block.sequence = regs.semaphore_sequence; -        // TODO(Kmather73): Generate a real GPU timestamp and write it here instead of -        // CoreTiming -        block.timestamp = GetTicks(); -        memory_manager->WriteBlock(regs.semaphore_address.SemaphoreAddress(), &block, -                                   sizeof(block)); -    } else { -        const u32 word{memory_manager->Read<u32>(regs.semaphore_address.SemaphoreAddress())}; -        if ((op == GpuSemaphoreOperation::AcquireEqual && word == regs.semaphore_sequence) || -            (op == GpuSemaphoreOperation::AcquireGequal && -             static_cast<s32>(word - regs.semaphore_sequence) > 0) || -            (op == GpuSemaphoreOperation::AcquireMask && (word & regs.semaphore_sequence))) { -            // Nothing to do in this case -        } else { -            regs.acquire_source = true; -            regs.acquire_value = regs.semaphore_sequence; -            if (op == GpuSemaphoreOperation::AcquireEqual) { -                
regs.acquire_active = true; -                regs.acquire_mode = false; -            } else if (op == GpuSemaphoreOperation::AcquireGequal) { -                regs.acquire_active = true; -                regs.acquire_mode = true; -            } else if (op == GpuSemaphoreOperation::AcquireMask) { -                // TODO(kemathe) The acquire mask operation waits for a value that, ANDed with -                // semaphore_sequence, gives a non-0 result -                LOG_ERROR(HW_GPU, "Invalid semaphore operation AcquireMask not implemented"); -            } else { -                LOG_ERROR(HW_GPU, "Invalid semaphore operation"); -            } -        } -    } +bool GPU::IsAsync() const { +    return impl->IsAsync();  } -void GPU::ProcessSemaphoreRelease() { -    memory_manager->Write<u32>(regs.semaphore_address.SemaphoreAddress(), regs.semaphore_release); +bool GPU::UseNvdec() const { +    return impl->UseNvdec();  } -void GPU::ProcessSemaphoreAcquire() { -    const u32 word = memory_manager->Read<u32>(regs.semaphore_address.SemaphoreAddress()); -    const auto value = regs.semaphore_acquire; -    if (word != value) { -        regs.acquire_active = true; -        regs.acquire_value = value; -        // TODO(kemathe73) figure out how to do the acquire_timeout -        regs.acquire_mode = false; -        regs.acquire_source = false; -    } +void GPU::RendererFrameEndNotify() { +    impl->RendererFrameEndNotify();  }  void GPU::Start() { -    gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher); -    cpu_context = renderer->GetRenderWindow().CreateSharedContext(); -    cpu_context->MakeCurrent(); +    impl->Start();  }  void GPU::ObtainContext() { -    cpu_context->MakeCurrent(); +    impl->ObtainContext();  }  void GPU::ReleaseContext() { -    cpu_context->DoneCurrent(); +    impl->ReleaseContext();  }  void GPU::PushGPUEntries(Tegra::CommandList&& entries) { -    gpu_thread.SubmitList(std::move(entries)); +    
impl->PushGPUEntries(std::move(entries));  }  void GPU::PushCommandBuffer(Tegra::ChCommandHeaderList& entries) { -    if (!use_nvdec) { -        return; -    } - -    if (!cdma_pusher) { -        cdma_pusher = std::make_unique<Tegra::CDmaPusher>(*this); -    } - -    // SubmitCommandBuffer would make the nvdec operations async, this is not currently working -    // TODO(ameerj): RE proper async nvdec operation -    // gpu_thread.SubmitCommandBuffer(std::move(entries)); - -    cdma_pusher->ProcessEntries(std::move(entries)); +    impl->PushCommandBuffer(entries);  }  void GPU::ClearCdmaInstance() { -    cdma_pusher.reset(); +    impl->ClearCdmaInstance();  }  void GPU::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { -    gpu_thread.SwapBuffers(framebuffer); +    impl->SwapBuffers(framebuffer);  }  void GPU::FlushRegion(VAddr addr, u64 size) { -    gpu_thread.FlushRegion(addr, size); +    impl->FlushRegion(addr, size);  }  void GPU::InvalidateRegion(VAddr addr, u64 size) { -    gpu_thread.InvalidateRegion(addr, size); +    impl->InvalidateRegion(addr, size);  }  void GPU::FlushAndInvalidateRegion(VAddr addr, u64 size) { -    gpu_thread.FlushAndInvalidateRegion(addr, size); -} - -void GPU::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) const { -    auto& interrupt_manager = system.InterruptManager(); -    interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value); -} - -void GPU::OnCommandListEnd() { -    if (is_async) { -        // This command only applies to asynchronous GPU mode -        gpu_thread.OnCommandListEnd(); -    } +    impl->FlushAndInvalidateRegion(addr, size);  }  } // namespace Tegra diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index e6a02a71b..39b304823 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -4,19 +4,13 @@  #pragma once -#include <array> -#include <atomic> -#include <condition_variable> -#include <list>  #include <memory>  #include <mutex> + +#include "common/bit_field.h"  #include 
"common/common_types.h" -#include "core/hle/service/nvdrv/nvdata.h" -#include "core/hle/service/nvflinger/buffer_queue.h"  #include "video_core/cdma_pusher.h" -#include "video_core/dma_pusher.h"  #include "video_core/framebuffer_config.h" -#include "video_core/gpu_thread.h"  using CacheAddr = std::uintptr_t;  [[nodiscard]] inline CacheAddr ToCacheAddr(const void* host_ptr) { @@ -40,6 +34,9 @@ class ShaderNotify;  } // namespace VideoCore  namespace Tegra { +class DmaPusher; +class CDmaPusher; +struct CommandList;  enum class RenderTargetFormat : u32 {      NONE = 0x0, @@ -138,7 +135,18 @@ public:          }      }; -    explicit GPU(Core::System& system_, bool is_async_, bool use_nvdec_); +    enum class FenceOperation : u32 { +        Acquire = 0, +        Increment = 1, +    }; + +    union FenceAction { +        u32 raw; +        BitField<0, 1, FenceOperation> op; +        BitField<8, 24, u32> syncpoint_id; +    }; + +    explicit GPU(Core::System& system, bool is_async, bool use_nvdec);      ~GPU();      /// Binds a renderer to the GPU. @@ -162,9 +170,7 @@ public:      [[nodiscard]] u64 RequestFlush(VAddr addr, std::size_t size);      /// Obtains current flush request fence id. -    [[nodiscard]] u64 CurrentFlushRequestFence() const { -        return current_flush_fence.load(std::memory_order_relaxed); -    } +    [[nodiscard]] u64 CurrentFlushRequestFence() const;      /// Tick pending requests within the GPU.      void TickWork(); @@ -200,24 +206,16 @@ public:      [[nodiscard]] const Tegra::CDmaPusher& CDmaPusher() const;      /// Returns a reference to the underlying renderer. -    [[nodiscard]] VideoCore::RendererBase& Renderer() { -        return *renderer; -    } +    [[nodiscard]] VideoCore::RendererBase& Renderer();      /// Returns a const reference to the underlying renderer. 
-    [[nodiscard]] const VideoCore::RendererBase& Renderer() const { -        return *renderer; -    } +    [[nodiscard]] const VideoCore::RendererBase& Renderer() const;      /// Returns a reference to the shader notifier. -    [[nodiscard]] VideoCore::ShaderNotify& ShaderNotify() { -        return *shader_notify; -    } +    [[nodiscard]] VideoCore::ShaderNotify& ShaderNotify();      /// Returns a const reference to the shader notifier. -    [[nodiscard]] const VideoCore::ShaderNotify& ShaderNotify() const { -        return *shader_notify; -    } +    [[nodiscard]] const VideoCore::ShaderNotify& ShaderNotify() const;      /// Allows the CPU/NvFlinger to wait on the GPU before presenting a frame.      void WaitFence(u32 syncpoint_id, u32 value); @@ -232,80 +230,14 @@ public:      [[nodiscard]] u64 GetTicks() const; -    [[nodiscard]] std::unique_lock<std::mutex> LockSync() { -        return std::unique_lock{sync_mutex}; -    } +    [[nodiscard]] std::unique_lock<std::mutex> LockSync(); -    [[nodiscard]] bool IsAsync() const { -        return is_async; -    } +    [[nodiscard]] bool IsAsync() const; -    [[nodiscard]] bool UseNvdec() const { -        return use_nvdec; -    } +    [[nodiscard]] bool UseNvdec() const;      void RendererFrameEndNotify(); -    enum class FenceOperation : u32 { -        Acquire = 0, -        Increment = 1, -    }; - -    union FenceAction { -        u32 raw; -        BitField<0, 1, FenceOperation> op; -        BitField<8, 24, u32> syncpoint_id; - -        [[nodiscard]] static CommandHeader Build(FenceOperation op, u32 syncpoint_id) { -            FenceAction result{}; -            result.op.Assign(op); -            result.syncpoint_id.Assign(syncpoint_id); -            return {result.raw}; -        } -    }; - -    struct Regs { -        static constexpr size_t NUM_REGS = 0x40; - -        union { -            struct { -                INSERT_PADDING_WORDS_NOINIT(0x4); -                struct { -                    u32 address_high; -   
                 u32 address_low; - -                    [[nodiscard]] GPUVAddr SemaphoreAddress() const { -                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | -                                                     address_low); -                    } -                } semaphore_address; - -                u32 semaphore_sequence; -                u32 semaphore_trigger; -                INSERT_PADDING_WORDS_NOINIT(0xC); - -                // The pusher and the puller share the reference counter, the pusher only has read -                // access -                u32 reference_count; -                INSERT_PADDING_WORDS_NOINIT(0x5); - -                u32 semaphore_acquire; -                u32 semaphore_release; -                u32 fence_value; -                FenceAction fence_action; -                INSERT_PADDING_WORDS_NOINIT(0xE2); - -                // Puller state -                u32 acquire_mode; -                u32 acquire_source; -                u32 acquire_active; -                u32 acquire_timeout; -                u32 acquire_value; -            }; -            std::array<u32, NUM_REGS> reg_array; -        }; -    } regs{}; -      /// Performs any additional setup necessary in order to begin GPU emulation.      /// This can be used to launch any necessary threads and register any necessary      /// core timing events. @@ -338,104 +270,9 @@ public:      /// Notify rasterizer that any caches of the specified region should be flushed and invalidated      void FlushAndInvalidateRegion(VAddr addr, u64 size); -protected: -    void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const; - -private: -    void ProcessBindMethod(const MethodCall& method_call); -    void ProcessFenceActionMethod(); -    void ProcessWaitForInterruptMethod(); -    void ProcessSemaphoreTriggerMethod(); -    void ProcessSemaphoreRelease(); -    void ProcessSemaphoreAcquire(); - -    /// Calls a GPU puller method. 
-    void CallPullerMethod(const MethodCall& method_call); - -    /// Calls a GPU engine method. -    void CallEngineMethod(const MethodCall& method_call); - -    /// Calls a GPU engine multivalue method. -    void CallEngineMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount, -                               u32 methods_pending); - -    /// Determines where the method should be executed. -    [[nodiscard]] bool ExecuteMethodOnEngine(u32 method); - -protected: -    Core::System& system; -    std::unique_ptr<Tegra::MemoryManager> memory_manager; -    std::unique_ptr<Tegra::DmaPusher> dma_pusher; -    std::unique_ptr<Tegra::CDmaPusher> cdma_pusher; -    std::unique_ptr<VideoCore::RendererBase> renderer; -    VideoCore::RasterizerInterface* rasterizer = nullptr; -    const bool use_nvdec; -  private: -    /// Mapping of command subchannels to their bound engine ids -    std::array<EngineID, 8> bound_engines = {}; -    /// 3D engine -    std::unique_ptr<Engines::Maxwell3D> maxwell_3d; -    /// 2D engine -    std::unique_ptr<Engines::Fermi2D> fermi_2d; -    /// Compute engine -    std::unique_ptr<Engines::KeplerCompute> kepler_compute; -    /// DMA engine -    std::unique_ptr<Engines::MaxwellDMA> maxwell_dma; -    /// Inline memory engine -    std::unique_ptr<Engines::KeplerMemory> kepler_memory; -    /// Shader build notifier -    std::unique_ptr<VideoCore::ShaderNotify> shader_notify; -    /// When true, we are about to shut down emulation session, so terminate outstanding tasks -    std::atomic_bool shutting_down{}; - -    std::array<std::atomic<u32>, Service::Nvidia::MaxSyncPoints> syncpoints{}; - -    std::array<std::list<u32>, Service::Nvidia::MaxSyncPoints> syncpt_interrupts; - -    std::mutex sync_mutex; -    std::mutex device_mutex; - -    std::condition_variable sync_cv; - -    struct FlushRequest { -        explicit FlushRequest(u64 fence_, VAddr addr_, std::size_t size_) -            : fence{fence_}, addr{addr_}, size{size_} {} -        
u64 fence; -        VAddr addr; -        std::size_t size; -    }; - -    std::list<FlushRequest> flush_requests; -    std::atomic<u64> current_flush_fence{}; -    u64 last_flush_fence{}; -    std::mutex flush_request_mutex; - -    const bool is_async; - -    VideoCommon::GPUThread::ThreadManager gpu_thread; -    std::unique_ptr<Core::Frontend::GraphicsContext> cpu_context; +    struct Impl; +    std::unique_ptr<Impl> impl;  }; -#define ASSERT_REG_POSITION(field_name, position)                                                  \ -    static_assert(offsetof(GPU::Regs, field_name) == position * 4,                                 \ -                  "Field " #field_name " has invalid position") - -ASSERT_REG_POSITION(semaphore_address, 0x4); -ASSERT_REG_POSITION(semaphore_sequence, 0x6); -ASSERT_REG_POSITION(semaphore_trigger, 0x7); -ASSERT_REG_POSITION(reference_count, 0x14); -ASSERT_REG_POSITION(semaphore_acquire, 0x1A); -ASSERT_REG_POSITION(semaphore_release, 0x1B); -ASSERT_REG_POSITION(fence_value, 0x1C); -ASSERT_REG_POSITION(fence_action, 0x1D); - -ASSERT_REG_POSITION(acquire_mode, 0x100); -ASSERT_REG_POSITION(acquire_source, 0x101); -ASSERT_REG_POSITION(acquire_active, 0x102); -ASSERT_REG_POSITION(acquire_timeout, 0x103); -ASSERT_REG_POSITION(acquire_value, 0x104); - -#undef ASSERT_REG_POSITION -  } // namespace Tegra diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h index 91bada925..00984188e 100644 --- a/src/video_core/gpu_thread.h +++ b/src/video_core/gpu_thread.h @@ -130,9 +130,6 @@ public:      /// Notify rasterizer that any caches of the specified region should be flushed and invalidated      void FlushAndInvalidateRegion(VAddr addr, u64 size); -    // Stops the GPU execution and waits for the GPU to finish working -    void ShutDown(); -      void OnCommandListEnd();  private: diff --git a/src/video_core/query_cache.h b/src/video_core/query_cache.h index aac851253..73231061a 100644 --- a/src/video_core/query_cache.h +++ 
b/src/video_core/query_cache.h @@ -8,6 +8,7 @@  #include <array>  #include <cstring>  #include <iterator> +#include <list>  #include <memory>  #include <mutex>  #include <optional> diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 54dae2c41..8c3ca3d82 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -20,6 +20,7 @@  #include "video_core/surface.h"  #include "video_core/texture_cache/formatter.h"  #include "video_core/texture_cache/samples_helper.h" +#include "video_core/texture_cache/util.h"  namespace OpenGL {  namespace { diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index c498a8a8f..1ca2c90be 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -12,6 +12,7 @@  #include "shader_recompiler/shader_info.h"  #include "video_core/renderer_opengl/gl_resource_manager.h"  #include "video_core/renderer_opengl/util_shaders.h" +#include "video_core/texture_cache/image_view_base.h"  #include "video_core/texture_cache/texture_cache_base.h"  namespace OpenGL { diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 3b87640b5..06c5fb867 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -21,6 +21,7 @@  #include "video_core/renderer_vulkan/vk_texture_cache.h"  #include "video_core/texture_cache/formatter.h"  #include "video_core/texture_cache/samples_helper.h" +#include "video_core/texture_cache/util.h"  #include "video_core/vulkan_common/vulkan_device.h"  #include "video_core/vulkan_common/vulkan_memory_allocator.h"  #include "video_core/vulkan_common/vulkan_wrapper.h" diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h 
b/src/video_core/renderer_vulkan/vk_texture_cache.h index 6d5a68bfe..b09c468e4 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -4,11 +4,11 @@  #pragma once -#include <compare>  #include <span>  #include "shader_recompiler/shader_info.h"  #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" +#include "video_core/texture_cache/image_view_base.h"  #include "video_core/texture_cache/texture_cache_base.h"  #include "video_core/vulkan_common/vulkan_memory_allocator.h"  #include "video_core/vulkan_common/vulkan_wrapper.h" diff --git a/src/video_core/shader_environment.cpp b/src/video_core/shader_environment.cpp index 81a878bb2..05850afd0 100644 --- a/src/video_core/shader_environment.cpp +++ b/src/video_core/shader_environment.cpp @@ -16,6 +16,7 @@  #include "common/fs/fs.h"  #include "common/logging/log.h"  #include "shader_recompiler/environment.h" +#include "video_core/engines/kepler_compute.h"  #include "video_core/memory_manager.h"  #include "video_core/shader_environment.h"  #include "video_core/textures/texture.h" diff --git a/src/video_core/shader_environment.h b/src/video_core/shader_environment.h index 2079979db..6640e53d0 100644 --- a/src/video_core/shader_environment.h +++ b/src/video_core/shader_environment.h @@ -5,13 +5,13 @@  #pragma once  #include <array> -#include <atomic>  #include <filesystem>  #include <iosfwd>  #include <limits>  #include <memory>  #include <optional>  #include <span> +#include <stop_token>  #include <type_traits>  #include <unordered_map>  #include <vector> @@ -19,9 +19,7 @@  #include "common/common_types.h"  #include "common/unique_function.h"  #include "shader_recompiler/environment.h" -#include "video_core/engines/kepler_compute.h"  #include "video_core/engines/maxwell_3d.h" -#include "video_core/textures/texture.h"  namespace Tegra {  class Memorymanager; diff --git a/src/video_core/texture_cache/image_view_info.cpp 
b/src/video_core/texture_cache/image_view_info.cpp index 6527e14c8..e751f26c7 100644 --- a/src/video_core/texture_cache/image_view_info.cpp +++ b/src/video_core/texture_cache/image_view_info.cpp @@ -8,6 +8,7 @@  #include "video_core/texture_cache/image_view_info.h"  #include "video_core/texture_cache/texture_cache_base.h"  #include "video_core/texture_cache/types.h" +#include "video_core/texture_cache/util.h"  #include "video_core/textures/texture.h"  namespace VideoCommon { diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 24b809242..329df2e49 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -4,10 +4,15 @@  #pragma once +#include <unordered_set> +  #include "common/alignment.h"  #include "video_core/dirty_flags.h" +#include "video_core/engines/kepler_compute.h" +#include "video_core/texture_cache/image_view_base.h"  #include "video_core/texture_cache/samples_helper.h"  #include "video_core/texture_cache/texture_cache_base.h" +#include "video_core/texture_cache/util.h"  namespace VideoCommon { diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index d7528ed24..2d1893c1c 100644 --- a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -4,13 +4,12 @@  #pragma once -#include <array>  #include <mutex>  #include <span>  #include <type_traits>  #include <unordered_map> -#include <unordered_set>  #include <vector> +#include <queue>  #include "common/common_types.h"  #include "common/literals.h" @@ -18,10 +17,6 @@  #include "video_core/compatible_formats.h"  #include "video_core/delayed_destruction_ring.h"  #include "video_core/engines/fermi_2d.h" -#include "video_core/engines/kepler_compute.h" -#include "video_core/engines/maxwell_3d.h" -#include "video_core/memory_manager.h" -#include "video_core/rasterizer_interface.h"  #include 
"video_core/surface.h"  #include "video_core/texture_cache/descriptor_table.h"  #include "video_core/texture_cache/image_base.h" @@ -30,7 +25,6 @@  #include "video_core/texture_cache/render_targets.h"  #include "video_core/texture_cache/slot_vector.h"  #include "video_core/texture_cache/types.h" -#include "video_core/texture_cache/util.h"  #include "video_core/textures/texture.h"  namespace VideoCommon { | 
