From eb67a45ca82bc01ac843c853fd3c17f2a90e0250 Mon Sep 17 00:00:00 2001
From: ameerj <aj662@drexel.edu>
Date: Mon, 26 Oct 2020 23:07:36 -0400
Subject: video_core: NVDEC Implementation

This commit aims to implement the NVDEC (Nvidia Decoder) functionality, with video frame decoding being handled by the FFmpeg library.

The process begins with Ioctl commands being sent to the NVDEC and VIC (Video Image Composer) emulated devices. These allocate the necessary GPU buffers for the frame data, along with providing information on the incoming video data. A Submit command then signals the GPU to process and decode the frame data.

To decode the frame, the respective codec's header must be manually composed from the information provided by NVDEC, then sent with the raw frame data to the ffmpeg library.

Currently, H264 and VP9 are supported, with VP9 having some minor artifacting issues related mainly to the reference frame composition in its uncompressed header.

Async GPU is not properly implemented at the moment.

Co-Authored-By: David <25727384+ogniK5377@users.noreply.github.com>
---
 src/video_core/gpu.cpp | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'src/video_core/gpu.cpp')

diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 4bb9256e9..171f78183 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -27,9 +27,10 @@ namespace Tegra {
 
 MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192));
 
-GPU::GPU(Core::System& system_, bool is_async_)
+GPU::GPU(Core::System& system_, bool is_async_, bool use_nvdec_)
     : system{system_}, memory_manager{std::make_unique<Tegra::MemoryManager>(system)},
       dma_pusher{std::make_unique<Tegra::DmaPusher>(system, *this)},
+      cdma_pusher{std::make_unique<Tegra::CDmaPusher>(*this)}, use_nvdec{use_nvdec_},
       maxwell_3d{std::make_unique<Engines::Maxwell3D>(system, *memory_manager)},
       fermi_2d{std::make_unique<Engines::Fermi2D>()},
       kepler_compute{std::make_unique<Engines::KeplerCompute>(system, *memory_manager)},
@@ -77,10 +78,18 @@ DmaPusher& GPU::DmaPusher() {
     return *dma_pusher;
 }
 
+Tegra::CDmaPusher& GPU::CDmaPusher() {
+    return *cdma_pusher;
+}
+
 const DmaPusher& GPU::DmaPusher() const {
     return *dma_pusher;
 }
 
+const Tegra::CDmaPusher& GPU::CDmaPusher() const {
+    return *cdma_pusher;
+}
+
 void GPU::WaitFence(u32 syncpoint_id, u32 value) {
     // Synced GPU, is always in sync
     if (!is_async) {
-- 
cgit v1.2.3


From 6053b955525be69eb73a928a7bdd43ba8f5e69a7 Mon Sep 17 00:00:00 2001
From: bunnei <bunneidev@gmail.com>
Date: Mon, 26 Oct 2020 22:11:41 -0700
Subject: video_core: gpu: Implement WaitFence and IncrementSyncPoint.

---
 src/video_core/gpu.cpp | 48 ++++++++++++++++++++++++------------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

(limited to 'src/video_core/gpu.cpp')

diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 171f78183..ebd149c3a 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -194,30 +194,6 @@ void GPU::SyncGuestHost() {
 void GPU::OnCommandListEnd() {
     renderer->Rasterizer().ReleaseFences();
 }
-// Note that, traditionally, methods are treated as 4-byte addressable locations, and hence
-// their numbers are written down multiplied by 4 in Docs. Here we are not multiply by 4.
-// So the values you see in docs might be multiplied by 4.
-enum class BufferMethods {
-    BindObject = 0x0,
-    Nop = 0x2,
-    SemaphoreAddressHigh = 0x4,
-    SemaphoreAddressLow = 0x5,
-    SemaphoreSequence = 0x6,
-    SemaphoreTrigger = 0x7,
-    NotifyIntr = 0x8,
-    WrcacheFlush = 0x9,
-    Unk28 = 0xA,
-    UnkCacheFlush = 0xB,
-    RefCnt = 0x14,
-    SemaphoreAcquire = 0x1A,
-    SemaphoreRelease = 0x1B,
-    FenceValue = 0x1C,
-    FenceAction = 0x1D,
-    Unk78 = 0x1E,
-    Unk7c = 0x1F,
-    Yield = 0x20,
-    NonPullerMethods = 0x40,
-};
 
 enum class GpuSemaphoreOperation {
     AcquireEqual = 0x1,
@@ -277,7 +253,12 @@ void GPU::CallPullerMethod(const MethodCall& method_call) {
     case BufferMethods::UnkCacheFlush:
     case BufferMethods::WrcacheFlush:
     case BufferMethods::FenceValue:
+        break;
     case BufferMethods::FenceAction:
+        ProcessFenceActionMethod();
+        break;
+    case BufferMethods::WaitForInterrupt:
+        ProcessWaitForInterruptMethod();
         break;
     case BufferMethods::SemaphoreTrigger: {
         ProcessSemaphoreTriggerMethod();
@@ -391,6 +372,25 @@ void GPU::ProcessBindMethod(const MethodCall& method_call) {
     }
 }
 
+void GPU::ProcessFenceActionMethod() {
+    switch (regs.fence_action.op) {
+    case FenceOperation::Acquire:
+        WaitFence(regs.fence_action.syncpoint_id, regs.fence_value);
+        break;
+    case FenceOperation::Increment:
+        IncrementSyncPoint(regs.fence_action.syncpoint_id);
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented operation {}",
+                          static_cast<u32>(regs.fence_action.op.Value()));
+    }
+}
+
+void GPU::ProcessWaitForInterruptMethod() {
+    // TODO(bunnei) ImplementMe
+    LOG_WARNING(HW_GPU, "(STUBBED) called");
+}
+
 void GPU::ProcessSemaphoreTriggerMethod() {
     const auto semaphoreOperationMask = 0xF;
     const auto op =
-- 
cgit v1.2.3


From e8b2fd21d861997e558180d775b14afdc46f3bbd Mon Sep 17 00:00:00 2001
From: comex <comexk@gmail.com>
Date: Sun, 22 Nov 2020 15:48:23 -0500
Subject: nvdrv, video_core: Don't index out of bounds when given invalid
 syncpoint ID

- Use .at() instead of raw indexing when dealing with untrusted indices.

- For the special case of WaitFence with syncpoint id UINT32_MAX,
  instead of crashing, log an error and ignore.  This is what I get when
  running Super Mario Maker 2.
---
 src/video_core/gpu.cpp | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

(limited to 'src/video_core/gpu.cpp')

diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index ebd149c3a..e91f52938 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -95,22 +95,29 @@ void GPU::WaitFence(u32 syncpoint_id, u32 value) {
     if (!is_async) {
         return;
     }
+    if (syncpoint_id == UINT32_MAX) {
+        // TODO: Research what this does.
+        LOG_ERROR(HW_GPU, "Waiting for syncpoint -1 not implemented");
+        return;
+    }
     MICROPROFILE_SCOPE(GPU_wait);
     std::unique_lock lock{sync_mutex};
-    sync_cv.wait(lock, [=, this] { return syncpoints[syncpoint_id].load() >= value; });
+    sync_cv.wait(lock, [=, this] { return syncpoints.at(syncpoint_id).load() >= value; });
 }
 
 void GPU::IncrementSyncPoint(const u32 syncpoint_id) {
-    syncpoints[syncpoint_id]++;
+    auto& syncpoint = syncpoints.at(syncpoint_id);
+    syncpoint++;
     std::lock_guard lock{sync_mutex};
     sync_cv.notify_all();
-    if (!syncpt_interrupts[syncpoint_id].empty()) {
-        u32 value = syncpoints[syncpoint_id].load();
-        auto it = syncpt_interrupts[syncpoint_id].begin();
-        while (it != syncpt_interrupts[syncpoint_id].end()) {
+    auto& interrupt = syncpt_interrupts.at(syncpoint_id);
+    if (!interrupt.empty()) {
+        u32 value = syncpoint.load();
+        auto it = interrupt.begin();
+        while (it != interrupt.end()) {
             if (value >= *it) {
                 TriggerCpuInterrupt(syncpoint_id, *it);
-                it = syncpt_interrupts[syncpoint_id].erase(it);
+                it = interrupt.erase(it);
                 continue;
             }
             it++;
@@ -119,22 +126,22 @@ void GPU::IncrementSyncPoint(const u32 syncpoint_id) {
 }
 
 u32 GPU::GetSyncpointValue(const u32 syncpoint_id) const {
-    return syncpoints[syncpoint_id].load();
+    return syncpoints.at(syncpoint_id).load();
 }
 
 void GPU::RegisterSyncptInterrupt(const u32 syncpoint_id, const u32 value) {
-    auto& interrupt = syncpt_interrupts[syncpoint_id];
+    auto& interrupt = syncpt_interrupts.at(syncpoint_id);
     bool contains = std::any_of(interrupt.begin(), interrupt.end(),
                                 [value](u32 in_value) { return in_value == value; });
     if (contains) {
         return;
     }
-    syncpt_interrupts[syncpoint_id].emplace_back(value);
+    interrupt.emplace_back(value);
 }
 
 bool GPU::CancelSyncptInterrupt(const u32 syncpoint_id, const u32 value) {
     std::lock_guard lock{sync_mutex};
-    auto& interrupt = syncpt_interrupts[syncpoint_id];
+    auto& interrupt = syncpt_interrupts.at(syncpoint_id);
     const auto iter =
         std::find_if(interrupt.begin(), interrupt.end(),
                      [value](u32 interrupt_value) { return value == interrupt_value; });
-- 
cgit v1.2.3


From 677a8b208d47d0d2397197ce74c7039a8ea79d20 Mon Sep 17 00:00:00 2001
From: Lioncash <mathew1800@gmail.com>
Date: Fri, 4 Dec 2020 14:39:12 -0500
Subject: video_core: Resolve more variable shadowing scenarios

Resolves variable shadowing scenarios up to the end of the OpenGL code
to make it nicer to review. The rest will be resolved in a following
commit.
---
 src/video_core/gpu.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'src/video_core/gpu.cpp')

diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index e91f52938..964b3f3dc 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -232,8 +232,12 @@ void GPU::CallMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32
         CallEngineMultiMethod(method, subchannel, base_start, amount, methods_pending);
     } else {
         for (std::size_t i = 0; i < amount; i++) {
-            CallPullerMethod(
-                {method, base_start[i], subchannel, methods_pending - static_cast<u32>(i)});
+            CallPullerMethod(MethodCall{
+                method,
+                base_start[i],
+                subchannel,
+                methods_pending - static_cast<u32>(i),
+            });
         }
     }
 }
-- 
cgit v1.2.3


From 4c5f5c9bf301d3626df104dbed6fed6f115cedc8 Mon Sep 17 00:00:00 2001
From: Lioncash <mathew1800@gmail.com>
Date: Mon, 7 Dec 2020 00:41:47 -0500
Subject: video_core: Remove unnecessary enum class casting in logging messages

fmt now automatically prints the numeric value of an enum class member
by default, so we don't need to use casts any more.

Reduces the line noise a bit.
---
 src/video_core/gpu.cpp | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'src/video_core/gpu.cpp')

diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 964b3f3dc..e2512a7f2 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -299,8 +299,7 @@ void GPU::CallPullerMethod(const MethodCall& method_call) {
         break;
     }
     default:
-        LOG_ERROR(HW_GPU, "Special puller engine method {:X} not implemented",
-                  static_cast<u32>(method));
+        LOG_ERROR(HW_GPU, "Special puller engine method {:X} not implemented", method);
         break;
     }
 }
@@ -379,7 +378,7 @@ void GPU::ProcessBindMethod(const MethodCall& method_call) {
         dma_pusher->BindSubchannel(kepler_memory.get(), method_call.subchannel);
         break;
     default:
-        UNIMPLEMENTED_MSG("Unimplemented engine {:04X}", static_cast<u32>(engine_id));
+        UNIMPLEMENTED_MSG("Unimplemented engine {:04X}", engine_id);
     }
 }
 
@@ -392,8 +391,7 @@ void GPU::ProcessFenceActionMethod() {
         IncrementSyncPoint(regs.fence_action.syncpoint_id);
         break;
     default:
-        UNIMPLEMENTED_MSG("Unimplemented operation {}",
-                          static_cast<u32>(regs.fence_action.op.Value()));
+        UNIMPLEMENTED_MSG("Unimplemented operation {}", regs.fence_action.op.Value());
     }
 }
 
-- 
cgit v1.2.3


From 14c825bd1c37b2444e858bf1a75fb77455b4eb52 Mon Sep 17 00:00:00 2001
From: bunnei <bunneidev@gmail.com>
Date: Fri, 11 Dec 2020 22:26:14 -0800
Subject: video_core: gpu: Refactor out synchronous/asynchronous GPU
 implementations.

- We must always use a GPU thread now, even with synchronous GPU.
---
 src/video_core/gpu.cpp | 76 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 71 insertions(+), 5 deletions(-)

(limited to 'src/video_core/gpu.cpp')

diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index e2512a7f2..f99a8a0de 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -10,6 +10,7 @@
 #include "core/core_timing.h"
 #include "core/core_timing_util.h"
 #include "core/frontend/emu_window.h"
+#include "core/hardware_interrupt_manager.h"
 #include "core/memory.h"
 #include "core/settings.h"
 #include "video_core/engines/fermi_2d.h"
@@ -36,7 +37,8 @@ GPU::GPU(Core::System& system_, bool is_async_, bool use_nvdec_)
       kepler_compute{std::make_unique<Engines::KeplerCompute>(system, *memory_manager)},
       maxwell_dma{std::make_unique<Engines::MaxwellDMA>(system, *memory_manager)},
       kepler_memory{std::make_unique<Engines::KeplerMemory>(system, *memory_manager)},
-      shader_notify{std::make_unique<VideoCore::ShaderNotify>()}, is_async{is_async_} {}
+      shader_notify{std::make_unique<VideoCore::ShaderNotify>()}, is_async{is_async_},
+      gpu_thread{system_} {}
 
 GPU::~GPU() = default;
 
@@ -198,10 +200,6 @@ void GPU::SyncGuestHost() {
     renderer->Rasterizer().SyncGuestHost();
 }
 
-void GPU::OnCommandListEnd() {
-    renderer->Rasterizer().ReleaseFences();
-}
-
 enum class GpuSemaphoreOperation {
     AcquireEqual = 0x1,
     WriteLong = 0x2,
@@ -461,4 +459,72 @@ void GPU::ProcessSemaphoreAcquire() {
     }
 }
 
+void GPU::Start() {
+    gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher, *cdma_pusher);
+    cpu_context = renderer->GetRenderWindow().CreateSharedContext();
+    cpu_context->MakeCurrent();
+}
+
+void GPU::ObtainContext() {
+    cpu_context->MakeCurrent();
+}
+
+void GPU::ReleaseContext() {
+    cpu_context->DoneCurrent();
+}
+
+void GPU::PushGPUEntries(Tegra::CommandList&& entries) {
+    gpu_thread.SubmitList(std::move(entries));
+}
+
+void GPU::PushCommandBuffer(Tegra::ChCommandHeaderList& entries) {
+    if (!use_nvdec) {
+        return;
+    }
+    // This condition fires when a video stream ends, clear all intermediary data
+    if (entries[0].raw == 0xDEADB33F) {
+        cdma_pusher.reset();
+        return;
+    }
+    if (!cdma_pusher) {
+        cdma_pusher = std::make_unique<Tegra::CDmaPusher>(*this);
+    }
+
+    // SubmitCommandBuffer would make the nvdec operations async, this is not currently working
+    // TODO(ameerj): RE proper async nvdec operation
+    // gpu_thread.SubmitCommandBuffer(std::move(entries));
+
+    cdma_pusher->Push(std::move(entries));
+    cdma_pusher->DispatchCalls();
+}
+
+void GPU::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
+    gpu_thread.SwapBuffers(framebuffer);
+}
+
+void GPU::FlushRegion(VAddr addr, u64 size) {
+    gpu_thread.FlushRegion(addr, size);
+}
+
+void GPU::InvalidateRegion(VAddr addr, u64 size) {
+    gpu_thread.InvalidateRegion(addr, size);
+}
+
+void GPU::FlushAndInvalidateRegion(VAddr addr, u64 size) {
+    gpu_thread.FlushAndInvalidateRegion(addr, size);
+}
+
+void GPU::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) const {
+    auto& interrupt_manager = system.InterruptManager();
+    interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value);
+}
+
+void GPU::WaitIdle() const {
+    gpu_thread.WaitIdle();
+}
+
+void GPU::OnCommandListEnd() {
+    gpu_thread.OnCommandListEnd();
+}
+
 } // namespace Tegra
-- 
cgit v1.2.3


From 40571c073faa02a6a4301e7f0ce365ef50a400aa Mon Sep 17 00:00:00 2001
From: bunnei <bunneidev@gmail.com>
Date: Sat, 12 Dec 2020 00:24:33 -0800
Subject: video_core: gpu: Implement synchronous mode using threaded GPU.

---
 src/video_core/gpu.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'src/video_core/gpu.cpp')

diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index f99a8a0de..6ab06775f 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -38,7 +38,7 @@ GPU::GPU(Core::System& system_, bool is_async_, bool use_nvdec_)
       maxwell_dma{std::make_unique<Engines::MaxwellDMA>(system, *memory_manager)},
       kepler_memory{std::make_unique<Engines::KeplerMemory>(system, *memory_manager)},
       shader_notify{std::make_unique<VideoCore::ShaderNotify>()}, is_async{is_async_},
-      gpu_thread{system_} {}
+      gpu_thread{system_, is_async_} {}
 
 GPU::~GPU() = default;
 
@@ -524,7 +524,10 @@ void GPU::WaitIdle() const {
 }
 
 void GPU::OnCommandListEnd() {
-    gpu_thread.OnCommandListEnd();
+    if (is_async) {
+        // This command only applies to asynchronous GPU mode
+        gpu_thread.OnCommandListEnd();
+    }
 }
 
 } // namespace Tegra
-- 
cgit v1.2.3