From eb67a45ca82bc01ac843c853fd3c17f2a90e0250 Mon Sep 17 00:00:00 2001 From: ameerj Date: Mon, 26 Oct 2020 23:07:36 -0400 Subject: video_core: NVDEC Implementation This commit aims to implement the NVDEC (Nvidia Decoder) functionality, with video frame decoding being handled by the FFmpeg library. The process begins with Ioctl commands being sent to the NVDEC and VIC (Video Image Composer) emulated devices. These allocate the necessary GPU buffers for the frame data, along with providing information on the incoming video data. A Submit command then signals the GPU to process and decode the frame data. To decode the frame, the respective codec's header must be manually composed from the information provided by NVDEC, then sent with the raw frame data to the ffmpeg library. Currently, H264 and VP9 are supported, with VP9 having some minor artifacting issues related mainly to the reference frame composition in its uncompressed header. Async GPU is not properly implemented at the moment. Co-Authored-By: David <25727384+ogniK5377@users.noreply.github.com> --- src/video_core/gpu.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'src/video_core/gpu.cpp') diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 4bb9256e9..171f78183 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -27,9 +27,10 @@ namespace Tegra { MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192)); -GPU::GPU(Core::System& system_, bool is_async_) +GPU::GPU(Core::System& system_, bool is_async_, bool use_nvdec_) : system{system_}, memory_manager{std::make_unique(system)}, dma_pusher{std::make_unique(system, *this)}, + cdma_pusher{std::make_unique(*this)}, use_nvdec{use_nvdec_}, maxwell_3d{std::make_unique(system, *memory_manager)}, fermi_2d{std::make_unique()}, kepler_compute{std::make_unique(system, *memory_manager)}, @@ -77,10 +78,18 @@ DmaPusher& GPU::DmaPusher() { return *dma_pusher; } +Tegra::CDmaPusher& GPU::CDmaPusher() { + return *cdma_pusher; +} + const DmaPusher& GPU::DmaPusher() const { return *dma_pusher; } +const Tegra::CDmaPusher& GPU::CDmaPusher() const { + return *cdma_pusher; +} + void GPU::WaitFence(u32 syncpoint_id, u32 value) { // Synced GPU, is always in sync if (!is_async) { -- cgit v1.2.3 From 6053b955525be69eb73a928a7bdd43ba8f5e69a7 Mon Sep 17 00:00:00 2001 From: bunnei Date: Mon, 26 Oct 2020 22:11:41 -0700 Subject: video_core: gpu: Implement WaitFence and IncrementSyncPoint. --- src/video_core/gpu.cpp | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'src/video_core/gpu.cpp') diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 171f78183..ebd149c3a 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -194,30 +194,6 @@ void GPU::SyncGuestHost() { void GPU::OnCommandListEnd() { renderer->Rasterizer().ReleaseFences(); } -// Note that, traditionally, methods are treated as 4-byte addressable locations, and hence -// their numbers are written down multiplied by 4 in Docs. Here we are not multiply by 4. -// So the values you see in docs might be multiplied by 4. -enum class BufferMethods { - BindObject = 0x0, - Nop = 0x2, - SemaphoreAddressHigh = 0x4, - SemaphoreAddressLow = 0x5, - SemaphoreSequence = 0x6, - SemaphoreTrigger = 0x7, - NotifyIntr = 0x8, - WrcacheFlush = 0x9, - Unk28 = 0xA, - UnkCacheFlush = 0xB, - RefCnt = 0x14, - SemaphoreAcquire = 0x1A, - SemaphoreRelease = 0x1B, - FenceValue = 0x1C, - FenceAction = 0x1D, - Unk78 = 0x1E, - Unk7c = 0x1F, - Yield = 0x20, - NonPullerMethods = 0x40, -}; enum class GpuSemaphoreOperation { AcquireEqual = 0x1, @@ -277,7 +253,12 @@ void GPU::CallPullerMethod(const MethodCall& method_call) { case BufferMethods::UnkCacheFlush: case BufferMethods::WrcacheFlush: case BufferMethods::FenceValue: + break; case BufferMethods::FenceAction: + ProcessFenceActionMethod(); + break; + case BufferMethods::WaitForInterrupt: + ProcessWaitForInterruptMethod(); break; case BufferMethods::SemaphoreTrigger: { ProcessSemaphoreTriggerMethod(); @@ -391,6 +372,25 @@ void GPU::ProcessBindMethod(const MethodCall& method_call) { } } +void GPU::ProcessFenceActionMethod() { + switch (regs.fence_action.op) { + case FenceOperation::Acquire: + WaitFence(regs.fence_action.syncpoint_id, regs.fence_value); + break; + case FenceOperation::Increment: + IncrementSyncPoint(regs.fence_action.syncpoint_id); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented operation {}", + static_cast(regs.fence_action.op.Value())); + } +} + +void GPU::ProcessWaitForInterruptMethod() { + // TODO(bunnei) ImplementMe + LOG_WARNING(HW_GPU, "(STUBBED) called"); +} + void GPU::ProcessSemaphoreTriggerMethod() { const auto semaphoreOperationMask = 0xF; const auto op = -- cgit v1.2.3 From e8b2fd21d861997e558180d775b14afdc46f3bbd Mon Sep 17 00:00:00 2001 From: comex Date: Sun, 22 Nov 2020 15:48:23 -0500 Subject: nvdrv, video_core: Don't index out of bounds when given invalid syncpoint ID - Use .at() instead of raw indexing when dealing with untrusted indices. - For the special case of WaitFence with syncpoint id UINT32_MAX, instead of crashing, log an error and ignore. This is what I get when running Super Mario Maker 2. --- src/video_core/gpu.cpp | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'src/video_core/gpu.cpp') diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index ebd149c3a..e91f52938 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -95,22 +95,29 @@ void GPU::WaitFence(u32 syncpoint_id, u32 value) { if (!is_async) { return; } + if (syncpoint_id == UINT32_MAX) { + // TODO: Research what this does. + LOG_ERROR(HW_GPU, "Waiting for syncpoint -1 not implemented"); + return; + } MICROPROFILE_SCOPE(GPU_wait); std::unique_lock lock{sync_mutex}; - sync_cv.wait(lock, [=, this] { return syncpoints[syncpoint_id].load() >= value; }); + sync_cv.wait(lock, [=, this] { return syncpoints.at(syncpoint_id).load() >= value; }); } void GPU::IncrementSyncPoint(const u32 syncpoint_id) { - syncpoints[syncpoint_id]++; + auto& syncpoint = syncpoints.at(syncpoint_id); + syncpoint++; std::lock_guard lock{sync_mutex}; sync_cv.notify_all(); - if (!syncpt_interrupts[syncpoint_id].empty()) { - u32 value = syncpoints[syncpoint_id].load(); - auto it = syncpt_interrupts[syncpoint_id].begin(); - while (it != syncpt_interrupts[syncpoint_id].end()) { + auto& interrupt = syncpt_interrupts.at(syncpoint_id); + if (!interrupt.empty()) { + u32 value = syncpoint.load(); + auto it = interrupt.begin(); + while (it != interrupt.end()) { if (value >= *it) { TriggerCpuInterrupt(syncpoint_id, *it); - it = syncpt_interrupts[syncpoint_id].erase(it); + it = interrupt.erase(it); continue; } it++; @@ -119,22 +126,22 @@ void GPU::IncrementSyncPoint(const u32 syncpoint_id) { } u32 GPU::GetSyncpointValue(const u32 syncpoint_id) const { - return syncpoints[syncpoint_id].load(); + return syncpoints.at(syncpoint_id).load(); } void GPU::RegisterSyncptInterrupt(const u32 syncpoint_id, const u32 value) { - auto& interrupt = syncpt_interrupts[syncpoint_id]; + auto& interrupt = syncpt_interrupts.at(syncpoint_id); bool contains = std::any_of(interrupt.begin(), interrupt.end(), [value](u32 in_value) { return in_value == value; }); if (contains) { return; } - syncpt_interrupts[syncpoint_id].emplace_back(value); + interrupt.emplace_back(value); } bool GPU::CancelSyncptInterrupt(const u32 syncpoint_id, const u32 value) { std::lock_guard lock{sync_mutex}; - auto& interrupt = syncpt_interrupts[syncpoint_id]; + auto& interrupt = syncpt_interrupts.at(syncpoint_id); const auto iter = std::find_if(interrupt.begin(), interrupt.end(), [value](u32 interrupt_value) { return value == interrupt_value; }); -- cgit v1.2.3 From 677a8b208d47d0d2397197ce74c7039a8ea79d20 Mon Sep 17 00:00:00 2001 From: Lioncash Date: Fri, 4 Dec 2020 14:39:12 -0500 Subject: video_core: Resolve more variable shadowing scenarios Resolves variable shadowing scenarios up to the end of the OpenGL code to make it nicer to review. The rest will be resolved in a following commit. --- src/video_core/gpu.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'src/video_core/gpu.cpp') diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index e91f52938..964b3f3dc 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -232,8 +232,12 @@ void GPU::CallMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 CallEngineMultiMethod(method, subchannel, base_start, amount, methods_pending); } else { for (std::size_t i = 0; i < amount; i++) { - CallPullerMethod( - {method, base_start[i], subchannel, methods_pending - static_cast(i)}); + CallPullerMethod(MethodCall{ + method, + base_start[i], + subchannel, + methods_pending - static_cast(i), + }); } } } -- cgit v1.2.3 From 4c5f5c9bf301d3626df104dbed6fed6f115cedc8 Mon Sep 17 00:00:00 2001 From: Lioncash Date: Mon, 7 Dec 2020 00:41:47 -0500 Subject: video_core: Remove unnecessary enum class casting in logging messages fmt now automatically prints the numeric value of an enum class member by default, so we don't need to use casts any more. Reduces the line noise a bit. --- src/video_core/gpu.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'src/video_core/gpu.cpp') diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 964b3f3dc..e2512a7f2 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -299,8 +299,7 @@ void GPU::CallPullerMethod(const MethodCall& method_call) { break; } default: - LOG_ERROR(HW_GPU, "Special puller engine method {:X} not implemented", - static_cast(method)); + LOG_ERROR(HW_GPU, "Special puller engine method {:X} not implemented", method); break; } } @@ -379,7 +378,7 @@ void GPU::ProcessBindMethod(const MethodCall& method_call) { dma_pusher->BindSubchannel(kepler_memory.get(), method_call.subchannel); break; default: - UNIMPLEMENTED_MSG("Unimplemented engine {:04X}", static_cast(engine_id)); + UNIMPLEMENTED_MSG("Unimplemented engine {:04X}", engine_id); } } @@ -392,8 +391,7 @@ void GPU::ProcessFenceActionMethod() { IncrementSyncPoint(regs.fence_action.syncpoint_id); break; default: - UNIMPLEMENTED_MSG("Unimplemented operation {}", - static_cast(regs.fence_action.op.Value())); + UNIMPLEMENTED_MSG("Unimplemented operation {}", regs.fence_action.op.Value()); } } -- cgit v1.2.3 From 14c825bd1c37b2444e858bf1a75fb77455b4eb52 Mon Sep 17 00:00:00 2001 From: bunnei Date: Fri, 11 Dec 2020 22:26:14 -0800 Subject: video_core: gpu: Refactor out synchronous/asynchronous GPU implementations. - We must always use a GPU thread now, even with synchronous GPU. --- src/video_core/gpu.cpp | 76 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 71 insertions(+), 5 deletions(-) (limited to 'src/video_core/gpu.cpp') diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index e2512a7f2..f99a8a0de 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -10,6 +10,7 @@ #include "core/core_timing.h" #include "core/core_timing_util.h" #include "core/frontend/emu_window.h" +#include "core/hardware_interrupt_manager.h" #include "core/memory.h" #include "core/settings.h" #include "video_core/engines/fermi_2d.h" @@ -36,7 +37,8 @@ GPU::GPU(Core::System& system_, bool is_async_, bool use_nvdec_) kepler_compute{std::make_unique(system, *memory_manager)}, maxwell_dma{std::make_unique(system, *memory_manager)}, kepler_memory{std::make_unique(system, *memory_manager)}, - shader_notify{std::make_unique()}, is_async{is_async_} {} + shader_notify{std::make_unique()}, is_async{is_async_}, + gpu_thread{system_} {} GPU::~GPU() = default; @@ -198,10 +200,6 @@ void GPU::SyncGuestHost() { renderer->Rasterizer().SyncGuestHost(); } -void GPU::OnCommandListEnd() { - renderer->Rasterizer().ReleaseFences(); -} - enum class GpuSemaphoreOperation { AcquireEqual = 0x1, WriteLong = 0x2, @@ -461,4 +459,72 @@ void GPU::ProcessSemaphoreAcquire() { } } +void GPU::Start() { + gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher, *cdma_pusher); + cpu_context = renderer->GetRenderWindow().CreateSharedContext(); + cpu_context->MakeCurrent(); +} + +void GPU::ObtainContext() { + cpu_context->MakeCurrent(); +} + +void GPU::ReleaseContext() { + cpu_context->DoneCurrent(); +} + +void GPU::PushGPUEntries(Tegra::CommandList&& entries) { + gpu_thread.SubmitList(std::move(entries)); +} + +void GPU::PushCommandBuffer(Tegra::ChCommandHeaderList& entries) { + if (!use_nvdec) { + return; + } + // This condition fires when a video stream ends, clear all intermediary data + if (entries[0].raw == 0xDEADB33F) { + cdma_pusher.reset(); + return; + } + if (!cdma_pusher) { + cdma_pusher = std::make_unique(*this); + } + + // SubmitCommandBuffer would make the nvdec operations async, this is not currently working + // TODO(ameerj): RE proper async nvdec operation + // gpu_thread.SubmitCommandBuffer(std::move(entries)); + + cdma_pusher->Push(std::move(entries)); + cdma_pusher->DispatchCalls(); +} + +void GPU::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { + gpu_thread.SwapBuffers(framebuffer); +} + +void GPU::FlushRegion(VAddr addr, u64 size) { + gpu_thread.FlushRegion(addr, size); +} + +void GPU::InvalidateRegion(VAddr addr, u64 size) { + gpu_thread.InvalidateRegion(addr, size); +} + +void GPU::FlushAndInvalidateRegion(VAddr addr, u64 size) { + gpu_thread.FlushAndInvalidateRegion(addr, size); +} + +void GPU::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) const { + auto& interrupt_manager = system.InterruptManager(); + interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value); +} + +void GPU::WaitIdle() const { + gpu_thread.WaitIdle(); +} + +void GPU::OnCommandListEnd() { + gpu_thread.OnCommandListEnd(); +} + } // namespace Tegra -- cgit v1.2.3 From 40571c073faa02a6a4301e7f0ce365ef50a400aa Mon Sep 17 00:00:00 2001 From: bunnei Date: Sat, 12 Dec 2020 00:24:33 -0800 Subject: video_core: gpu: Implement synchronous mode using threaded GPU. --- src/video_core/gpu.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'src/video_core/gpu.cpp') diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index f99a8a0de..6ab06775f 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -38,7 +38,7 @@ GPU::GPU(Core::System& system_, bool is_async_, bool use_nvdec_) maxwell_dma{std::make_unique(system, *memory_manager)}, kepler_memory{std::make_unique(system, *memory_manager)}, shader_notify{std::make_unique()}, is_async{is_async_}, - gpu_thread{system_} {} + gpu_thread{system_, is_async_} {} GPU::~GPU() = default; @@ -524,7 +524,10 @@ void GPU::WaitIdle() const { } void GPU::OnCommandListEnd() { - gpu_thread.OnCommandListEnd(); + if (is_async) { + // This command only applies to asynchronous GPU mode + gpu_thread.OnCommandListEnd(); + } } } // namespace Tegra -- cgit v1.2.3