diff options
53 files changed, 4033 insertions, 310 deletions
| diff --git a/CMakeLists.txt b/CMakeLists.txt index 45bd03a65..8e9502a97 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -263,6 +263,7 @@ if (CONAN_REQUIRED_LIBS)          libzip:with_openssl=False          libzip:enable_windows_crypto=False      ) +      conan_check(VERSION 1.24.0 REQUIRED)      # Add the bincrafters remote      conan_add_remote(NAME bincrafters @@ -354,6 +355,19 @@ if (NOT LIBUSB_FOUND)      set(LIBUSB_LIBRARIES usb)  endif() +# Use system installed ffmpeg. +if (NOT MSVC) +    find_package(FFmpeg REQUIRED) +else() +    set(FFMPEG_EXT_NAME "ffmpeg-4.2.1") +    set(FFMPEG_PATH "${CMAKE_BINARY_DIR}/externals/${FFMPEG_EXT_NAME}") +    download_bundled_external("ffmpeg/" ${FFMPEG_EXT_NAME} "") +    set(FFMPEG_FOUND YES) +    set(FFMPEG_INCLUDE_DIR "${FFMPEG_PATH}/include" CACHE PATH "Path to FFmpeg headers" FORCE) +    set(FFMPEG_LIBRARY_DIR "${FFMPEG_PATH}/bin" CACHE PATH "Path to FFmpeg library" FORCE) +    set(FFMPEG_DLL_DIR "${FFMPEG_PATH}/bin" CACHE PATH "Path to FFmpeg dll's" FORCE) +endif() +  # Prefer the -pthread flag on Linux.  
set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
diff --git a/CMakeModules/CopyYuzuFFmpegDeps.cmake b/CMakeModules/CopyYuzuFFmpegDeps.cmake
new file mode 100644
index 000000000..cca1eeeab
--- /dev/null
+++ b/CMakeModules/CopyYuzuFFmpegDeps.cmake
@@ -0,0 +1,17 @@
+# Passes the FFmpeg runtime DLLs listed below to windows_copy_files() (provided by the
+# WindowsCopyFiles module), together with FFMPEG_DLL_DIR and the DLL_DEST directory
+# ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/.
+#
+#   target_dir - forwarded unchanged as the first argument of windows_copy_files()
+#
+function(copy_yuzu_FFmpeg_deps target_dir)
+    include(WindowsCopyFiles)
+    set(DLL_DEST "${CMAKE_BINARY_DIR}/bin/$<CONFIG>/")
+    # Quote the directories so an empty or space-containing path stays a single argument.
+    windows_copy_files(${target_dir} "${FFMPEG_DLL_DIR}" "${DLL_DEST}"
+        avcodec-58.dll
+        avutil-56.dll
+        swresample-3.dll
+        swscale-5.dll
+    )
+endfunction()
diff --git a/externals/find-modules/FindFFmpeg.cmake b/externals/find-modules/FindFFmpeg.cmake
new file mode 100644
index 000000000..77b331e00
--- /dev/null
+++ b/externals/find-modules/FindFFmpeg.cmake
@@ -0,0 +1,100 @@
+# - Try to find ffmpeg libraries (libavcodec, libavutil and libswscale)
+# Once done this will define
+#
+# FFMPEG_FOUND - system has ffmpeg or libav
+# FFMPEG_INCLUDE_DIR - the ffmpeg include directory
+# FFMPEG_LIBRARIES - Link these to use ffmpeg
+# FFMPEG_LIBAVCODEC
+# FFMPEG_LIBAVUTIL
+# FFMPEG_LIBSWSCALE
+#
+# Copyright (c) 2008 Andreas Schneider <mail@cynapses.org>
+# Modified for other libraries by Lasse Kärkkäinen <tronic>
+# Modified for Hedgewars by Stepik777
+# Modified for FFmpeg-example Tuukka Pasanen 2018
+# Modified for yuzu toastUnlimted 2020
+#
+# Redistribution and use is allowed according to the terms of the New
+# BSD license.
+#
+
+# FFMPEG_FOUND is decided by the search below. find_package_handle_standard_args()
+# used to be called here, but that ran before anything had been searched for and
+# tested FFMPEG_LIBRARY, a variable this module never sets.
+
+if(FFMPEG_LIBRARIES AND FFMPEG_INCLUDE_DIR)
+  # in cache already
+  set(FFMPEG_FOUND TRUE)
+else()
+  # Start from a clean state so a stale value cannot survive a failed search.
+  set(FFMPEG_FOUND FALSE)
+  # use pkg-config to get the directories and then use these values
+  # in the FIND_PATH() and FIND_LIBRARY() calls
+  find_package(PkgConfig)
+  if(PKG_CONFIG_FOUND)
+    pkg_check_modules(_FFMPEG_AVCODEC libavcodec)
+    pkg_check_modules(_FFMPEG_AVUTIL libavutil)
+    pkg_check_modules(_FFMPEG_SWSCALE libswscale)
+  endif()
+
+  find_path(FFMPEG_AVCODEC_INCLUDE_DIR
+    NAMES libavcodec/avcodec.h
+    PATHS ${_FFMPEG_AVCODEC_INCLUDE_DIRS}
+      /usr/include
+      /usr/local/include
+      /opt/local/include
+      /sw/include
+    PATH_SUFFIXES ffmpeg libav)
+
+  find_library(FFMPEG_LIBAVCODEC
+    NAMES avcodec
+    PATHS ${_FFMPEG_AVCODEC_LIBRARY_DIRS}
+      /usr/lib
+      /usr/local/lib
+      /opt/local/lib
+      /sw/lib)
+
+  find_library(FFMPEG_LIBAVUTIL
+    NAMES avutil
+    PATHS ${_FFMPEG_AVUTIL_LIBRARY_DIRS}
+      /usr/lib
+      /usr/local/lib
+      /opt/local/lib
+      /sw/lib)
+
+  find_library(FFMPEG_LIBSWSCALE
+    NAMES swscale
+    PATHS ${_FFMPEG_SWSCALE_LIBRARY_DIRS}
+      /usr/lib
+      /usr/local/lib
+      /opt/local/lib
+      /sw/lib)
+
+  # The headers are needed as much as the libraries, so require both.
+  if(FFMPEG_LIBAVCODEC AND FFMPEG_LIBAVUTIL AND FFMPEG_LIBSWSCALE
+     AND FFMPEG_AVCODEC_INCLUDE_DIR)
+    set(FFMPEG_FOUND TRUE)
+  endif()
+
+  if(FFMPEG_FOUND)
+    set(FFMPEG_INCLUDE_DIR ${FFMPEG_AVCODEC_INCLUDE_DIR})
+    set(FFMPEG_LIBRARIES
+      ${FFMPEG_LIBAVCODEC}
+      ${FFMPEG_LIBAVUTIL}
+      ${FFMPEG_LIBSWSCALE})
+  endif()
+
+  if(FFMPEG_FOUND)
+    # find_package(FFmpeg QUIET) sets FFmpeg_FIND_QUIETLY; accept the all-caps name too.
+    if(NOT FFmpeg_FIND_QUIETLY AND NOT FFMPEG_FIND_QUIETLY)
+      message(STATUS
+      "Found FFMPEG or Libav: ${FFMPEG_LIBRARIES}, ${FFMPEG_INCLUDE_DIR}")
+    endif()
+  else()
+    # find_package(FFmpeg REQUIRED) sets FFmpeg_FIND_REQUIRED; accept the all-caps name too.
+    if(FFmpeg_FIND_REQUIRED OR FFMPEG_FIND_REQUIRED)
+      message(FATAL_ERROR
+      "Could not find libavcodec or libavutil
or libswscale")
+    endif()
+  endif()
+endif()
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index 0fb5d9708..e50ab2922 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -150,6 +150,8 @@ add_library(common STATIC
     scope_exit.h
     spin_lock.cpp
     spin_lock.h
+    stream.cpp
+    stream.h
     string_util.cpp
     string_util.h
     swap.h
diff --git a/src/common/stream.cpp b/src/common/stream.cpp
new file mode 100644
index 000000000..bf0496c26
--- /dev/null
+++ b/src/common/stream.cpp
@@ -0,0 +1,58 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <stdexcept>
+#include "common/common_types.h"
+#include "common/stream.h"
+
+namespace Common {
+
+Stream::Stream() = default;
+Stream::~Stream() = default;
+
+// Moves the cursor to `offset` relative to `origin`. FromCurrentPos and FromEnd are
+// converted to an absolute offset and re-dispatched as SetOrigin, which clamps the
+// result to [0, buffer.size()].
+void Stream::Seek(s32 offset, SeekOrigin origin) {
+    if (origin == SeekOrigin::SetOrigin) {
+        if (offset < 0) {
+            position = 0;
+        } else if (static_cast<std::size_t>(offset) >= buffer.size()) {
+            // Clamp on the requested offset, not on the current position; otherwise the
+            // cursor could be placed past the end of the buffer.
+            position = buffer.size();
+        } else {
+            position = static_cast<std::size_t>(offset);
+        }
+    } else if (origin == SeekOrigin::FromCurrentPos) {
+        Seek(static_cast<s32>(position) + offset, SeekOrigin::SetOrigin);
+    } else if (origin == SeekOrigin::FromEnd) {
+        // Note: the offset is subtracted, so a positive offset moves back from the end.
+        Seek(static_cast<s32>(buffer.size()) - offset, SeekOrigin::SetOrigin);
+    }
+}
+
+// Returns the byte under the cursor and advances the cursor by one.
+// Throws std::out_of_range when the cursor is not inside the buffer.
+u8 Stream::ReadByte() {
+    if (position < buffer.size()) {
+        return buffer[position++];
+    } else {
+        throw std::out_of_range("Attempting to read a byte not within the buffer range");
+    }
+}
+
+// Appends `byte` and advances the cursor when the cursor is at the end of the buffer;
+// otherwise inserts `byte` in front of the element under the cursor and leaves the
+// cursor value unchanged.
+void Stream::WriteByte(u8 byte) {
+    if (position == buffer.size()) {
+        buffer.push_back(byte);
+        position++;
+    } else {
+        buffer.insert(buffer.begin() + position, byte);
+    }
+}
+
+} // namespace Common
diff --git a/src/common/stream.h b/src/common/stream.h
new file mode 100644
index 000000000..2585c16af
--- /dev/null
+++ b/src/common/stream.h
@@ -0,0 +1,57 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <cstddef>
+#include <vector>
+#include "common/common_types.h"
+
+namespace Common {
+
+/// Reference point that the offset passed to Stream::Seek is relative to.
+enum class SeekOrigin {
+    SetOrigin,
+    FromCurrentPos,
+    FromEnd,
+};
+
+/// Byte buffer with a cursor that ReadByte/WriteByte operate on.
+class Stream {
+public:
+    /// Stream creates a bitstream and provides common functionality on the stream.
+    explicit Stream();
+    ~Stream();
+
+    /// Reposition bitstream "cursor" to the specified offset from origin
+    void Seek(s32 offset, SeekOrigin origin);
+
+    /// Reads next byte in the stream buffer and increments position.
+    /// Throws std::out_of_range when the position is not inside the buffer.
+    u8 ReadByte();
+
+    /// Appends the byte (and advances the position) when the position is at the end of
+    /// the buffer; otherwise inserts it in front of the byte at the current position.
+    void WriteByte(u8 byte);
+
+    /// Current cursor position, as an index into the buffer.
+    std::size_t GetPosition() const {
+        return position;
+    }
+
+    /// Direct access to the underlying bytes.
+    std::vector<u8>& GetBuffer() {
+        return buffer;
+    }
+
+    const std::vector<u8>& GetBuffer() const {
+        return buffer;
+    }
+
+private:
+    std::vector<u8> buffer;
+    std::size_t position{0};
+};
+
+} // namespace Common
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index db1c9fdef..e0f207f3e 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -439,6 +439,8 @@ add_library(core STATIC
     hle/service/nvdrv/devices/nvhost_gpu.h
     hle/service/nvdrv/devices/nvhost_nvdec.cpp
     hle/service/nvdrv/devices/nvhost_nvdec.h
+    hle/service/nvdrv/devices/nvhost_nvdec_common.cpp
+    hle/service/nvdrv/devices/nvhost_nvdec_common.h
     hle/service/nvdrv/devices/nvhost_nvjpg.cpp
     hle/service/nvdrv/devices/nvhost_nvjpg.h
     hle/service/nvdrv/devices/nvhost_vic.cpp
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp
index fcb612864..b6df48360 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp
@@ -2,15 +2,17 @@
 //
Licensed under GPLv2 or any later version  // Refer to the license.txt file included. -#include <cstring> -  #include "common/assert.h"  #include "common/logging/log.h" +#include "core/core.h"  #include "core/hle/service/nvdrv/devices/nvhost_nvdec.h" +#include "video_core/memory_manager.h" +#include "video_core/renderer_base.h"  namespace Service::Nvidia::Devices { -nvhost_nvdec::nvhost_nvdec(Core::System& system) : nvdevice(system) {} +nvhost_nvdec::nvhost_nvdec(Core::System& system, std::shared_ptr<nvmap> nvmap_dev) +    : nvhost_nvdec_common(system, std::move(nvmap_dev)) {}  nvhost_nvdec::~nvhost_nvdec() = default;  u32 nvhost_nvdec::ioctl(Ioctl command, const std::vector<u8>& input, const std::vector<u8>& input2, @@ -21,7 +23,7 @@ u32 nvhost_nvdec::ioctl(Ioctl command, const std::vector<u8>& input, const std::      switch (static_cast<IoctlCommand>(command.raw)) {      case IoctlCommand::IocSetNVMAPfdCommand: -        return SetNVMAPfd(input, output); +        return SetNVMAPfd(input);      case IoctlCommand::IocSubmit:          return Submit(input, output);      case IoctlCommand::IocGetSyncpoint: @@ -29,79 +31,29 @@ u32 nvhost_nvdec::ioctl(Ioctl command, const std::vector<u8>& input, const std::      case IoctlCommand::IocGetWaitbase:          return GetWaitbase(input, output);      case IoctlCommand::IocMapBuffer: -        return MapBuffer(input, output); +    case IoctlCommand::IocMapBuffer2: +    case IoctlCommand::IocMapBuffer3:      case IoctlCommand::IocMapBufferEx: -        return MapBufferEx(input, output); -    case IoctlCommand::IocUnmapBufferEx: -        return UnmapBufferEx(input, output); +        return MapBuffer(input, output); +    case IoctlCommand::IocUnmapBufferEx: { +        // This command is sent when the video stream has ended, flush all video contexts +        // This is usually sent in the folowing order: vic, nvdec, vic. +        // Inform the GPU to clear any remaining nvdec buffers when this is detected. 
+        LOG_INFO(Service_NVDRV, "NVDEC video stream ended"); +        Tegra::ChCommandHeaderList cmdlist(1); +        cmdlist[0] = Tegra::ChCommandHeader{0xDEADB33F}; +        system.GPU().PushCommandBuffer(cmdlist); +        [[fallthrough]]; // fallthrough to unmap buffers +    }; +    case IoctlCommand::IocUnmapBuffer: +    case IoctlCommand::IocUnmapBuffer2: +    case IoctlCommand::IocUnmapBuffer3: +        return UnmapBuffer(input, output); +    case IoctlCommand::IocSetSubmitTimeout: +        return SetSubmitTimeout(input, output);      } -    UNIMPLEMENTED_MSG("Unimplemented ioctl"); -    return 0; -} - -u32 nvhost_nvdec::SetNVMAPfd(const std::vector<u8>& input, std::vector<u8>& output) { -    IoctlSetNvmapFD params{}; -    std::memcpy(¶ms, input.data(), sizeof(IoctlSetNvmapFD)); -    LOG_DEBUG(Service_NVDRV, "called, fd={}", params.nvmap_fd); - -    nvmap_fd = params.nvmap_fd; -    return 0; -} - -u32 nvhost_nvdec::Submit(const std::vector<u8>& input, std::vector<u8>& output) { -    IoctlSubmit params{}; -    std::memcpy(¶ms, input.data(), sizeof(IoctlSubmit)); -    LOG_WARNING(Service_NVDRV, "(STUBBED) called"); -    std::memcpy(output.data(), ¶ms, sizeof(IoctlSubmit)); -    return 0; -} - -u32 nvhost_nvdec::GetSyncpoint(const std::vector<u8>& input, std::vector<u8>& output) { -    IoctlGetSyncpoint params{}; -    std::memcpy(¶ms, input.data(), sizeof(IoctlGetSyncpoint)); -    LOG_INFO(Service_NVDRV, "called, unknown=0x{:X}", params.unknown); -    params.value = 0; // Seems to be hard coded at 0 -    std::memcpy(output.data(), ¶ms, sizeof(IoctlGetSyncpoint)); -    return 0; -} - -u32 nvhost_nvdec::GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output) { -    IoctlGetWaitbase params{}; -    std::memcpy(¶ms, input.data(), sizeof(IoctlGetWaitbase)); -    LOG_INFO(Service_NVDRV, "called, unknown=0x{:X}", params.unknown); -    params.value = 0; // Seems to be hard coded at 0 -    std::memcpy(output.data(), ¶ms, sizeof(IoctlGetWaitbase)); -    return 
0; -} - -u32 nvhost_nvdec::MapBuffer(const std::vector<u8>& input, std::vector<u8>& output) { -    IoctlMapBuffer params{}; -    std::memcpy(¶ms, input.data(), sizeof(IoctlMapBuffer)); -    LOG_WARNING(Service_NVDRV, "(STUBBED) called with address={:08X}{:08X}", params.address_2, -                params.address_1); -    params.address_1 = 0; -    params.address_2 = 0; -    std::memcpy(output.data(), ¶ms, sizeof(IoctlMapBuffer)); -    return 0; -} - -u32 nvhost_nvdec::MapBufferEx(const std::vector<u8>& input, std::vector<u8>& output) { -    IoctlMapBufferEx params{}; -    std::memcpy(¶ms, input.data(), sizeof(IoctlMapBufferEx)); -    LOG_WARNING(Service_NVDRV, "(STUBBED) called with address={:08X}{:08X}", params.address_2, -                params.address_1); -    params.address_1 = 0; -    params.address_2 = 0; -    std::memcpy(output.data(), ¶ms, sizeof(IoctlMapBufferEx)); -    return 0; -} - -u32 nvhost_nvdec::UnmapBufferEx(const std::vector<u8>& input, std::vector<u8>& output) { -    IoctlUnmapBufferEx params{}; -    std::memcpy(¶ms, input.data(), sizeof(IoctlUnmapBufferEx)); -    LOG_WARNING(Service_NVDRV, "(STUBBED) called"); -    std::memcpy(output.data(), ¶ms, sizeof(IoctlUnmapBufferEx)); +    UNIMPLEMENTED_MSG("Unimplemented ioctl 0x{:X}", command.raw);      return 0;  } diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.h b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.h index 4332db118..102777ddd 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.h @@ -4,16 +4,14 @@  #pragma once -#include <vector> -#include "common/common_types.h" -#include "common/swap.h" -#include "core/hle/service/nvdrv/devices/nvdevice.h" +#include <memory> +#include "core/hle/service/nvdrv/devices/nvhost_nvdec_common.h"  namespace Service::Nvidia::Devices { -class nvhost_nvdec final : public nvdevice { +class nvhost_nvdec final : public nvhost_nvdec_common {  public: -    explicit nvhost_nvdec(Core::System& 
system); +    explicit nvhost_nvdec(Core::System& system, std::shared_ptr<nvmap> nvmap_dev);      ~nvhost_nvdec() override;      u32 ioctl(Ioctl command, const std::vector<u8>& input, const std::vector<u8>& input2, @@ -27,62 +25,15 @@ private:          IocGetSyncpoint = 0xC0080002,          IocGetWaitbase = 0xC0080003,          IocMapBuffer = 0xC01C0009, +        IocMapBuffer2 = 0xC16C0009, +        IocMapBuffer3 = 0xC15C0009,          IocMapBufferEx = 0xC0A40009, -        IocUnmapBufferEx = 0xC0A4000A, +        IocUnmapBuffer = 0xC0A4000A, +        IocUnmapBuffer2 = 0xC16C000A, +        IocUnmapBufferEx = 0xC01C000A, +        IocUnmapBuffer3 = 0xC15C000A, +        IocSetSubmitTimeout = 0x40040007,      }; - -    struct IoctlSetNvmapFD { -        u32_le nvmap_fd; -    }; -    static_assert(sizeof(IoctlSetNvmapFD) == 0x4, "IoctlSetNvmapFD is incorrect size"); - -    struct IoctlSubmit { -        INSERT_PADDING_BYTES(0x40); // TODO(DarkLordZach): RE this structure -    }; -    static_assert(sizeof(IoctlSubmit) == 0x40, "IoctlSubmit has incorrect size"); - -    struct IoctlGetSyncpoint { -        u32 unknown; // seems to be ignored? Nintendo added this -        u32 value; -    }; -    static_assert(sizeof(IoctlGetSyncpoint) == 0x08, "IoctlGetSyncpoint has incorrect size"); - -    struct IoctlGetWaitbase { -        u32 unknown; // seems to be ignored? 
Nintendo added this -        u32 value; -    }; -    static_assert(sizeof(IoctlGetWaitbase) == 0x08, "IoctlGetWaitbase has incorrect size"); - -    struct IoctlMapBuffer { -        u32 unknown; -        u32 address_1; -        u32 address_2; -        INSERT_PADDING_BYTES(0x10); // TODO(DarkLordZach): RE this structure -    }; -    static_assert(sizeof(IoctlMapBuffer) == 0x1C, "IoctlMapBuffer is incorrect size"); - -    struct IoctlMapBufferEx { -        u32 unknown; -        u32 address_1; -        u32 address_2; -        INSERT_PADDING_BYTES(0x98); // TODO(DarkLordZach): RE this structure -    }; -    static_assert(sizeof(IoctlMapBufferEx) == 0xA4, "IoctlMapBufferEx has incorrect size"); - -    struct IoctlUnmapBufferEx { -        INSERT_PADDING_BYTES(0xA4); // TODO(DarkLordZach): RE this structure -    }; -    static_assert(sizeof(IoctlUnmapBufferEx) == 0xA4, "IoctlUnmapBufferEx has incorrect size"); - -    u32_le nvmap_fd{}; - -    u32 SetNVMAPfd(const std::vector<u8>& input, std::vector<u8>& output); -    u32 Submit(const std::vector<u8>& input, std::vector<u8>& output); -    u32 GetSyncpoint(const std::vector<u8>& input, std::vector<u8>& output); -    u32 GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output); -    u32 MapBuffer(const std::vector<u8>& input, std::vector<u8>& output); -    u32 MapBufferEx(const std::vector<u8>& input, std::vector<u8>& output); -    u32 UnmapBufferEx(const std::vector<u8>& input, std::vector<u8>& output);  };  } // namespace Service::Nvidia::Devices diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp new file mode 100644 index 000000000..85792495f --- /dev/null +++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp @@ -0,0 +1,234 @@ +// Copyright 2020 yuzu emulator team +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+
+#include <algorithm>
+#include <cstring>
+#include <optional>
+
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "common/logging/log.h"
+#include "core/core.h"
+#include "core/hle/service/nvdrv/devices/nvhost_nvdec_common.h"
+#include "core/hle/service/nvdrv/devices/nvmap.h"
+#include "core/memory.h"
+#include "video_core/memory_manager.h"
+#include "video_core/renderer_base.h"
+
+namespace Service::Nvidia::Devices {
+
+namespace {
+// Copies `count` elements of type T from `input`, starting at byte `offset`, into `dst` and
+// returns the offset just past that range. The counts are read from the ioctl input, so the
+// copy is skipped and `dst` left untouched when it would run past either buffer.
+template <typename T>
+std::size_t SpliceVectors(const std::vector<u8>& input, std::vector<T>& dst, std::size_t count,
+                          std::size_t offset) {
+    const std::size_t num_bytes = count * sizeof(T);
+    if (num_bytes == 0) {
+        return offset;
+    }
+    if (count > dst.size() || offset > input.size() || num_bytes > input.size() - offset) {
+        LOG_ERROR(Service_NVDRV, "input too small, offset={} bytes={} input size={}", offset,
+                  num_bytes, input.size());
+        return offset + num_bytes;
+    }
+    std::memcpy(dst.data(), input.data() + offset, num_bytes);
+    return offset + num_bytes;
+}
+
+// Copies all of `src` into `dst` at byte `offset` and returns the offset just past the
+// written range. Nothing is written when the data does not fit into `dst`.
+template <typename T>
+std::size_t WriteVectors(std::vector<u8>& dst, const std::vector<T>& src, std::size_t offset) {
+    const std::size_t num_bytes = src.size() * sizeof(T);
+    if (num_bytes == 0) {
+        return offset;
+    }
+    if (offset > dst.size() || num_bytes > dst.size() - offset) {
+        LOG_ERROR(Service_NVDRV, "output too small, offset={} bytes={} output size={}", offset,
+                  num_bytes, dst.size());
+        return offset + num_bytes;
+    }
+    std::memcpy(dst.data() + offset, src.data(), num_bytes);
+    return offset + num_bytes;
+}
+} // Anonymous namespace
+
+namespace NvErrCodes {
+constexpr u32 Success{};
+constexpr u32 OutOfMemory{static_cast<u32>(-12)};
+constexpr u32 InvalidInput{static_cast<u32>(-22)};
+} // namespace NvErrCodes
+
+nvhost_nvdec_common::nvhost_nvdec_common(Core::System& system, std::shared_ptr<nvmap> nvmap_dev)
+    : nvdevice(system), nvmap_dev(std::move(nvmap_dev)) {}
+nvhost_nvdec_common::~nvhost_nvdec_common() = default;
+
+// Stores the nvmap fd found at the start of `input`.
+u32 nvhost_nvdec_common::SetNVMAPfd(const std::vector<u8>& input) {
+    IoctlSetNvmapFD params{};
+    std::memcpy(&params, input.data(), sizeof(IoctlSetNvmapFD));
+    LOG_DEBUG(Service_NVDRV, "called, fd={}", params.nvmap_fd);
+
+    nvmap_fd = params.nvmap_fd;
+    return 0;
+}
+
+// Parses the IoctlSubmit header and the arrays that follow it, pushes every referenced
+// command buffer to the GPU, then writes the header and arrays back to `output`.
+u32 nvhost_nvdec_common::Submit(const std::vector<u8>& input, std::vector<u8>& output) {
+    IoctlSubmit params{};
+    std::memcpy(&params, input.data(), sizeof(IoctlSubmit));
+    LOG_DEBUG(Service_NVDRV, "called NVDEC Submit, cmd_buffer_count={}", params.cmd_buffer_count);
+
+    // Instantiate param buffers
+    std::size_t offset = sizeof(IoctlSubmit);
+    std::vector<CommandBuffer> command_buffers(params.cmd_buffer_count);
+    std::vector<Reloc> relocs(params.relocation_count);
+    std::vector<u32> reloc_shifts(params.relocation_count);
+    std::vector<SyncptIncr> syncpt_increments(params.syncpoint_count);
+    std::vector<SyncptIncr> wait_checks(params.syncpoint_count);
+    std::vector<Fence> fences(params.fence_count);
+
+    // Splice input into their respective buffers
+    offset = SpliceVectors(input, command_buffers, params.cmd_buffer_count, offset);
+    offset = SpliceVectors(input, relocs, params.relocation_count, offset);
+    offset = SpliceVectors(input, reloc_shifts, params.relocation_count, offset);
+    offset = SpliceVectors(input, syncpt_increments, params.syncpoint_count, offset);
+    offset = SpliceVectors(input, wait_checks, params.syncpoint_count, offset);
+    offset = SpliceVectors(input, fences, params.fence_count, offset);
+
+    // TODO(ameerj): For async gpu, utilize fences for syncpoint 'max' increment
+
+    auto& gpu = system.GPU();
+
+    for (const auto& cmd_buffer : command_buffers) {
+        auto object = nvmap_dev->GetObject(cmd_buffer.memory_id);
+        ASSERT_OR_EXECUTE(object, return NvErrCodes::InvalidInput;);
+        const auto map = FindBufferMap(object->dma_map_addr);
+        if (!map) {
+            LOG_ERROR(Service_NVDRV, "Tried to submit an invalid offset 0x{:X} dma 0x{:X}",
+                      object->addr, object->dma_map_addr);
+            return 0;
+        }
+        Tegra::ChCommandHeaderList cmdlist(cmd_buffer.word_count);
+        gpu.MemoryManager().ReadBlock(map->StartAddr() + cmd_buffer.offset, cmdlist.data(),
+                                      cmdlist.size() * sizeof(u32));
+        gpu.PushCommandBuffer(cmdlist);
+    }
+
+    std::memcpy(output.data(), &params, sizeof(IoctlSubmit));
+    // Some games expect command_buffers to be written back
+    offset = sizeof(IoctlSubmit);
+    offset = WriteVectors(output, command_buffers, offset);
+    offset = WriteVectors(output, relocs, offset);
+    offset = WriteVectors(output, reloc_shifts, offset);
+    offset = WriteVectors(output, syncpt_increments, offset);
+    offset = WriteVectors(output, wait_checks, offset);
+
+    return NvErrCodes::Success;
+}
+
+u32 nvhost_nvdec_common::GetSyncpoint(const std::vector<u8>& input, std::vector<u8>& output) {
+    IoctlGetSyncpoint params{};
+    std::memcpy(&params, input.data(), sizeof(IoctlGetSyncpoint));
+    LOG_DEBUG(Service_NVDRV, "called GetSyncpoint, id={}", params.param);
+
+    // We found that implementing this causes deadlocks with async gpu, along with degraded
+    // performance. TODO: RE the nvdec async implementation
+    params.value = 0;
+    std::memcpy(output.data(), &params, sizeof(IoctlGetSyncpoint));
+
+    return NvErrCodes::Success;
+}
+
+u32 nvhost_nvdec_common::GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output) {
+    IoctlGetWaitbase params{};
+    std::memcpy(&params, input.data(), sizeof(IoctlGetWaitbase));
+    params.value = 0; // Seems to be hard coded at 0
+    std::memcpy(output.data(), &params, sizeof(IoctlGetWaitbase));
+    return 0;
+}
+
+// Maps every nvmap handle listed after the IoctlMapBuffer header that has no DMA address yet,
+// records the mapping, and writes the resulting addresses back to `output`.
+u32 nvhost_nvdec_common::MapBuffer(const std::vector<u8>& input, std::vector<u8>& output) {
+    IoctlMapBuffer params{};
+    std::memcpy(&params, input.data(), sizeof(IoctlMapBuffer));
+    std::vector<MapBufferEntry> cmd_buffer_handles(params.num_entries);
+
+    SpliceVectors(input, cmd_buffer_handles, params.num_entries, sizeof(IoctlMapBuffer));
+
+    auto& gpu = system.GPU();
+
+    for (auto& cmf_buff : cmd_buffer_handles) {
+        auto object{nvmap_dev->GetObject(cmf_buff.map_handle)};
+        if (!object) {
+            LOG_ERROR(Service_NVDRV, "invalid cmd_buffer nvmap_handle={:X}", cmf_buff.map_handle);
+            // params is only sizeof(IoctlMapBuffer) bytes long; never read past it.
+            std::memcpy(output.data(), &params, std::min(output.size(), sizeof(IoctlMapBuffer)));
+            return NvErrCodes::InvalidInput;
+        }
+        if (object->dma_map_addr == 0) {
+            // NVDEC and VIC memory is in the 32-bit address space
+            // MapAllocate32 will attempt to map a lower 32-bit value in the shared gpu memory space
+            const GPUVAddr low_addr = gpu.MemoryManager().MapAllocate32(object->addr, object->size);
+            object->dma_map_addr = static_cast<u32>(low_addr);
+            // Ensure that the dma_map_addr is indeed in the lower 32-bit address space.
+            ASSERT(object->dma_map_addr == low_addr);
+        }
+        if (!object->dma_map_addr) {
+            LOG_ERROR(Service_NVDRV, "failed to map size={}", object->size);
+        } else {
+            cmf_buff.map_address = object->dma_map_addr;
+            AddBufferMap(object->dma_map_addr, object->size, object->addr,
+                         object->status == nvmap::Object::Status::Allocated);
+        }
+    }
+    std::memcpy(output.data(), &params, sizeof(IoctlMapBuffer));
+    // Bounds-checked write of the entries behind the header.
+    WriteVectors(output, cmd_buffer_handles, sizeof(IoctlMapBuffer));
+
+    return NvErrCodes::Success;
+}
+
+// Drops the recorded mapping of every nvmap handle listed after the IoctlMapBuffer header,
+// unmaps it from GPU memory when RemoveBufferMap yields a size, and zeroes `output`.
+u32 nvhost_nvdec_common::UnmapBuffer(const std::vector<u8>& input, std::vector<u8>& output) {
+    IoctlMapBuffer params{};
+    std::memcpy(&params, input.data(), sizeof(IoctlMapBuffer));
+    std::vector<MapBufferEntry> cmd_buffer_handles(params.num_entries);
+    SpliceVectors(input, cmd_buffer_handles, params.num_entries, sizeof(IoctlMapBuffer));
+
+    auto& gpu = system.GPU();
+
+    for (auto& cmf_buff : cmd_buffer_handles) {
+        const auto object{nvmap_dev->GetObject(cmf_buff.map_handle)};
+        if (!object) {
+            LOG_ERROR(Service_NVDRV, "invalid cmd_buffer nvmap_handle={:X}", cmf_buff.map_handle);
+            // params is only sizeof(IoctlMapBuffer) bytes long; never read past it.
+            std::memcpy(output.data(), &params, std::min(output.size(), sizeof(IoctlMapBuffer)));
+            return NvErrCodes::InvalidInput;
+        }
+        if (const auto size{RemoveBufferMap(object->dma_map_addr)}; size) {
+            gpu.MemoryManager().Unmap(object->dma_map_addr, *size);
+        } else {
+            // This occurs quite frequently, however does not seem to impact functionality
+            LOG_DEBUG(Service_NVDRV, "invalid offset=0x{:X} dma=0x{:X}", object->addr,
+                      object->dma_map_addr);
+        }
+        object->dma_map_addr = 0;
+    }
+    std::memset(output.data(), 0, output.size());
+    return NvErrCodes::Success;
+}
+
+// Stores the timeout value found at the start of `input`; nothing else is done with it here.
+u32 nvhost_nvdec_common::SetSubmitTimeout(const std::vector<u8>& input, std::vector<u8>& output) {
+    // Never copy more than the destination field can hold.
+    std::memcpy(&submit_timeout, input.data(), std::min(input.size(), sizeof(submit_timeout)));
+    LOG_WARNING(Service_NVDRV, "(STUBBED) called");
+    return NvErrCodes::Success;
+}
+
+// Returns the mapping whose [StartAddr, EndAddr) range contains gpu_addr, or std::nullopt
+// when no recorded mapping covers it.
+std::optional<nvhost_nvdec_common::BufferMap> nvhost_nvdec_common::FindBufferMap(
+    GPUVAddr gpu_addr) const {
+    // Only entries keyed at or below gpu_addr can contain it.
+    const auto search_end = buffer_mappings.upper_bound(gpu_addr);
+    const auto it =
+        std::find_if(buffer_mappings.begin(), search_end, [&](const auto& entry) {
+            return (gpu_addr >= entry.second.StartAddr() && gpu_addr < entry.second.EndAddr());
+        });
+
+    // find_if returns the end of the searched range on failure, which is not necessarily
+    // buffer_mappings.end(), so compare against the bound that was actually used.
+    if (it == search_end) {
+        return std::nullopt;
+    }
+    return it->second;
+}
+
+void nvhost_nvdec_common::AddBufferMap(GPUVAddr gpu_addr, std::size_t size, VAddr cpu_addr,
+                                       bool is_allocated) {
+    buffer_mappings.insert_or_assign(gpu_addr, BufferMap{gpu_addr, size, cpu_addr, is_allocated});
+}
+
+// Erases the mapping keyed by gpu_addr. Returns std::nullopt when there is none; otherwise
+// the mapping's size, or 0 when the mapping was not flagged as allocated.
+std::optional<std::size_t> nvhost_nvdec_common::RemoveBufferMap(GPUVAddr gpu_addr) {
+    const auto iter{buffer_mappings.find(gpu_addr)};
+    if (iter == buffer_mappings.end()) {
+        return std::nullopt;
+    }
+    std::size_t size = 0;
+    if (iter->second.IsAllocated()) {
+        size = iter->second.Size();
+    }
+    buffer_mappings.erase(iter);
+    return size;
+}
+
+} // namespace Service::Nvidia::Devices
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h
b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h
new file mode 100644
index 000000000..c249c5349
--- /dev/null
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h
@@ -0,0 +1,180 @@
+// Copyright 2020 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <optional>
+#include <vector>
+#include "common/common_types.h"
+#include "common/swap.h"
+#include "core/hle/service/nvdrv/devices/nvdevice.h"
+
+namespace Service::Nvidia::Devices {
+class nvmap;
+
+/// Ioctl parameter layouts, handlers and buffer-map bookkeeping shared by the nvhost_nvdec
+/// and nvhost_vic devices, both of which derive from this class.
+class nvhost_nvdec_common : public nvdevice {
+public:
+    explicit nvhost_nvdec_common(Core::System& system, std::shared_ptr<nvmap> nvmap_dev);
+    ~nvhost_nvdec_common() override;
+
+    virtual u32 ioctl(Ioctl command, const std::vector<u8>& input, const std::vector<u8>& input2,
+                      std::vector<u8>& output, std::vector<u8>& output2, IoctlCtrl& ctrl,
+                      IoctlVersion version) = 0;
+
+protected:
+    /// Address range [StartAddr, EndAddr) of one mapped buffer, plus the CPU address and
+    /// allocation flag recorded when the mapping was added.
+    class BufferMap final {
+    public:
+        constexpr BufferMap() = default;
+
+        constexpr BufferMap(GPUVAddr start_addr, std::size_t size)
+            : start_addr{start_addr}, end_addr{start_addr + size} {}
+
+        constexpr BufferMap(GPUVAddr start_addr, std::size_t size, VAddr cpu_addr,
+                            bool is_allocated)
+            : start_addr{start_addr}, end_addr{start_addr + size}, cpu_addr{cpu_addr},
+              is_allocated{is_allocated} {}
+
+        // NOTE(review): StartAddr/EndAddr return VAddr although the members they expose are
+        // GPUVAddr; confirm the two aliases are interchangeable before relying on that.
+        constexpr VAddr StartAddr() const {
+            return start_addr;
+        }
+
+        constexpr VAddr EndAddr() const {
+            return end_addr;
+        }
+
+        constexpr std::size_t Size() const {
+            return end_addr - start_addr;
+        }
+
+        constexpr VAddr CpuAddr() const {
+            return cpu_addr;
+        }
+
+        constexpr bool IsAllocated() const {
+            return is_allocated;
+        }
+
+    private:
+        GPUVAddr start_addr{};
+        GPUVAddr end_addr{};
+        VAddr cpu_addr{};
+        bool is_allocated{};
+    };
+
+    struct IoctlSetNvmapFD {
+        u32_le nvmap_fd;
+    };
+    static_assert(sizeof(IoctlSetNvmapFD) == 4, "IoctlSetNvmapFD is incorrect size");
+
+    struct IoctlSubmitCommandBuffer {
+        u32_le id;
+        u32_le offset;
+        u32_le count;
+    };
+    static_assert(sizeof(IoctlSubmitCommandBuffer) == 0xC,
+                  "IoctlSubmitCommandBuffer is incorrect size");
+    struct IoctlSubmit {
+        u32_le cmd_buffer_count;
+        u32_le relocation_count;
+        u32_le syncpoint_count;
+        u32_le fence_count;
+    };
+    static_assert(sizeof(IoctlSubmit) == 0x10, "IoctlSubmit has incorrect size");
+
+    struct CommandBuffer {
+        s32 memory_id;
+        u32 offset;
+        s32 word_count;
+    };
+    static_assert(sizeof(CommandBuffer) == 0xC, "CommandBuffer has incorrect size");
+
+    struct Reloc {
+        s32 cmdbuffer_memory;
+        s32 cmdbuffer_offset;
+        s32 target;
+        s32 target_offset;
+    };
+    static_assert(sizeof(Reloc) == 0x10, "Reloc has incorrect size");
+
+    struct SyncptIncr {
+        u32 id;
+        u32 increments;
+    };
+    static_assert(sizeof(SyncptIncr) == 0x8, "SyncptIncr has incorrect size");
+
+    struct Fence {
+        u32 id;
+        u32 value;
+    };
+    static_assert(sizeof(Fence) == 0x8, "Fence has incorrect size");
+
+    struct IoctlGetSyncpoint {
+        // Input
+        u32_le param;
+        // Output
+        u32_le value;
+    };
+    static_assert(sizeof(IoctlGetSyncpoint) == 8, "IoctlGetSyncpoint has wrong size");
+
+    struct IoctlGetWaitbase {
+        u32_le unknown; // seems to be ignored? Nintendo added this
+        u32_le value;
+    };
+    static_assert(sizeof(IoctlGetWaitbase) == 0x8, "IoctlGetWaitbase is incorrect size");
+
+    struct IoctlMapBuffer {
+        u32_le num_entries;
+        u32_le data_address; // Ignored by the driver.
+        u32_le attach_host_ch_das;
+    };
+    static_assert(sizeof(IoctlMapBuffer) == 0x0C, "IoctlMapBuffer is incorrect size");
+
+    struct IocGetIdParams {
+        // Input
+        u32_le param;
+        // Output
+        u32_le value;
+    };
+    static_assert(sizeof(IocGetIdParams) == 8, "IocGetIdParams has wrong size");
+
+    // Used for mapping and unmapping command buffers
+    struct MapBufferEntry {
+        u32_le map_handle;
+        u32_le map_address;
+    };
+    static_assert(sizeof(MapBufferEntry) == 0x8, "MapBufferEntry is incorrect size");
+
+    /// Ioctl command implementations
+    u32 SetNVMAPfd(const std::vector<u8>& input);
+    u32 Submit(const std::vector<u8>& input, std::vector<u8>& output);
+    u32 GetSyncpoint(const std::vector<u8>& input, std::vector<u8>& output);
+    u32 GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output);
+    u32 MapBuffer(const std::vector<u8>& input, std::vector<u8>& output);
+    u32 UnmapBuffer(const std::vector<u8>& input, std::vector<u8>& output);
+    u32 SetSubmitTimeout(const std::vector<u8>& input, std::vector<u8>& output);
+
+    /// Looks up the mapping whose address range contains gpu_addr.
+    std::optional<BufferMap> FindBufferMap(GPUVAddr gpu_addr) const;
+    /// Records a mapping keyed by gpu_addr, replacing any existing entry for that key.
+    void AddBufferMap(GPUVAddr gpu_addr, std::size_t size, VAddr cpu_addr, bool is_allocated);
+    /// Erases the mapping keyed by gpu_addr. Yields nullopt when there is none, otherwise its
+    /// size, or 0 when the mapping was not flagged as allocated.
+    std::optional<std::size_t> RemoveBufferMap(GPUVAddr gpu_addr);
+
+    u32_le nvmap_fd{};
+    u32_le submit_timeout{};
+    std::shared_ptr<nvmap> nvmap_dev;
+
+    // This is expected to be ordered, therefore we must use a map, not unordered_map
+    std::map<GPUVAddr, BufferMap> buffer_mappings;
+};
+} // namespace Service::Nvidia::Devices
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp
index 9da19ad56..60db54d00 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp
@@ -2,15 +2,17 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt
file included. -#include <cstring> -  #include "common/assert.h"  #include "common/logging/log.h" +#include "core/core.h"  #include "core/hle/service/nvdrv/devices/nvhost_vic.h" +#include "video_core/memory_manager.h" +#include "video_core/renderer_base.h"  namespace Service::Nvidia::Devices { +nvhost_vic::nvhost_vic(Core::System& system, std::shared_ptr<nvmap> nvmap_dev) +    : nvhost_nvdec_common(system, std::move(nvmap_dev)) {} -nvhost_vic::nvhost_vic(Core::System& system) : nvdevice(system) {}  nvhost_vic::~nvhost_vic() = default;  u32 nvhost_vic::ioctl(Ioctl command, const std::vector<u8>& input, const std::vector<u8>& input2, @@ -21,7 +23,7 @@ u32 nvhost_vic::ioctl(Ioctl command, const std::vector<u8>& input, const std::ve      switch (static_cast<IoctlCommand>(command.raw)) {      case IoctlCommand::IocSetNVMAPfdCommand: -        return SetNVMAPfd(input, output); +        return SetNVMAPfd(input);      case IoctlCommand::IocSubmit:          return Submit(input, output);      case IoctlCommand::IocGetSyncpoint: @@ -29,83 +31,19 @@ u32 nvhost_vic::ioctl(Ioctl command, const std::vector<u8>& input, const std::ve      case IoctlCommand::IocGetWaitbase:          return GetWaitbase(input, output);      case IoctlCommand::IocMapBuffer: -        return MapBuffer(input, output); +    case IoctlCommand::IocMapBuffer2: +    case IoctlCommand::IocMapBuffer3: +    case IoctlCommand::IocMapBuffer4:      case IoctlCommand::IocMapBufferEx:          return MapBuffer(input, output); +    case IoctlCommand::IocUnmapBuffer: +    case IoctlCommand::IocUnmapBuffer2: +    case IoctlCommand::IocUnmapBuffer3:      case IoctlCommand::IocUnmapBufferEx: -        return UnmapBufferEx(input, output); +        return UnmapBuffer(input, output);      } -    UNIMPLEMENTED_MSG("Unimplemented ioctl"); -    return 0; -} - -u32 nvhost_vic::SetNVMAPfd(const std::vector<u8>& input, std::vector<u8>& output) { -    IoctlSetNvmapFD params{}; -    std::memcpy(&params, input.data(), 
sizeof(IoctlSetNvmapFD)); -    LOG_DEBUG(Service_NVDRV, "called, fd={}", params.nvmap_fd); - -    nvmap_fd = params.nvmap_fd; -    return 0; -} - -u32 nvhost_vic::Submit(const std::vector<u8>& input, std::vector<u8>& output) { -    IoctlSubmit params{}; -    std::memcpy(&params, input.data(), sizeof(IoctlSubmit)); -    LOG_WARNING(Service_NVDRV, "(STUBBED) called"); - -    // Workaround for Luigi's Mansion 3, as nvhost_vic is not implemented for asynch GPU -    params.command_buffer = {}; - -    std::memcpy(output.data(), &params, sizeof(IoctlSubmit)); -    return 0; -} - -u32 nvhost_vic::GetSyncpoint(const std::vector<u8>& input, std::vector<u8>& output) { -    IoctlGetSyncpoint params{}; -    std::memcpy(&params, input.data(), sizeof(IoctlGetSyncpoint)); -    LOG_INFO(Service_NVDRV, "called, unknown=0x{:X}", params.unknown); -    params.value = 0; // Seems to be hard coded at 0 -    std::memcpy(output.data(), &params, sizeof(IoctlGetSyncpoint)); -    return 0; -} - -u32 nvhost_vic::GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output) { -    IoctlGetWaitbase params{}; -    std::memcpy(&params, input.data(), sizeof(IoctlGetWaitbase)); -    LOG_INFO(Service_NVDRV, "called, unknown=0x{:X}", params.unknown); -    params.value = 0; // Seems to be hard coded at 0 -    std::memcpy(output.data(), &params, sizeof(IoctlGetWaitbase)); -    return 0; -} - -u32 nvhost_vic::MapBuffer(const std::vector<u8>& input, std::vector<u8>& output) { -    IoctlMapBuffer params{}; -    std::memcpy(&params, input.data(), sizeof(IoctlMapBuffer)); -    LOG_WARNING(Service_NVDRV, "(STUBBED) called with address={:08X}{:08X}", params.address_2, -                params.address_1); -    params.address_1 = 0; -    params.address_2 = 0; -    std::memcpy(output.data(), &params, sizeof(IoctlMapBuffer)); -    return 0; -} - -u32 nvhost_vic::MapBufferEx(const std::vector<u8>& input, std::vector<u8>& output) { -    IoctlMapBufferEx params{}; -    std::memcpy(&params, input.data(), sizeof(IoctlMapBufferEx)); -    
LOG_WARNING(Service_NVDRV, "(STUBBED) called with address={:08X}{:08X}", params.address_2, -                params.address_1); -    params.address_1 = 0; -    params.address_2 = 0; -    std::memcpy(output.data(), &params, sizeof(IoctlMapBufferEx)); -    return 0; -} - -u32 nvhost_vic::UnmapBufferEx(const std::vector<u8>& input, std::vector<u8>& output) { -    IoctlUnmapBufferEx params{}; -    std::memcpy(&params, input.data(), sizeof(IoctlUnmapBufferEx)); -    LOG_WARNING(Service_NVDRV, "(STUBBED) called"); -    std::memcpy(output.data(), &params, sizeof(IoctlUnmapBufferEx)); +    UNIMPLEMENTED_MSG("Unimplemented ioctl 0x{:X}", command.raw);      return 0;  } diff --git a/src/core/hle/service/nvdrv/devices/nvhost_vic.h b/src/core/hle/service/nvdrv/devices/nvhost_vic.h index a7bb7bbd5..f975b190c 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_vic.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.h @@ -4,19 +4,15 @@  #pragma once -#include <array> -#include <vector> -#include "common/common_types.h" -#include "common/swap.h" -#include "core/hle/service/nvdrv/devices/nvdevice.h" +#include "core/hle/service/nvdrv/devices/nvhost_nvdec_common.h"  namespace Service::Nvidia::Devices { +class nvmap; -class nvhost_vic final : public nvdevice { +class nvhost_vic final : public nvhost_nvdec_common {  public: -    explicit nvhost_vic(Core::System& system); -    ~nvhost_vic() override; - +    explicit nvhost_vic(Core::System& system, std::shared_ptr<nvmap> nvmap_dev); +    ~nvhost_vic();      u32 ioctl(Ioctl command, const std::vector<u8>& input, const std::vector<u8>& input2,                std::vector<u8>& output, std::vector<u8>& output2, IoctlCtrl& ctrl,                IoctlVersion version) override; @@ -28,74 +24,14 @@ private:          IocGetSyncpoint = 0xC0080002,          IocGetWaitbase = 0xC0080003,          IocMapBuffer = 0xC01C0009, +        IocMapBuffer2 = 0xC0340009, +        IocMapBuffer3 = 0xC0140009, +        IocMapBuffer4 = 0xC00C0009,          IocMapBufferEx = 
0xC03C0009, -        IocUnmapBufferEx = 0xC03C000A, -    }; - -    struct IoctlSetNvmapFD { -        u32_le nvmap_fd; -    }; -    static_assert(sizeof(IoctlSetNvmapFD) == 4, "IoctlSetNvmapFD is incorrect size"); - -    struct IoctlSubmitCommandBuffer { -        u32 id; -        u32 offset; -        u32 count; -    }; -    static_assert(sizeof(IoctlSubmitCommandBuffer) == 0xC, -                  "IoctlSubmitCommandBuffer is incorrect size"); - -    struct IoctlSubmit { -        u32 command_buffer_count; -        u32 relocations_count; -        u32 syncpt_count; -        u32 wait_count; -        std::array<IoctlSubmitCommandBuffer, 4> command_buffer; -    }; -    static_assert(sizeof(IoctlSubmit) == 0x40, "IoctlSubmit is incorrect size"); - -    struct IoctlGetSyncpoint { -        u32 unknown; // seems to be ignored? Nintendo added this -        u32 value; -    }; -    static_assert(sizeof(IoctlGetSyncpoint) == 0x8, "IoctlGetSyncpoint is incorrect size"); - -    struct IoctlGetWaitbase { -        u32 unknown; // seems to be ignored? 
Nintendo added this -        u32 value; -    }; -    static_assert(sizeof(IoctlGetWaitbase) == 0x8, "IoctlGetWaitbase is incorrect size"); - -    struct IoctlMapBuffer { -        u32 unknown; -        u32 address_1; -        u32 address_2; -        INSERT_PADDING_BYTES(0x10); // TODO(DarkLordZach): RE this structure -    }; -    static_assert(sizeof(IoctlMapBuffer) == 0x1C, "IoctlMapBuffer is incorrect size"); - -    struct IoctlMapBufferEx { -        u32 unknown; -        u32 address_1; -        u32 address_2; -        INSERT_PADDING_BYTES(0x30); // TODO(DarkLordZach): RE this structure +        IocUnmapBuffer = 0xC03C000A, +        IocUnmapBuffer2 = 0xC034000A, +        IocUnmapBuffer3 = 0xC00C000A, +        IocUnmapBufferEx = 0xC01C000A,      }; -    static_assert(sizeof(IoctlMapBufferEx) == 0x3C, "IoctlMapBufferEx is incorrect size"); - -    struct IoctlUnmapBufferEx { -        INSERT_PADDING_BYTES(0x3C); // TODO(DarkLordZach): RE this structure -    }; -    static_assert(sizeof(IoctlUnmapBufferEx) == 0x3C, "IoctlUnmapBufferEx is incorrect size"); - -    u32_le nvmap_fd{}; - -    u32 SetNVMAPfd(const std::vector<u8>& input, std::vector<u8>& output); -    u32 Submit(const std::vector<u8>& input, std::vector<u8>& output); -    u32 GetSyncpoint(const std::vector<u8>& input, std::vector<u8>& output); -    u32 GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output); -    u32 MapBuffer(const std::vector<u8>& input, std::vector<u8>& output); -    u32 MapBufferEx(const std::vector<u8>& input, std::vector<u8>& output); -    u32 UnmapBufferEx(const std::vector<u8>& input, std::vector<u8>& output);  }; -  } // namespace Service::Nvidia::Devices diff --git a/src/core/hle/service/nvdrv/devices/nvmap.h b/src/core/hle/service/nvdrv/devices/nvmap.h index 84624be00..04b9ef540 100644 --- a/src/core/hle/service/nvdrv/devices/nvmap.h +++ b/src/core/hle/service/nvdrv/devices/nvmap.h @@ -37,6 +37,7 @@ public:          VAddr addr;          Status status;          u32 
refcount; +        u32 dma_map_addr;      };      std::shared_ptr<Object> GetObject(u32 handle) const { diff --git a/src/core/hle/service/nvdrv/nvdrv.cpp b/src/core/hle/service/nvdrv/nvdrv.cpp index 197c77db0..803c1a984 100644 --- a/src/core/hle/service/nvdrv/nvdrv.cpp +++ b/src/core/hle/service/nvdrv/nvdrv.cpp @@ -51,9 +51,9 @@ Module::Module(Core::System& system) {      devices["/dev/nvmap"] = nvmap_dev;      devices["/dev/nvdisp_disp0"] = std::make_shared<Devices::nvdisp_disp0>(system, nvmap_dev);      devices["/dev/nvhost-ctrl"] = std::make_shared<Devices::nvhost_ctrl>(system, events_interface); -    devices["/dev/nvhost-nvdec"] = std::make_shared<Devices::nvhost_nvdec>(system); +    devices["/dev/nvhost-nvdec"] = std::make_shared<Devices::nvhost_nvdec>(system, nvmap_dev);      devices["/dev/nvhost-nvjpg"] = std::make_shared<Devices::nvhost_nvjpg>(system); -    devices["/dev/nvhost-vic"] = std::make_shared<Devices::nvhost_vic>(system); +    devices["/dev/nvhost-vic"] = std::make_shared<Devices::nvhost_vic>(system, nvmap_dev);  }  Module::~Module() = default; diff --git a/src/core/settings.cpp b/src/core/settings.cpp index 28d3f9099..e14c02045 100644 --- a/src/core/settings.cpp +++ b/src/core/settings.cpp @@ -63,6 +63,7 @@ void LogSettings() {      log_setting("Renderer_GPUAccuracyLevel", values.gpu_accuracy.GetValue());      log_setting("Renderer_UseAsynchronousGpuEmulation",                  values.use_asynchronous_gpu_emulation.GetValue()); +    log_setting("Renderer_UseNvdecEmulation", values.use_nvdec_emulation.GetValue());      log_setting("Renderer_UseVsync", values.use_vsync.GetValue());      log_setting("Renderer_UseAssemblyShaders", values.use_assembly_shaders.GetValue());      log_setting("Renderer_UseAsynchronousShaders", values.use_asynchronous_shaders.GetValue()); @@ -119,6 +120,7 @@ void RestoreGlobalState() {      values.use_disk_shader_cache.SetGlobal(true);      values.gpu_accuracy.SetGlobal(true);      
values.use_asynchronous_gpu_emulation.SetGlobal(true); +    values.use_nvdec_emulation.SetGlobal(true);      values.use_vsync.SetGlobal(true);      values.use_assembly_shaders.SetGlobal(true);      values.use_asynchronous_shaders.SetGlobal(true); diff --git a/src/core/settings.h b/src/core/settings.h index 9834f44bb..604805615 100644 --- a/src/core/settings.h +++ b/src/core/settings.h @@ -111,6 +111,7 @@ struct Values {      Setting<bool> use_disk_shader_cache;      Setting<GPUAccuracy> gpu_accuracy;      Setting<bool> use_asynchronous_gpu_emulation; +    Setting<bool> use_nvdec_emulation;      Setting<bool> use_vsync;      Setting<bool> use_assembly_shaders;      Setting<bool> use_asynchronous_shaders; diff --git a/src/core/telemetry_session.cpp b/src/core/telemetry_session.cpp index da09c0dbc..ebc19e18a 100644 --- a/src/core/telemetry_session.cpp +++ b/src/core/telemetry_session.cpp @@ -206,6 +206,8 @@ void TelemetrySession::AddInitialInfo(Loader::AppLoader& app_loader) {               TranslateGPUAccuracyLevel(Settings::values.gpu_accuracy.GetValue()));      AddField(field_type, "Renderer_UseAsynchronousGpuEmulation",               Settings::values.use_asynchronous_gpu_emulation.GetValue()); +    AddField(field_type, "Renderer_UseNvdecEmulation", +             Settings::values.use_nvdec_emulation.GetValue());      AddField(field_type, "Renderer_UseVsync", Settings::values.use_vsync.GetValue());      AddField(field_type, "Renderer_UseAssemblyShaders",               Settings::values.use_assembly_shaders.GetValue()); diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 77ebac19f..fdfc885fc 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -5,6 +5,24 @@ add_library(video_core STATIC      buffer_cache/buffer_cache.h      buffer_cache/map_interval.cpp      buffer_cache/map_interval.h +    cdma_pusher.cpp +    cdma_pusher.h +    command_classes/codecs/codec.cpp +    command_classes/codecs/codec.h +    
command_classes/codecs/h264.cpp +    command_classes/codecs/h264.h +    command_classes/codecs/vp9.cpp +    command_classes/codecs/vp9.h +    command_classes/codecs/vp9_types.h +    command_classes/host1x.cpp +    command_classes/host1x.h +    command_classes/nvdec.cpp +    command_classes/nvdec.h +    command_classes/nvdec_common.h +    command_classes/sync_manager.cpp +    command_classes/sync_manager.h +    command_classes/vic.cpp +    command_classes/vic.h      compatible_formats.cpp      compatible_formats.h      dirty_flags.cpp @@ -250,6 +268,14 @@ create_target_directory_groups(video_core)  target_link_libraries(video_core PUBLIC common core)  target_link_libraries(video_core PRIVATE glad xbyak) +if (MSVC) +    target_include_directories(video_core PRIVATE ${FFMPEG_INCLUDE_DIR}) +    target_link_libraries(video_core PUBLIC ${FFMPEG_LIBRARY_DIR}/swscale.lib ${FFMPEG_LIBRARY_DIR}/avcodec.lib ${FFMPEG_LIBRARY_DIR}/avutil.lib) +else() +    target_include_directories(video_core PRIVATE ${FFMPEG_INCLUDE_DIR}) +    target_link_libraries(video_core PRIVATE ${FFMPEG_LIBRARIES}) +endif() +  add_dependencies(video_core host_shaders)  target_include_directories(video_core PRIVATE ${HOST_SHADERS_INCLUDE}) diff --git a/src/video_core/cdma_pusher.cpp b/src/video_core/cdma_pusher.cpp new file mode 100644 index 000000000..d774db107 --- /dev/null +++ b/src/video_core/cdma_pusher.cpp @@ -0,0 +1,171 @@ +// MIT License +// +// Copyright (c) Ryujinx Team and Contributors +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +// associated documentation files (the "Software"), to deal in the Software without restriction, +// including without limitation the rights to use, copy, modify, merge, publish, distribute, +// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice 
shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// + +#include "command_classes/host1x.h" +#include "command_classes/nvdec.h" +#include "command_classes/vic.h" +#include "common/bit_util.h" +#include "video_core/cdma_pusher.h" +#include "video_core/command_classes/nvdec_common.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" + +namespace Tegra { +CDmaPusher::CDmaPusher(GPU& gpu) +    : gpu(gpu), nvdec_processor(std::make_shared<Nvdec>(gpu)), +      vic_processor(std::make_unique<Vic>(gpu, nvdec_processor)), +      host1x_processor(std::make_unique<Host1x>(gpu)), +      nvdec_sync(std::make_unique<SyncptIncrManager>(gpu)), +      vic_sync(std::make_unique<SyncptIncrManager>(gpu)) {} + +CDmaPusher::~CDmaPusher() = default; + +void CDmaPusher::Push(ChCommandHeaderList&& entries) { +    cdma_queue.push(std::move(entries)); +} + +void CDmaPusher::DispatchCalls() { +    while (!cdma_queue.empty()) { +        Step(); +    } +} + +void CDmaPusher::Step() { +    const auto entries{cdma_queue.front()}; +    cdma_queue.pop(); + +    std::vector<u32> values(entries.size()); +    std::memcpy(values.data(), entries.data(), entries.size() * sizeof(u32)); + +    for (const u32 value : values) { +        if (mask != 0) { +            const u32 lbs = Common::CountTrailingZeroes32(mask); +            mask &= ~(1U << lbs); +            ExecuteCommand(static_cast<u32>(offset + lbs), value); +            continue; + 
       } else if (count != 0) { +            --count; +            ExecuteCommand(static_cast<u32>(offset), value); +            if (incrementing) { +                ++offset; +            } +            continue; +        } +        const auto mode = static_cast<ChSubmissionMode>((value >> 28) & 0xf); +        switch (mode) { +        case ChSubmissionMode::SetClass: { +            mask = value & 0x3f; +            offset = (value >> 16) & 0xfff; +            current_class = static_cast<ChClassId>((value >> 6) & 0x3ff); +            break; +        } +        case ChSubmissionMode::Incrementing: +        case ChSubmissionMode::NonIncrementing: +            count = value & 0xffff; +            offset = (value >> 16) & 0xfff; +            incrementing = mode == ChSubmissionMode::Incrementing; +            break; +        case ChSubmissionMode::Mask: +            mask = value & 0xffff; +            offset = (value >> 16) & 0xfff; +            break; +        case ChSubmissionMode::Immediate: { +            const u32 data = value & 0xfff; +            offset = (value >> 16) & 0xfff; +            ExecuteCommand(static_cast<u32>(offset), data); +            break; +        } +        default: +            UNIMPLEMENTED_MSG("ChSubmission mode {} is not implemented!", static_cast<u32>(mode)); +            break; +        } +    } +} + +void CDmaPusher::ExecuteCommand(u32 offset, u32 data) { +    switch (current_class) { +    case ChClassId::NvDec: +        ThiStateWrite(nvdec_thi_state, offset, {data}); +        switch (static_cast<ThiMethod>(offset)) { +        case ThiMethod::IncSyncpt: { +            LOG_DEBUG(Service_NVDRV, "NVDEC Class IncSyncpt Method"); +            const auto syncpoint_id = static_cast<u32>(data & 0xFF); +            const auto cond = static_cast<u32>((data >> 8) & 0xFF); +            if (cond == 0) { +                nvdec_sync->Increment(syncpoint_id); +            } else { +                
nvdec_sync->IncrementWhenDone(static_cast<u32>(current_class), syncpoint_id); +                nvdec_sync->SignalDone(syncpoint_id); +            } +            break; +        } +        case ThiMethod::SetMethod1: +            LOG_DEBUG(Service_NVDRV, "NVDEC method 0x{:X}", +                      static_cast<u32>(nvdec_thi_state.method_0)); +            nvdec_processor->ProcessMethod( +                static_cast<Tegra::Nvdec::Method>(nvdec_thi_state.method_0), {data}); +            break; +        default: +            break; +        } +        break; +    case ChClassId::GraphicsVic: +        ThiStateWrite(vic_thi_state, static_cast<u32>(offset), {data}); +        switch (static_cast<ThiMethod>(offset)) { +        case ThiMethod::IncSyncpt: { +            LOG_DEBUG(Service_NVDRV, "VIC Class IncSyncpt Method"); +            const auto syncpoint_id = static_cast<u32>(data & 0xFF); +            const auto cond = static_cast<u32>((data >> 8) & 0xFF); +            if (cond == 0) { +                vic_sync->Increment(syncpoint_id); +            } else { +                vic_sync->IncrementWhenDone(static_cast<u32>(current_class), syncpoint_id); +                vic_sync->SignalDone(syncpoint_id); +            } +            break; +        } +        case ThiMethod::SetMethod1: +            LOG_DEBUG(Service_NVDRV, "VIC method 0x{:X}, Args=({})", +                      static_cast<u32>(vic_thi_state.method_0)); +            vic_processor->ProcessMethod(static_cast<Tegra::Vic::Method>(vic_thi_state.method_0), +                                         {data}); +            break; +        default: +            break; +        } +        break; +    case ChClassId::Host1x: +        // This device is mainly for syncpoint synchronization +        LOG_DEBUG(Service_NVDRV, "Host1X Class Method"); +        host1x_processor->ProcessMethod(static_cast<Tegra::Host1x::Method>(offset), {data}); +        break; +    default: +        UNIMPLEMENTED_MSG("Current class not 
implemented {:X}", static_cast<u32>(current_class)); +        break; +    } +} + +void CDmaPusher::ThiStateWrite(ThiRegisters& state, u32 offset, const std::vector<u32>& arguments) { +    u8* const state_offset = reinterpret_cast<u8*>(&state) + sizeof(u32) * offset; +    std::memcpy(state_offset, arguments.data(), sizeof(u32) * arguments.size()); +} + +} // namespace Tegra diff --git a/src/video_core/cdma_pusher.h b/src/video_core/cdma_pusher.h new file mode 100644 index 000000000..982f309c5 --- /dev/null +++ b/src/video_core/cdma_pusher.h @@ -0,0 +1,138 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <memory> +#include <unordered_map> +#include <vector> +#include <queue> + +#include "common/bit_field.h" +#include "common/common_types.h" +#include "video_core/command_classes/sync_manager.h" + +namespace Tegra { + +class GPU; +class Nvdec; +class Vic; +class Host1x; + +enum class ChSubmissionMode : u32 { +    SetClass = 0, +    Incrementing = 1, +    NonIncrementing = 2, +    Mask = 3, +    Immediate = 4, +    Restart = 5, +    Gather = 6, +}; + +enum class ChClassId : u32 { +    NoClass = 0x0, +    Host1x = 0x1, +    VideoEncodeMpeg = 0x20, +    VideoEncodeNvEnc = 0x21, +    VideoStreamingVi = 0x30, +    VideoStreamingIsp = 0x32, +    VideoStreamingIspB = 0x34, +    VideoStreamingViI2c = 0x36, +    GraphicsVic = 0x5d, +    Graphics3D = 0x60, +    GraphicsGpu = 0x61, +    Tsec = 0xe0, +    TsecB = 0xe1, +    NvJpg = 0xc0, +    NvDec = 0xf0 +}; + +enum class ChMethod : u32 { +    Empty = 0, +    SetMethod = 0x10, +    SetData = 0x11, +}; + +union ChCommandHeader { +    u32 raw; +    BitField<0, 16, u32> value; +    BitField<16, 12, ChMethod> method_offset; +    BitField<28, 4, ChSubmissionMode> submission_mode; +}; +static_assert(sizeof(ChCommandHeader) == sizeof(u32), "ChCommand header is an invalid size"); + +struct ChCommand { +    ChClassId 
class_id{}; +    int method_offset{}; +    std::vector<u32> arguments; +}; + +using ChCommandHeaderList = std::vector<Tegra::ChCommandHeader>; +using ChCommandList = std::vector<Tegra::ChCommand>; + +struct ThiRegisters { +    u32_le increment_syncpt{}; +    INSERT_PADDING_WORDS(1); +    u32_le increment_syncpt_error{}; +    u32_le ctx_switch_incremement_syncpt{}; +    INSERT_PADDING_WORDS(4); +    u32_le ctx_switch{}; +    INSERT_PADDING_WORDS(1); +    u32_le ctx_syncpt_eof{}; +    INSERT_PADDING_WORDS(5); +    u32_le method_0{}; +    u32_le method_1{}; +    INSERT_PADDING_WORDS(12); +    u32_le int_status{}; +    u32_le int_mask{}; +}; + +enum class ThiMethod : u32 { +    IncSyncpt = offsetof(ThiRegisters, increment_syncpt) / sizeof(u32), +    SetMethod0 = offsetof(ThiRegisters, method_0) / sizeof(u32), +    SetMethod1 = offsetof(ThiRegisters, method_1) / sizeof(u32), +}; + +class CDmaPusher { +public: +    explicit CDmaPusher(GPU& gpu); +    ~CDmaPusher(); + +    /// Push NVDEC command buffer entries into queue +    void Push(ChCommandHeaderList&& entries); + +    /// Process queued command buffer entries +    void DispatchCalls(); + +    /// Process one queue element +    void Step(); + +    /// Invoke command class devices to execute the command based on the current state +    void ExecuteCommand(u32 offset, u32 data); + +private: +    /// Write arguments value to the ThiRegisters member at the specified offset +    void ThiStateWrite(ThiRegisters& state, u32 offset, const std::vector<u32>& arguments); + +    GPU& gpu; + +    std::shared_ptr<Tegra::Nvdec> nvdec_processor; +    std::unique_ptr<Tegra::Vic> vic_processor; +    std::unique_ptr<Tegra::Host1x> host1x_processor; +    std::unique_ptr<SyncptIncrManager> nvdec_sync; +    std::unique_ptr<SyncptIncrManager> vic_sync; +    ChClassId current_class{}; +    ThiRegisters vic_thi_state{}; +    ThiRegisters nvdec_thi_state{}; + +    s32 count{}; +    s32 offset{}; +    s32 mask{}; +    bool incrementing{}; + +   
 // Queue of command lists to be processed +    std::queue<ChCommandHeaderList> cdma_queue; +}; + +} // namespace Tegra diff --git a/src/video_core/command_classes/codecs/codec.cpp b/src/video_core/command_classes/codecs/codec.cpp new file mode 100644 index 000000000..2df410be8 --- /dev/null +++ b/src/video_core/command_classes/codecs/codec.cpp @@ -0,0 +1,114 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <cstring> +#include <fstream> +#include "common/assert.h" +#include "video_core/command_classes/codecs/codec.h" +#include "video_core/command_classes/codecs/h264.h" +#include "video_core/command_classes/codecs/vp9.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" + +extern "C" { +#include <libavutil/opt.h> +} + +namespace Tegra { + +Codec::Codec(GPU& gpu_) +    : gpu(gpu_), h264_decoder(std::make_unique<Decoder::H264>(gpu)), +      vp9_decoder(std::make_unique<Decoder::VP9>(gpu)) {} + +Codec::~Codec() { +    if (!initialized) { +        return; +    } +    // Free libav memory +    avcodec_send_packet(av_codec_ctx, nullptr); +    avcodec_receive_frame(av_codec_ctx, av_frame); +    avcodec_flush_buffers(av_codec_ctx); + +    av_frame_unref(av_frame); +    av_free(av_frame); +    avcodec_close(av_codec_ctx); +} + +void Codec::SetTargetCodec(NvdecCommon::VideoCodec codec) { +    LOG_INFO(Service_NVDRV, "NVDEC video codec initialized to {}", static_cast<u32>(codec)); +    current_codec = codec; +} + +void Codec::StateWrite(u32 offset, u64 arguments) { +    u8* const state_offset = reinterpret_cast<u8*>(&state) + offset * sizeof(u64); +    std::memcpy(state_offset, &arguments, sizeof(u64)); +} + +void Codec::Decode() { +    bool is_first_frame = false; + +    if (!initialized) { +        if (current_codec == NvdecCommon::VideoCodec::H264) { +            av_codec = avcodec_find_decoder(AV_CODEC_ID_H264); +        } else if (current_codec == 
NvdecCommon::VideoCodec::Vp9) { +            av_codec = avcodec_find_decoder(AV_CODEC_ID_VP9); +        } else { +            LOG_ERROR(Service_NVDRV, "Unknown video codec {}", static_cast<u32>(current_codec)); +            return; +        } + +        av_codec_ctx = avcodec_alloc_context3(av_codec); +        av_frame = av_frame_alloc(); +        av_opt_set(av_codec_ctx->priv_data, "tune", "zerolatency", 0); + +        // TODO(ameerj): libavcodec gpu hw acceleration + +        const auto av_error = avcodec_open2(av_codec_ctx, av_codec, nullptr); +        if (av_error < 0) { +            LOG_ERROR(Service_NVDRV, "avcodec_open2() Failed."); +            av_frame_unref(av_frame); +            av_free(av_frame); +            avcodec_close(av_codec_ctx); +            return; +        } +        initialized = true; +        is_first_frame = true; +    } +    bool vp9_hidden_frame = false; + +    AVPacket packet{}; +    av_init_packet(&packet); +    std::vector<u8> frame_data; + +    if (current_codec == NvdecCommon::VideoCodec::H264) { +        frame_data = h264_decoder->ComposeFrameHeader(state, is_first_frame); +    } else if (current_codec == NvdecCommon::VideoCodec::Vp9) { +        frame_data = vp9_decoder->ComposeFrameHeader(state); +        vp9_hidden_frame = vp9_decoder->WasFrameHidden(); +    } + +    packet.data = frame_data.data(); +    packet.size = static_cast<int>(frame_data.size()); + +    avcodec_send_packet(av_codec_ctx, &packet); + +    if (!vp9_hidden_frame) { +        // Only receive/store visible frames +        avcodec_receive_frame(av_codec_ctx, av_frame); +    } +} + +AVFrame* Codec::GetCurrentFrame() { +    return av_frame; +} + +const AVFrame* Codec::GetCurrentFrame() const { +    return av_frame; +} + +NvdecCommon::VideoCodec Codec::GetCurrentCodec() const { +    return current_codec; +} + +} // namespace Tegra diff --git a/src/video_core/command_classes/codecs/codec.h b/src/video_core/command_classes/codecs/codec.h new file mode 100644 index 
000000000..2e56daf29 --- /dev/null +++ b/src/video_core/command_classes/codecs/codec.h @@ -0,0 +1,68 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <memory> +#include <vector> +#include "common/common_funcs.h" +#include "common/common_types.h" +#include "video_core/command_classes/nvdec_common.h" + +extern "C" { +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic ignored "-Wconversion" +#endif +#include <libavcodec/avcodec.h> +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +namespace Tegra { +class GPU; +struct VicRegisters; + +namespace Decoder { +class H264; +class VP9; +} // namespace Decoder + +class Codec { +public: +    explicit Codec(GPU& gpu); +    ~Codec(); + +    /// Sets NVDEC video stream codec +    void SetTargetCodec(NvdecCommon::VideoCodec codec); + +    /// Populate NvdecRegisters state with argument value at the provided offset +    void StateWrite(u32 offset, u64 arguments); + +    /// Call decoders to construct headers, decode AVFrame with ffmpeg +    void Decode(); + +    /// Returns most recently decoded frame +    AVFrame* GetCurrentFrame(); +    const AVFrame* GetCurrentFrame() const; + +    /// Returns the value of current_codec +    NvdecCommon::VideoCodec GetCurrentCodec() const; + +private: +    bool initialized{}; +    NvdecCommon::VideoCodec current_codec{NvdecCommon::VideoCodec::None}; + +    AVCodec* av_codec{nullptr}; +    AVCodecContext* av_codec_ctx{nullptr}; +    AVFrame* av_frame{nullptr}; + +    GPU& gpu; +    std::unique_ptr<Decoder::H264> h264_decoder; +    std::unique_ptr<Decoder::VP9> vp9_decoder; + +    NvdecCommon::NvdecRegisters state{}; +}; + +} // namespace Tegra diff --git a/src/video_core/command_classes/codecs/h264.cpp b/src/video_core/command_classes/codecs/h264.cpp new file mode 100644 index 000000000..1a39f7b23 --- /dev/null +++ 
b/src/video_core/command_classes/codecs/h264.cpp @@ -0,0 +1,276 @@ +// MIT License +// +// Copyright (c) Ryujinx Team and Contributors +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +// associated documentation files (the "Software"), to deal in the Software without restriction, +// including without limitation the rights to use, copy, modify, merge, publish, distribute, +// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+// + +#include "common/bit_util.h" +#include "video_core/command_classes/codecs/h264.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" + +namespace Tegra::Decoder { +H264::H264(GPU& gpu_) : gpu(gpu_) {} + +H264::~H264() = default; + +std::vector<u8>& H264::ComposeFrameHeader(NvdecCommon::NvdecRegisters& state, bool is_first_frame) { +    H264DecoderContext context{}; +    gpu.MemoryManager().ReadBlock(state.picture_info_offset, &context, sizeof(H264DecoderContext)); + +    const s32 frame_number = static_cast<s32>((context.h264_parameter_set.flags >> 46) & 0x1ffff); +    if (!is_first_frame && frame_number != 0) { +        frame.resize(context.frame_data_size); + +        gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, frame.data(), frame.size()); +    } else { +        /// Encode header +        H264BitWriter writer{}; +        writer.WriteU(1, 24); +        writer.WriteU(0, 1); +        writer.WriteU(3, 2); +        writer.WriteU(7, 5); +        writer.WriteU(100, 8); +        writer.WriteU(0, 8); +        writer.WriteU(31, 8); +        writer.WriteUe(0); +        const s32 chroma_format_idc = (context.h264_parameter_set.flags >> 12) & 0x3; +        writer.WriteUe(chroma_format_idc); +        if (chroma_format_idc == 3) { +            writer.WriteBit(false); +        } + +        writer.WriteUe(0); +        writer.WriteUe(0); +        writer.WriteBit(false); // QpprimeYZeroTransformBypassFlag +        writer.WriteBit(false); // Scaling matrix present flag + +        const s32 order_cnt_type = static_cast<s32>((context.h264_parameter_set.flags >> 14) & 3); +        writer.WriteUe(static_cast<s32>((context.h264_parameter_set.flags >> 8) & 0xf)); +        writer.WriteUe(order_cnt_type); +        if (order_cnt_type == 0) { +            writer.WriteUe(context.h264_parameter_set.log2_max_pic_order_cnt); +        } else if (order_cnt_type == 1) { +            writer.WriteBit(context.h264_parameter_set.delta_pic_order_always_zero_flag 
!= 0); + +            writer.WriteSe(0); +            writer.WriteSe(0); +            writer.WriteUe(0); +        } + +        const s32 pic_height = context.h264_parameter_set.pic_height_in_map_units / +                               (context.h264_parameter_set.frame_mbs_only_flag ? 1 : 2); + +        writer.WriteUe(16); +        writer.WriteBit(false); +        writer.WriteUe(context.h264_parameter_set.pic_width_in_mbs - 1); +        writer.WriteUe(pic_height - 1); +        writer.WriteBit(context.h264_parameter_set.frame_mbs_only_flag != 0); + +        if (!context.h264_parameter_set.frame_mbs_only_flag) { +            writer.WriteBit(((context.h264_parameter_set.flags >> 0) & 1) != 0); +        } + +        writer.WriteBit(((context.h264_parameter_set.flags >> 1) & 1) != 0); +        writer.WriteBit(false); // Frame cropping flag +        writer.WriteBit(false); // VUI parameter present flag + +        writer.End(); + +        // H264 PPS +        writer.WriteU(1, 24); +        writer.WriteU(0, 1); +        writer.WriteU(3, 2); +        writer.WriteU(8, 5); + +        writer.WriteUe(0); +        writer.WriteUe(0); + +        writer.WriteBit(context.h264_parameter_set.entropy_coding_mode_flag); +        writer.WriteBit(false); +        writer.WriteUe(0); +        writer.WriteUe(context.h264_parameter_set.num_refidx_l0_default_active); +        writer.WriteUe(context.h264_parameter_set.num_refidx_l1_default_active); +        writer.WriteBit(((context.h264_parameter_set.flags >> 2) & 1) != 0); +        writer.WriteU(static_cast<s32>((context.h264_parameter_set.flags >> 32) & 0x3), 2); +        s32 pic_init_qp = static_cast<s32>((context.h264_parameter_set.flags >> 16) & 0x3f); +        pic_init_qp = (pic_init_qp << 26) >> 26; +        writer.WriteSe(pic_init_qp); +        writer.WriteSe(0); +        s32 chroma_qp_index_offset = +            static_cast<s32>((context.h264_parameter_set.flags >> 22) & 0x1f); +        chroma_qp_index_offset = (chroma_qp_index_offset 
<< 27) >> 27; + +        writer.WriteSe(chroma_qp_index_offset); +        writer.WriteBit(context.h264_parameter_set.deblocking_filter_control_flag != 0); +        writer.WriteBit(((context.h264_parameter_set.flags >> 3) & 1) != 0); +        writer.WriteBit(context.h264_parameter_set.redundant_pic_count_flag != 0); +        writer.WriteBit(context.h264_parameter_set.transform_8x8_mode_flag != 0); + +        writer.WriteBit(true); + +        for (s32 index = 0; index < 6; index++) { +            writer.WriteBit(true); +            const auto matrix_x4 = +                std::vector<u8>(context.scaling_matrix_4.begin(), context.scaling_matrix_4.end()); +            writer.WriteScalingList(matrix_x4, index * 16, 16); +        } + +        if (context.h264_parameter_set.transform_8x8_mode_flag) { +            for (s32 index = 0; index < 2; index++) { +                writer.WriteBit(true); +                const auto matrix_x8 = std::vector<u8>(context.scaling_matrix_8.begin(), +                                                       context.scaling_matrix_8.end()); + +                writer.WriteScalingList(matrix_x8, index * 64, 64); +            } +        } + +        s32 chroma_qp_index_offset2 = +            static_cast<s32>((context.h264_parameter_set.flags >> 27) & 0x1f); +        chroma_qp_index_offset2 = (chroma_qp_index_offset2 << 27) >> 27; + +        writer.WriteSe(chroma_qp_index_offset2); + +        writer.End(); + +        const auto& encoded_header = writer.GetByteArray(); +        frame.resize(encoded_header.size() + context.frame_data_size); +        std::memcpy(frame.data(), encoded_header.data(), encoded_header.size()); + +        gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, +                                      frame.data() + encoded_header.size(), +                                      context.frame_data_size); +    } + +    return frame; +} + +H264BitWriter::H264BitWriter() = default; + +H264BitWriter::~H264BitWriter() = default; 
+ +void H264BitWriter::WriteU(s32 value, s32 value_sz) { +    WriteBits(value, value_sz); +} + +void H264BitWriter::WriteSe(s32 value) { +    WriteExpGolombCodedInt(value); +} + +void H264BitWriter::WriteUe(s32 value) { +    WriteExpGolombCodedUInt((u32)value); +} + +void H264BitWriter::End() { +    WriteBit(true); +    Flush(); +} + +void H264BitWriter::WriteBit(bool state) { +    WriteBits(state ? 1 : 0, 1); +} + +void H264BitWriter::WriteScalingList(const std::vector<u8>& list, s32 start, s32 count) { +    std::vector<u8> scan(count); +    if (count == 16) { +        std::memcpy(scan.data(), zig_zag_scan.data(), scan.size()); +    } else { +        std::memcpy(scan.data(), zig_zag_direct.data(), scan.size()); +    } +    u8 last_scale = 8; + +    for (s32 index = 0; index < count; index++) { +        const u8 value = list[start + scan[index]]; +        const s32 delta_scale = static_cast<s32>(value - last_scale); + +        WriteSe(delta_scale); + +        last_scale = value; +    } +} + +std::vector<u8>& H264BitWriter::GetByteArray() { +    return byte_array; +} + +const std::vector<u8>& H264BitWriter::GetByteArray() const { +    return byte_array; +} + +void H264BitWriter::WriteBits(s32 value, s32 bit_count) { +    s32 value_pos = 0; + +    s32 remaining = bit_count; + +    while (remaining > 0) { +        s32 copy_size = remaining; + +        const s32 free_bits = GetFreeBufferBits(); + +        if (copy_size > free_bits) { +            copy_size = free_bits; +        } + +        const s32 mask = (1 << copy_size) - 1; + +        const s32 src_shift = (bit_count - value_pos) - copy_size; +        const s32 dst_shift = (buffer_size - buffer_pos) - copy_size; + +        buffer |= ((value >> src_shift) & mask) << dst_shift; + +        value_pos += copy_size; +        buffer_pos += copy_size; +        remaining -= copy_size; +    } +} + +void H264BitWriter::WriteExpGolombCodedInt(s32 value) { +    const s32 sign = value <= 0 ? 
0 : 1; +    if (value < 0) { +        value = -value; +    } +    value = (value << 1) - sign; +    WriteExpGolombCodedUInt(value); +} + +void H264BitWriter::WriteExpGolombCodedUInt(u32 value) { +    const s32 size = 32 - Common::CountLeadingZeroes32(static_cast<s32>(value + 1)); +    WriteBits(1, size); + +    value -= (1U << (size - 1)) - 1; +    WriteBits(static_cast<s32>(value), size - 1); +} + +s32 H264BitWriter::GetFreeBufferBits() { +    if (buffer_pos == buffer_size) { +        Flush(); +    } + +    return buffer_size - buffer_pos; +} + +void H264BitWriter::Flush() { +    if (buffer_pos == 0) { +        return; +    } +    byte_array.push_back(static_cast<u8>(buffer)); + +    buffer = 0; +    buffer_pos = 0; +} +} // namespace Tegra::Decoder diff --git a/src/video_core/command_classes/codecs/h264.h b/src/video_core/command_classes/codecs/h264.h new file mode 100644 index 000000000..21752dd90 --- /dev/null +++ b/src/video_core/command_classes/codecs/h264.h @@ -0,0 +1,130 @@ +// MIT License +// +// Copyright (c) Ryujinx Team and Contributors +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +// associated documentation files (the "Software"), to deal in the Software without restriction, +// including without limitation the rights to use, copy, modify, merge, publish, distribute, +// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// + +#pragma once + +#include <vector> +#include "common/common_funcs.h" +#include "common/common_types.h" +#include "video_core/command_classes/nvdec_common.h" + +namespace Tegra { +class GPU; +namespace Decoder { + +class H264BitWriter { +public: +    H264BitWriter(); +    ~H264BitWriter(); + +    /// The following Write methods are based on clause 9.1 in the H.264 specification. +    /// WriteSe and WriteUe write in the Exp-Golomb-coded syntax +    void WriteU(s32 value, s32 value_sz); +    void WriteSe(s32 value); +    void WriteUe(s32 value); + +    /// Finalize the bitstream +    void End(); + +    /// append a bit to the stream, equivalent value to the state parameter +    void WriteBit(bool state); + +    /// Based on section 7.3.2.1.1.1 and Table 7-4 in the H.264 specification +    /// Writes the scaling matrices of the sream +    void WriteScalingList(const std::vector<u8>& list, s32 start, s32 count); + +    /// Return the bitstream as a vector. +    std::vector<u8>& GetByteArray(); +    const std::vector<u8>& GetByteArray() const; + +private: +    // ZigZag LUTs from libavcodec. 
+    static constexpr std::array<u8, 64> zig_zag_direct{ +        0,  1,  8,  16, 9,  2,  3,  10, 17, 24, 32, 25, 18, 11, 4,  5,  12, 19, 26, 33, 40, 48, +        41, 34, 27, 20, 13, 6,  7,  14, 21, 28, 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, +        30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63, +    }; + +    static constexpr std::array<u8, 16> zig_zag_scan{ +        0 + 0 * 4, 1 + 0 * 4, 0 + 1 * 4, 0 + 2 * 4, 1 + 1 * 4, 2 + 0 * 4, 3 + 0 * 4, 2 + 1 * 4, +        1 + 2 * 4, 0 + 3 * 4, 1 + 3 * 4, 2 + 2 * 4, 3 + 1 * 4, 3 + 2 * 4, 2 + 3 * 4, 3 + 3 * 4, +    }; + +    void WriteBits(s32 value, s32 bit_count); +    void WriteExpGolombCodedInt(s32 value); +    void WriteExpGolombCodedUInt(u32 value); +    s32 GetFreeBufferBits(); +    void Flush(); + +    s32 buffer_size{8}; + +    s32 buffer{}; +    s32 buffer_pos{}; +    std::vector<u8> byte_array; +}; + +class H264 { +public: +    explicit H264(GPU& gpu); +    ~H264(); + +    /// Compose the H264 header of the frame for FFmpeg decoding +    std::vector<u8>& ComposeFrameHeader(NvdecCommon::NvdecRegisters& state, +                                        bool is_first_frame = false); + +private: +    struct H264ParameterSet { +        u32 log2_max_pic_order_cnt{}; +        u32 delta_pic_order_always_zero_flag{}; +        u32 frame_mbs_only_flag{}; +        u32 pic_width_in_mbs{}; +        u32 pic_height_in_map_units{}; +        INSERT_PADDING_WORDS(1); +        u32 entropy_coding_mode_flag{}; +        u32 bottom_field_pic_order_flag{}; +        u32 num_refidx_l0_default_active{}; +        u32 num_refidx_l1_default_active{}; +        u32 deblocking_filter_control_flag{}; +        u32 redundant_pic_count_flag{}; +        u32 transform_8x8_mode_flag{}; +        INSERT_PADDING_WORDS(9); +        u64 flags{}; +        u32 frame_number{}; +        u32 frame_number2{}; +    }; +    static_assert(sizeof(H264ParameterSet) == 0x68, "H264ParameterSet is an invalid size"); + +    struct 
H264DecoderContext { +        INSERT_PADDING_BYTES(0x48); +        u32 frame_data_size{}; +        INSERT_PADDING_BYTES(0xc); +        H264ParameterSet h264_parameter_set{}; +        INSERT_PADDING_BYTES(0x100); +        std::array<u8, 0x60> scaling_matrix_4; +        std::array<u8, 0x80> scaling_matrix_8; +    }; +    static_assert(sizeof(H264DecoderContext) == 0x2a0, "H264DecoderContext is an invalid size"); + +    std::vector<u8> frame; +    GPU& gpu; +}; + +} // namespace Decoder +} // namespace Tegra diff --git a/src/video_core/command_classes/codecs/vp9.cpp b/src/video_core/command_classes/codecs/vp9.cpp new file mode 100644 index 000000000..3bae0bb5d --- /dev/null +++ b/src/video_core/command_classes/codecs/vp9.cpp @@ -0,0 +1,1010 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <cstring> // for std::memcpy +#include <numeric> +#include "video_core/command_classes/codecs/vp9.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" + +namespace Tegra::Decoder { + +// Default compressed header probabilities once frame context resets +constexpr Vp9EntropyProbs default_probs{ +    .y_mode_prob{ +        65,  32, 18, 144, 162, 194, 41, 51, 98, 132, 68,  18, 165, 217, 196, 45, 40, 78, +        173, 80, 19, 176, 240, 193, 64, 35, 46, 221, 135, 38, 194, 248, 121, 96, 85, 29, +    }, +    .partition_prob{ +        199, 122, 141, 0, 147, 63, 159, 0, 148, 133, 118, 0, 121, 104, 114, 0, +        174, 73,  87,  0, 92,  41, 83,  0, 82,  99,  50,  0, 53,  39,  39,  0, +        177, 58,  59,  0, 68,  26, 63,  0, 52,  79,  25,  0, 17,  14,  12,  0, +        222, 34,  30,  0, 72,  16, 44,  0, 58,  32,  12,  0, 10,  7,   6,   0, +    }, +    .coef_probs{ +        195, 29,  183, 0, 84,  49,  136, 0, 8,   42,  71,  0, 0,   0,   0,   0, 0,   0,   0,   0, +        0,   0,   0,   0, 31,  107, 169, 0, 35,  99,  159, 0, 17,  82,  140, 0, 8,   66,  114, 0, +        2,   44,  
76,  0, 1,   19,  32,  0, 40,  132, 201, 0, 29,  114, 187, 0, 13,  91,  157, 0, +        7,   75,  127, 0, 3,   58,  95,  0, 1,   28,  47,  0, 69,  142, 221, 0, 42,  122, 201, 0, +        15,  91,  159, 0, 6,   67,  121, 0, 1,   42,  77,  0, 1,   17,  31,  0, 102, 148, 228, 0, +        67,  117, 204, 0, 17,  82,  154, 0, 6,   59,  114, 0, 2,   39,  75,  0, 1,   15,  29,  0, +        156, 57,  233, 0, 119, 57,  212, 0, 58,  48,  163, 0, 29,  40,  124, 0, 12,  30,  81,  0, +        3,   12,  31,  0, 191, 107, 226, 0, 124, 117, 204, 0, 25,  99,  155, 0, 0,   0,   0,   0, +        0,   0,   0,   0, 0,   0,   0,   0, 29,  148, 210, 0, 37,  126, 194, 0, 8,   93,  157, 0, +        2,   68,  118, 0, 1,   39,  69,  0, 1,   17,  33,  0, 41,  151, 213, 0, 27,  123, 193, 0, +        3,   82,  144, 0, 1,   58,  105, 0, 1,   32,  60,  0, 1,   13,  26,  0, 59,  159, 220, 0, +        23,  126, 198, 0, 4,   88,  151, 0, 1,   66,  114, 0, 1,   38,  71,  0, 1,   18,  34,  0, +        114, 136, 232, 0, 51,  114, 207, 0, 11,  83,  155, 0, 3,   56,  105, 0, 1,   33,  65,  0, +        1,   17,  34,  0, 149, 65,  234, 0, 121, 57,  215, 0, 61,  49,  166, 0, 28,  36,  114, 0, +        12,  25,  76,  0, 3,   16,  42,  0, 214, 49,  220, 0, 132, 63,  188, 0, 42,  65,  137, 0, +        0,   0,   0,   0, 0,   0,   0,   0, 0,   0,   0,   0, 85,  137, 221, 0, 104, 131, 216, 0, +        49,  111, 192, 0, 21,  87,  155, 0, 2,   49,  87,  0, 1,   16,  28,  0, 89,  163, 230, 0, +        90,  137, 220, 0, 29,  100, 183, 0, 10,  70,  135, 0, 2,   42,  81,  0, 1,   17,  33,  0, +        108, 167, 237, 0, 55,  133, 222, 0, 15,  97,  179, 0, 4,   72,  135, 0, 1,   45,  85,  0, +        1,   19,  38,  0, 124, 146, 240, 0, 66,  124, 224, 0, 17,  88,  175, 0, 4,   58,  122, 0, +        1,   36,  75,  0, 1,   18,  37,  0, 141, 79,  241, 0, 126, 70,  227, 0, 66,  58,  182, 0, +        30,  44,  136, 0, 12,  34,  96,  0, 2,   20,  47,  0, 229, 99,  249, 0, 143, 111, 235, 0, +        46,  109, 192, 0, 0,   0,   
0,   0, 0,   0,   0,   0, 0,   0,   0,   0, 82,  158, 236, 0, +        94,  146, 224, 0, 25,  117, 191, 0, 9,   87,  149, 0, 3,   56,  99,  0, 1,   33,  57,  0, +        83,  167, 237, 0, 68,  145, 222, 0, 10,  103, 177, 0, 2,   72,  131, 0, 1,   41,  79,  0, +        1,   20,  39,  0, 99,  167, 239, 0, 47,  141, 224, 0, 10,  104, 178, 0, 2,   73,  133, 0, +        1,   44,  85,  0, 1,   22,  47,  0, 127, 145, 243, 0, 71,  129, 228, 0, 17,  93,  177, 0, +        3,   61,  124, 0, 1,   41,  84,  0, 1,   21,  52,  0, 157, 78,  244, 0, 140, 72,  231, 0, +        69,  58,  184, 0, 31,  44,  137, 0, 14,  38,  105, 0, 8,   23,  61,  0, 125, 34,  187, 0, +        52,  41,  133, 0, 6,   31,  56,  0, 0,   0,   0,   0, 0,   0,   0,   0, 0,   0,   0,   0, +        37,  109, 153, 0, 51,  102, 147, 0, 23,  87,  128, 0, 8,   67,  101, 0, 1,   41,  63,  0, +        1,   19,  29,  0, 31,  154, 185, 0, 17,  127, 175, 0, 6,   96,  145, 0, 2,   73,  114, 0, +        1,   51,  82,  0, 1,   28,  45,  0, 23,  163, 200, 0, 10,  131, 185, 0, 2,   93,  148, 0, +        1,   67,  111, 0, 1,   41,  69,  0, 1,   14,  24,  0, 29,  176, 217, 0, 12,  145, 201, 0, +        3,   101, 156, 0, 1,   69,  111, 0, 1,   39,  63,  0, 1,   14,  23,  0, 57,  192, 233, 0, +        25,  154, 215, 0, 6,   109, 167, 0, 3,   78,  118, 0, 1,   48,  69,  0, 1,   21,  29,  0, +        202, 105, 245, 0, 108, 106, 216, 0, 18,  90,  144, 0, 0,   0,   0,   0, 0,   0,   0,   0, +        0,   0,   0,   0, 33,  172, 219, 0, 64,  149, 206, 0, 14,  117, 177, 0, 5,   90,  141, 0, +        2,   61,  95,  0, 1,   37,  57,  0, 33,  179, 220, 0, 11,  140, 198, 0, 1,   89,  148, 0, +        1,   60,  104, 0, 1,   33,  57,  0, 1,   12,  21,  0, 30,  181, 221, 0, 8,   141, 198, 0, +        1,   87,  145, 0, 1,   58,  100, 0, 1,   31,  55,  0, 1,   12,  20,  0, 32,  186, 224, 0, +        7,   142, 198, 0, 1,   86,  143, 0, 1,   58,  100, 0, 1,   31,  55,  0, 1,   12,  22,  0, +        57,  192, 227, 0, 20,  143, 204, 0, 3,   96,  
154, 0, 1,   68,  112, 0, 1,   42,  69,  0, +        1,   19,  32,  0, 212, 35,  215, 0, 113, 47,  169, 0, 29,  48,  105, 0, 0,   0,   0,   0, +        0,   0,   0,   0, 0,   0,   0,   0, 74,  129, 203, 0, 106, 120, 203, 0, 49,  107, 178, 0, +        19,  84,  144, 0, 4,   50,  84,  0, 1,   15,  25,  0, 71,  172, 217, 0, 44,  141, 209, 0, +        15,  102, 173, 0, 6,   76,  133, 0, 2,   51,  89,  0, 1,   24,  42,  0, 64,  185, 231, 0, +        31,  148, 216, 0, 8,   103, 175, 0, 3,   74,  131, 0, 1,   46,  81,  0, 1,   18,  30,  0, +        65,  196, 235, 0, 25,  157, 221, 0, 5,   105, 174, 0, 1,   67,  120, 0, 1,   38,  69,  0, +        1,   15,  30,  0, 65,  204, 238, 0, 30,  156, 224, 0, 7,   107, 177, 0, 2,   70,  124, 0, +        1,   42,  73,  0, 1,   18,  34,  0, 225, 86,  251, 0, 144, 104, 235, 0, 42,  99,  181, 0, +        0,   0,   0,   0, 0,   0,   0,   0, 0,   0,   0,   0, 85,  175, 239, 0, 112, 165, 229, 0, +        29,  136, 200, 0, 12,  103, 162, 0, 6,   77,  123, 0, 2,   53,  84,  0, 75,  183, 239, 0, +        30,  155, 221, 0, 3,   106, 171, 0, 1,   74,  128, 0, 1,   44,  76,  0, 1,   17,  28,  0, +        73,  185, 240, 0, 27,  159, 222, 0, 2,   107, 172, 0, 1,   75,  127, 0, 1,   42,  73,  0, +        1,   17,  29,  0, 62,  190, 238, 0, 21,  159, 222, 0, 2,   107, 172, 0, 1,   72,  122, 0, +        1,   40,  71,  0, 1,   18,  32,  0, 61,  199, 240, 0, 27,  161, 226, 0, 4,   113, 180, 0, +        1,   76,  129, 0, 1,   46,  80,  0, 1,   23,  41,  0, 7,   27,  153, 0, 5,   30,  95,  0, +        1,   16,  30,  0, 0,   0,   0,   0, 0,   0,   0,   0, 0,   0,   0,   0, 50,  75,  127, 0, +        57,  75,  124, 0, 27,  67,  108, 0, 10,  54,  86,  0, 1,   33,  52,  0, 1,   12,  18,  0, +        43,  125, 151, 0, 26,  108, 148, 0, 7,   83,  122, 0, 2,   59,  89,  0, 1,   38,  60,  0, +        1,   17,  27,  0, 23,  144, 163, 0, 13,  112, 154, 0, 2,   75,  117, 0, 1,   50,  81,  0, +        1,   31,  51,  0, 1,   14,  23,  0, 18,  162, 185, 0, 6,   123, 
171, 0, 1,   78,  125, 0, +        1,   51,  86,  0, 1,   31,  54,  0, 1,   14,  23,  0, 15,  199, 227, 0, 3,   150, 204, 0, +        1,   91,  146, 0, 1,   55,  95,  0, 1,   30,  53,  0, 1,   11,  20,  0, 19,  55,  240, 0, +        19,  59,  196, 0, 3,   52,  105, 0, 0,   0,   0,   0, 0,   0,   0,   0, 0,   0,   0,   0, +        41,  166, 207, 0, 104, 153, 199, 0, 31,  123, 181, 0, 14,  101, 152, 0, 5,   72,  106, 0, +        1,   36,  52,  0, 35,  176, 211, 0, 12,  131, 190, 0, 2,   88,  144, 0, 1,   60,  101, 0, +        1,   36,  60,  0, 1,   16,  28,  0, 28,  183, 213, 0, 8,   134, 191, 0, 1,   86,  142, 0, +        1,   56,  96,  0, 1,   30,  53,  0, 1,   12,  20,  0, 20,  190, 215, 0, 4,   135, 192, 0, +        1,   84,  139, 0, 1,   53,  91,  0, 1,   28,  49,  0, 1,   11,  20,  0, 13,  196, 216, 0, +        2,   137, 192, 0, 1,   86,  143, 0, 1,   57,  99,  0, 1,   32,  56,  0, 1,   13,  24,  0, +        211, 29,  217, 0, 96,  47,  156, 0, 22,  43,  87,  0, 0,   0,   0,   0, 0,   0,   0,   0, +        0,   0,   0,   0, 78,  120, 193, 0, 111, 116, 186, 0, 46,  102, 164, 0, 15,  80,  128, 0, +        2,   49,  76,  0, 1,   18,  28,  0, 71,  161, 203, 0, 42,  132, 192, 0, 10,  98,  150, 0, +        3,   69,  109, 0, 1,   44,  70,  0, 1,   18,  29,  0, 57,  186, 211, 0, 30,  140, 196, 0, +        4,   93,  146, 0, 1,   62,  102, 0, 1,   38,  65,  0, 1,   16,  27,  0, 47,  199, 217, 0, +        14,  145, 196, 0, 1,   88,  142, 0, 1,   57,  98,  0, 1,   36,  62,  0, 1,   15,  26,  0, +        26,  219, 229, 0, 5,   155, 207, 0, 1,   94,  151, 0, 1,   60,  104, 0, 1,   36,  62,  0, +        1,   16,  28,  0, 233, 29,  248, 0, 146, 47,  220, 0, 43,  52,  140, 0, 0,   0,   0,   0, +        0,   0,   0,   0, 0,   0,   0,   0, 100, 163, 232, 0, 179, 161, 222, 0, 63,  142, 204, 0, +        37,  113, 174, 0, 26,  89,  137, 0, 18,  68,  97,  0, 85,  181, 230, 0, 32,  146, 209, 0, +        7,   100, 164, 0, 3,   71,  121, 0, 1,   45,  77,  0, 1,   18,  30,  0, 65,  187, 
230, 0, +        20,  148, 207, 0, 2,   97,  159, 0, 1,   68,  116, 0, 1,   40,  70,  0, 1,   14,  29,  0, +        40,  194, 227, 0, 8,   147, 204, 0, 1,   94,  155, 0, 1,   65,  112, 0, 1,   39,  66,  0, +        1,   14,  26,  0, 16,  208, 228, 0, 3,   151, 207, 0, 1,   98,  160, 0, 1,   67,  117, 0, +        1,   41,  74,  0, 1,   17,  31,  0, 17,  38,  140, 0, 7,   34,  80,  0, 1,   17,  29,  0, +        0,   0,   0,   0, 0,   0,   0,   0, 0,   0,   0,   0, 37,  75,  128, 0, 41,  76,  128, 0, +        26,  66,  116, 0, 12,  52,  94,  0, 2,   32,  55,  0, 1,   10,  16,  0, 50,  127, 154, 0, +        37,  109, 152, 0, 16,  82,  121, 0, 5,   59,  85,  0, 1,   35,  54,  0, 1,   13,  20,  0, +        40,  142, 167, 0, 17,  110, 157, 0, 2,   71,  112, 0, 1,   44,  72,  0, 1,   27,  45,  0, +        1,   11,  17,  0, 30,  175, 188, 0, 9,   124, 169, 0, 1,   74,  116, 0, 1,   48,  78,  0, +        1,   30,  49,  0, 1,   11,  18,  0, 10,  222, 223, 0, 2,   150, 194, 0, 1,   83,  128, 0, +        1,   48,  79,  0, 1,   27,  45,  0, 1,   11,  17,  0, 36,  41,  235, 0, 29,  36,  193, 0, +        10,  27,  111, 0, 0,   0,   0,   0, 0,   0,   0,   0, 0,   0,   0,   0, 85,  165, 222, 0, +        177, 162, 215, 0, 110, 135, 195, 0, 57,  113, 168, 0, 23,  83,  120, 0, 10,  49,  61,  0, +        85,  190, 223, 0, 36,  139, 200, 0, 5,   90,  146, 0, 1,   60,  103, 0, 1,   38,  65,  0, +        1,   18,  30,  0, 72,  202, 223, 0, 23,  141, 199, 0, 2,   86,  140, 0, 1,   56,  97,  0, +        1,   36,  61,  0, 1,   16,  27,  0, 55,  218, 225, 0, 13,  145, 200, 0, 1,   86,  141, 0, +        1,   57,  99,  0, 1,   35,  61,  0, 1,   13,  22,  0, 15,  235, 212, 0, 1,   132, 184, 0, +        1,   84,  139, 0, 1,   57,  97,  0, 1,   34,  56,  0, 1,   14,  23,  0, 181, 21,  201, 0, +        61,  37,  123, 0, 10,  38,  71,  0, 0,   0,   0,   0, 0,   0,   0,   0, 0,   0,   0,   0, +        47,  106, 172, 0, 95,  104, 173, 0, 42,  93,  159, 0, 18,  77,  131, 0, 4,   50,  81,  0, +        1, 
  17,  23,  0, 62,  147, 199, 0, 44,  130, 189, 0, 28,  102, 154, 0, 18,  75,  115, 0, +        2,   44,  65,  0, 1,   12,  19,  0, 55,  153, 210, 0, 24,  130, 194, 0, 3,   93,  146, 0, +        1,   61,  97,  0, 1,   31,  50,  0, 1,   10,  16,  0, 49,  186, 223, 0, 17,  148, 204, 0, +        1,   96,  142, 0, 1,   53,  83,  0, 1,   26,  44,  0, 1,   11,  17,  0, 13,  217, 212, 0, +        2,   136, 180, 0, 1,   78,  124, 0, 1,   50,  83,  0, 1,   29,  49,  0, 1,   14,  23,  0, +        197, 13,  247, 0, 82,  17,  222, 0, 25,  17,  162, 0, 0,   0,   0,   0, 0,   0,   0,   0, +        0,   0,   0,   0, 126, 186, 247, 0, 234, 191, 243, 0, 176, 177, 234, 0, 104, 158, 220, 0, +        66,  128, 186, 0, 55,  90,  137, 0, 111, 197, 242, 0, 46,  158, 219, 0, 9,   104, 171, 0, +        2,   65,  125, 0, 1,   44,  80,  0, 1,   17,  91,  0, 104, 208, 245, 0, 39,  168, 224, 0, +        3,   109, 162, 0, 1,   79,  124, 0, 1,   50,  102, 0, 1,   43,  102, 0, 84,  220, 246, 0, +        31,  177, 231, 0, 2,   115, 180, 0, 1,   79,  134, 0, 1,   55,  77,  0, 1,   60,  79,  0, +        43,  243, 240, 0, 8,   180, 217, 0, 1,   115, 166, 0, 1,   84,  121, 0, 1,   51,  67,  0, +        1,   16,  6,   0, +    }, +    .switchable_interp_prob{235, 162, 36, 255, 34, 3, 149, 144}, +    .inter_mode_prob{ +        2,  173, 34, 0,  7,  145, 85, 0,  7,  166, 63, 0,  7,  94, +        66, 0,   8,  64, 46, 0,   17, 81, 31, 0,   25, 29, 30, 0, +    }, +    .intra_inter_prob{9, 102, 187, 225}, +    .comp_inter_prob{9, 102, 187, 225, 0}, +    .single_ref_prob{33, 16, 77, 74, 142, 142, 172, 170, 238, 247}, +    .comp_ref_prob{50, 126, 123, 221, 226}, +    .tx_32x32_prob{3, 136, 37, 5, 52, 13}, +    .tx_16x16_prob{20, 152, 15, 101}, +    .tx_8x8_prob{100, 66}, +    .skip_probs{192, 128, 64}, +    .joints{32, 64, 96}, +    .sign{128, 128}, +    .classes{ +        224, 144, 192, 168, 192, 176, 192, 198, 198, 245, +        216, 128, 176, 160, 176, 176, 192, 198, 198, 208, +    }, +    .class_0{216, 208}, 
+    .prob_bits{ +        136, 140, 148, 160, 176, 192, 224, 234, 234, 240, +        136, 140, 148, 160, 176, 192, 224, 234, 234, 240, +    }, +    .class_0_fr{128, 128, 64, 96, 112, 64, 128, 128, 64, 96, 112, 64}, +    .fr{64, 96, 64, 64, 96, 64}, +    .class_0_hp{160, 160}, +    .high_precision{128, 128}, +}; + +VP9::VP9(GPU& gpu) : gpu(gpu) {} + +VP9::~VP9() = default; + +void VP9::WriteProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) { +    const bool update = new_prob != old_prob; + +    writer.Write(update, diff_update_probability); + +    if (update) { +        WriteProbabilityDelta(writer, new_prob, old_prob); +    } +} +template <typename T, std::size_t N> +void VP9::WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array<T, N>& new_prob, +                                 const std::array<T, N>& old_prob) { +    for (std::size_t offset = 0; offset < new_prob.size(); ++offset) { +        WriteProbabilityUpdate(writer, new_prob[offset], old_prob[offset]); +    } +} + +template <typename T, std::size_t N> +void VP9::WriteProbabilityUpdateAligned4(VpxRangeEncoder& writer, const std::array<T, N>& new_prob, +                                         const std::array<T, N>& old_prob) { +    for (std::size_t offset = 0; offset < new_prob.size(); offset += 4) { +        WriteProbabilityUpdate(writer, new_prob[offset + 0], old_prob[offset + 0]); +        WriteProbabilityUpdate(writer, new_prob[offset + 1], old_prob[offset + 1]); +        WriteProbabilityUpdate(writer, new_prob[offset + 2], old_prob[offset + 2]); +    } +} + +void VP9::WriteProbabilityDelta(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) { +    const int delta = RemapProbability(new_prob, old_prob); + +    EncodeTermSubExp(writer, delta); +} + +s32 VP9::RemapProbability(s32 new_prob, s32 old_prob) { +    new_prob--; +    old_prob--; + +    std::size_t index{}; + +    if (old_prob * 2 <= 0xff) { +        index = static_cast<std::size_t>(std::max(0, 
RecenterNonNeg(new_prob, old_prob) - 1)); +    } else { +        index = static_cast<std::size_t>( +            std::max(0, RecenterNonNeg(0xff - 1 - new_prob, 0xff - 1 - old_prob) - 1)); +    } + +    return map_lut[index]; +} + +s32 VP9::RecenterNonNeg(s32 new_prob, s32 old_prob) { +    if (new_prob > old_prob * 2) { +        return new_prob; +    } else if (new_prob >= old_prob) { +        return (new_prob - old_prob) * 2; +    } else { +        return (old_prob - new_prob) * 2 - 1; +    } +} + +void VP9::EncodeTermSubExp(VpxRangeEncoder& writer, s32 value) { +    if (WriteLessThan(writer, value, 16)) { +        writer.Write(value, 4); +    } else if (WriteLessThan(writer, value, 32)) { +        writer.Write(value - 16, 4); +    } else if (WriteLessThan(writer, value, 64)) { +        writer.Write(value - 32, 5); +    } else { +        value -= 64; + +        constexpr s32 size = 8; + +        const s32 mask = (1 << size) - 191; + +        const s32 delta = value - mask; + +        if (delta < 0) { +            writer.Write(value, size - 1); +        } else { +            writer.Write(delta / 2 + mask, size - 1); +            writer.Write(delta & 1, 1); +        } +    } +} + +bool VP9::WriteLessThan(VpxRangeEncoder& writer, s32 value, s32 test) { +    const bool is_lt = value < test; +    writer.Write(!is_lt); +    return is_lt; +} + +void VP9::WriteCoefProbabilityUpdate(VpxRangeEncoder& writer, s32 tx_mode, +                                     const std::array<u8, 2304>& new_prob, +                                     const std::array<u8, 2304>& old_prob) { +    // Note: There's 1 byte added on each packet for alignment, +    // this byte is ignored when doing updates. 
+    constexpr s32 block_bytes = 2 * 2 * 6 * 6 * 4; + +    const auto needs_update = [&](s32 base_index) -> bool { +        s32 index = base_index; +        for (s32 i = 0; i < 2; i++) { +            for (s32 j = 0; j < 2; j++) { +                for (s32 k = 0; k < 6; k++) { +                    for (s32 l = 0; l < 6; l++) { +                        if (new_prob[index + 0] != old_prob[index + 0] || +                            new_prob[index + 1] != old_prob[index + 1] || +                            new_prob[index + 2] != old_prob[index + 2]) { +                            return true; +                        } + +                        index += 4; +                    } +                } +            } +        } +        return false; +    }; + +    for (s32 block_index = 0; block_index < 4; block_index++) { +        const s32 base_index = block_index * block_bytes; +        const bool update = needs_update(base_index); +        writer.Write(update); + +        if (update) { +            s32 index = base_index; +            for (s32 i = 0; i < 2; i++) { +                for (s32 j = 0; j < 2; j++) { +                    for (s32 k = 0; k < 6; k++) { +                        for (s32 l = 0; l < 6; l++) { +                            if (k != 0 || l < 3) { +                                WriteProbabilityUpdate(writer, new_prob[index + 0], +                                                       old_prob[index + 0]); +                                WriteProbabilityUpdate(writer, new_prob[index + 1], +                                                       old_prob[index + 1]); +                                WriteProbabilityUpdate(writer, new_prob[index + 2], +                                                       old_prob[index + 2]); +                            } +                            index += 4; +                        } +                    } +                } +            } +        } + +        if (block_index == tx_mode) { +            break; 
+        } +    } +} + +void VP9::WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) { +    const bool update = new_prob != old_prob; +    writer.Write(update, diff_update_probability); + +    if (update) { +        writer.Write(new_prob >> 1, 7); +    } +} + +s32 VP9::CalcMinLog2TileCols(s32 frame_width) { +    const s32 sb64_cols = (frame_width + 63) / 64; +    s32 min_log2 = 0; + +    while ((64 << min_log2) < sb64_cols) { +        min_log2++; +    } + +    return min_log2; +} + +s32 VP9::CalcMaxLog2TileCols(s32 frameWidth) { +    const s32 sb64_cols = (frameWidth + 63) / 64; +    s32 max_log2 = 1; + +    while ((sb64_cols >> max_log2) >= 4) { +        max_log2++; +    } + +    return max_log2 - 1; +} + +Vp9PictureInfo VP9::GetVp9PictureInfo(const NvdecCommon::NvdecRegisters& state) { +    PictureInfo picture_info{}; +    gpu.MemoryManager().ReadBlock(state.picture_info_offset, &picture_info, sizeof(PictureInfo)); +    Vp9PictureInfo vp9_info = picture_info.Convert(); + +    InsertEntropy(state.vp9_entropy_probs_offset, vp9_info.entropy); + +    // surface_luma_offset[0:3] contains the address of the reference frame offsets in the following +    // order: last, golden, altref, current. It may be worthwhile to track the updates done here +    // to avoid buffering frame data needed for reference frame updating in the header composition. 
+    std::memcpy(vp9_info.frame_offsets.data(), state.surface_luma_offset.data(), 4 * sizeof(u64)); + +    return std::move(vp9_info); +} + +void VP9::InsertEntropy(u64 offset, Vp9EntropyProbs& dst) { +    EntropyProbs entropy{}; +    gpu.MemoryManager().ReadBlock(offset, &entropy, sizeof(EntropyProbs)); +    entropy.Convert(dst); +} + +Vp9FrameContainer VP9::GetCurrentFrame(const NvdecCommon::NvdecRegisters& state) { +    Vp9FrameContainer frame{}; +    { +        gpu.SyncGuestHost(); +        frame.info = std::move(GetVp9PictureInfo(state)); + +        frame.bit_stream.resize(frame.info.bitstream_size); +        gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, frame.bit_stream.data(), +                                      frame.info.bitstream_size); +    } +    // Buffer two frames, saving the last show frame info +    if (next_next_frame.bit_stream.size() != 0) { +        Vp9FrameContainer temp{ +            .info = frame.info, +            .bit_stream = frame.bit_stream, +        }; +        next_next_frame.info.show_frame = frame.info.last_frame_shown; +        frame.info = next_next_frame.info; +        frame.bit_stream = next_next_frame.bit_stream; +        next_next_frame = std::move(temp); + +        if (next_frame.bit_stream.size() != 0) { +            Vp9FrameContainer temp{ +                .info = frame.info, +                .bit_stream = frame.bit_stream, +            }; +            next_frame.info.show_frame = frame.info.last_frame_shown; +            frame.info = next_frame.info; +            frame.bit_stream = next_frame.bit_stream; +            next_frame = std::move(temp); +        } else { +            next_frame.info = frame.info; +            next_frame.bit_stream = frame.bit_stream; +        } +    } else { +        next_next_frame.info = frame.info; +        next_next_frame.bit_stream = frame.bit_stream; +    } +    return frame; +} + +std::vector<u8> VP9::ComposeCompressedHeader() { +    VpxRangeEncoder writer{}; + +    if 
(!current_frame_info.lossless) { +        if (static_cast<u32>(current_frame_info.transform_mode) >= 3) { +            writer.Write(3, 2); +            writer.Write(current_frame_info.transform_mode == 4); +        } else { +            writer.Write(current_frame_info.transform_mode, 2); +        } +    } + +    if (current_frame_info.transform_mode == 4) { +        // tx_mode_probs() in the spec +        WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_8x8_prob, +                               prev_frame_probs.tx_8x8_prob); +        WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_16x16_prob, +                               prev_frame_probs.tx_16x16_prob); +        WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_32x32_prob, +                               prev_frame_probs.tx_32x32_prob); +        if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { +            prev_frame_probs.tx_8x8_prob = current_frame_info.entropy.tx_8x8_prob; +            prev_frame_probs.tx_16x16_prob = current_frame_info.entropy.tx_16x16_prob; +            prev_frame_probs.tx_32x32_prob = current_frame_info.entropy.tx_32x32_prob; +        } +    } +    // read_coef_probs()  in the spec +    WriteCoefProbabilityUpdate(writer, current_frame_info.transform_mode, +                               current_frame_info.entropy.coef_probs, prev_frame_probs.coef_probs); +    // read_skip_probs()  in the spec +    WriteProbabilityUpdate(writer, current_frame_info.entropy.skip_probs, +                           prev_frame_probs.skip_probs); + +    if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { +        prev_frame_probs.coef_probs = current_frame_info.entropy.coef_probs; +        prev_frame_probs.skip_probs = current_frame_info.entropy.skip_probs; +    } + +    if (!current_frame_info.intra_only) { +        // read_inter_probs() in the spec +        WriteProbabilityUpdateAligned4(writer, 
current_frame_info.entropy.inter_mode_prob, +                                       prev_frame_probs.inter_mode_prob); +        if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { +            prev_frame_probs.inter_mode_prob = current_frame_info.entropy.inter_mode_prob; +        } + +        if (current_frame_info.interp_filter == 4) { +            // read_interp_filter_probs() in the spec +            WriteProbabilityUpdate(writer, current_frame_info.entropy.switchable_interp_prob, +                                   prev_frame_probs.switchable_interp_prob); +            if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { +                prev_frame_probs.switchable_interp_prob = +                    current_frame_info.entropy.switchable_interp_prob; +            } +        } + +        // read_is_inter_probs() in the spec +        WriteProbabilityUpdate(writer, current_frame_info.entropy.intra_inter_prob, +                               prev_frame_probs.intra_inter_prob); +        if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { +            prev_frame_probs.intra_inter_prob = current_frame_info.entropy.intra_inter_prob; +        } +        // frame_reference_mode() in the spec +        if ((current_frame_info.ref_frame_sign_bias[1] & 1) != +                (current_frame_info.ref_frame_sign_bias[2] & 1) || +            (current_frame_info.ref_frame_sign_bias[1] & 1) != +                (current_frame_info.ref_frame_sign_bias[3] & 1)) { +            if (current_frame_info.reference_mode >= 1) { +                writer.Write(1, 1); +                writer.Write(current_frame_info.reference_mode == 2); +            } else { +                writer.Write(0, 1); +            } +        } + +        // frame_reference_mode_probs() in the spec +        if (current_frame_info.reference_mode == 2) { +            WriteProbabilityUpdate(writer, current_frame_info.entropy.comp_inter_prob, +                   
                prev_frame_probs.comp_inter_prob); +            if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { +                prev_frame_probs.comp_inter_prob = current_frame_info.entropy.comp_inter_prob; +            } +        } + +        if (current_frame_info.reference_mode != 1) { +            WriteProbabilityUpdate(writer, current_frame_info.entropy.single_ref_prob, +                                   prev_frame_probs.single_ref_prob); +            if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { +                prev_frame_probs.single_ref_prob = current_frame_info.entropy.single_ref_prob; +            } +        } + +        if (current_frame_info.reference_mode != 0) { +            WriteProbabilityUpdate(writer, current_frame_info.entropy.comp_ref_prob, +                                   prev_frame_probs.comp_ref_prob); +            if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { +                prev_frame_probs.comp_ref_prob = current_frame_info.entropy.comp_ref_prob; +            } +        } + +        // read_y_mode_probs +        for (std::size_t index = 0; index < current_frame_info.entropy.y_mode_prob.size(); +             ++index) { +            WriteProbabilityUpdate(writer, current_frame_info.entropy.y_mode_prob[index], +                                   prev_frame_probs.y_mode_prob[index]); +        } +        if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { +            prev_frame_probs.y_mode_prob = current_frame_info.entropy.y_mode_prob; +        } +        // read_partition_probs +        WriteProbabilityUpdateAligned4(writer, current_frame_info.entropy.partition_prob, +                                       prev_frame_probs.partition_prob); +        if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { +            prev_frame_probs.partition_prob = current_frame_info.entropy.partition_prob; +        } + +        // 
mv_probs +        for (s32 i = 0; i < 3; i++) { +            WriteMvProbabilityUpdate(writer, current_frame_info.entropy.joints[i], +                                     prev_frame_probs.joints[i]); +        } +        if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { +            prev_frame_probs.joints = current_frame_info.entropy.joints; +        } + +        for (s32 i = 0; i < 2; i++) { +            WriteMvProbabilityUpdate(writer, current_frame_info.entropy.sign[i], +                                     prev_frame_probs.sign[i]); + +            for (s32 j = 0; j < 10; j++) { +                const int index = i * 10 + j; + +                WriteMvProbabilityUpdate(writer, current_frame_info.entropy.classes[index], +                                         prev_frame_probs.classes[index]); +            } + +            WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0[i], +                                     prev_frame_probs.class_0[i]); + +            for (s32 j = 0; j < 10; j++) { +                const int index = i * 10 + j; + +                WriteMvProbabilityUpdate(writer, current_frame_info.entropy.prob_bits[index], +                                         prev_frame_probs.prob_bits[index]); +            } +        } + +        for (s32 i = 0; i < 2; i++) { +            for (s32 j = 0; j < 2; j++) { +                for (s32 k = 0; k < 3; k++) { +                    const int index = i * 2 * 3 + j * 3 + k; + +                    WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0_fr[index], +                                             prev_frame_probs.class_0_fr[index]); +                } +            } + +            for (s32 j = 0; j < 3; j++) { +                const int index = i * 3 + j; + +                WriteMvProbabilityUpdate(writer, current_frame_info.entropy.fr[index], +                                         prev_frame_probs.fr[index]); +            } +        } + +        if 
(current_frame_info.allow_high_precision_mv) { +            for (s32 index = 0; index < 2; index++) { +                WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0_hp[index], +                                         prev_frame_probs.class_0_hp[index]); +                WriteMvProbabilityUpdate(writer, current_frame_info.entropy.high_precision[index], +                                         prev_frame_probs.high_precision[index]); +            } +        } + +        // save previous probs +        if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { +            prev_frame_probs.sign = current_frame_info.entropy.sign; +            prev_frame_probs.classes = current_frame_info.entropy.classes; +            prev_frame_probs.class_0 = current_frame_info.entropy.class_0; +            prev_frame_probs.prob_bits = current_frame_info.entropy.prob_bits; +            prev_frame_probs.class_0_fr = current_frame_info.entropy.class_0_fr; +            prev_frame_probs.fr = current_frame_info.entropy.fr; +            prev_frame_probs.class_0_hp = current_frame_info.entropy.class_0_hp; +            prev_frame_probs.high_precision = current_frame_info.entropy.high_precision; +        } +    } + +    writer.End(); +    return writer.GetBuffer(); + +    const auto writer_bytearray = writer.GetBuffer(); + +    std::vector<u8> compressed_header(writer_bytearray.size()); +    std::memcpy(compressed_header.data(), writer_bytearray.data(), writer_bytearray.size()); +    return compressed_header; +} + +VpxBitStreamWriter VP9::ComposeUncompressedHeader() { +    VpxBitStreamWriter uncomp_writer{}; + +    uncomp_writer.WriteU(2, 2);                                      // Frame marker. +    uncomp_writer.WriteU(0, 2);                                      // Profile. +    uncomp_writer.WriteBit(false);                                   // Show existing frame. +    uncomp_writer.WriteBit(!current_frame_info.is_key_frame);        // is key frame? 
+    uncomp_writer.WriteBit(current_frame_info.show_frame);           // show frame? +    uncomp_writer.WriteBit(current_frame_info.error_resilient_mode); // error reslience + +    if (current_frame_info.is_key_frame) { +        uncomp_writer.WriteU(frame_sync_code, 24); +        uncomp_writer.WriteU(0, 3); // Color space. +        uncomp_writer.WriteU(0, 1); // Color range. +        uncomp_writer.WriteU(current_frame_info.frame_size.width - 1, 16); +        uncomp_writer.WriteU(current_frame_info.frame_size.height - 1, 16); +        uncomp_writer.WriteBit(false); // Render and frame size different. + +        // Reset context +        prev_frame_probs = default_probs; +        swap_next_golden = false; +        loop_filter_ref_deltas.fill(0); +        loop_filter_mode_deltas.fill(0); + +        // allow frames offsets to stabilize before checking for golden frames +        grace_period = 4; + +        // On key frames, all frame slots are set to the current frame, +        // so the value of the selected slot doesn't really matter. +        frame_ctxs.fill({current_frame_number, false, default_probs}); + +        // intra only, meaning the frame can be recreated with no other references +        current_frame_info.intra_only = true; + +    } else { +        std::array<s32, 3> ref_frame_index; + +        if (!current_frame_info.show_frame) { +            uncomp_writer.WriteBit(current_frame_info.intra_only); +            if (!current_frame_info.last_frame_was_key) { +                swap_next_golden = !swap_next_golden; +            } +        } else { +            current_frame_info.intra_only = false; +        } +        if (!current_frame_info.error_resilient_mode) { +            uncomp_writer.WriteU(0, 2); // Reset frame context. 
+        } + +        // Last, Golden, Altref frames +        ref_frame_index = std::array<s32, 3>{0, 1, 2}; + +        // set when next frame is hidden +        // altref and golden references are swapped +        if (swap_next_golden) { +            ref_frame_index = std::array<s32, 3>{0, 2, 1}; +        } + +        // update Last Frame +        u64 refresh_frame_flags = 1; + +        // golden frame may refresh, determined if the next golden frame offset is changed +        bool golden_refresh = false; +        if (grace_period <= 0) { +            for (s32 index = 1; index < 3; ++index) { +                if (current_frame_info.frame_offsets[index] != +                    next_frame.info.frame_offsets[index]) { +                    current_frame_info.refresh_frame[index] = true; +                    golden_refresh = true; +                    grace_period = 3; +                } +            } +        } + +        if (current_frame_info.show_frame && +            (!next_frame.info.show_frame || next_frame.info.is_key_frame)) { +            // Update golden frame +            refresh_frame_flags = swap_next_golden ? 2 : 4; +        } + +        if (!current_frame_info.show_frame) { +            // Update altref +            refresh_frame_flags = swap_next_golden ? 2 : 4; +        } else if (golden_refresh) { +            refresh_frame_flags = 3; +        } + +        if (current_frame_info.intra_only) { +            uncomp_writer.WriteU(frame_sync_code, 24); +            uncomp_writer.WriteU(static_cast<s32>(refresh_frame_flags), 8); +            uncomp_writer.WriteU(current_frame_info.frame_size.width - 1, 16); +            uncomp_writer.WriteU(current_frame_info.frame_size.height - 1, 16); +            uncomp_writer.WriteBit(false); // Render and frame size different. 
+        } else { +            uncomp_writer.WriteU(static_cast<s32>(refresh_frame_flags), 8); + +            for (s32 index = 1; index < 4; index++) { +                uncomp_writer.WriteU(ref_frame_index[index - 1], 3); +                uncomp_writer.WriteU(current_frame_info.ref_frame_sign_bias[index], 1); +            } + +            uncomp_writer.WriteBit(true);  // Frame size with refs. +            uncomp_writer.WriteBit(false); // Render and frame size different. +            uncomp_writer.WriteBit(current_frame_info.allow_high_precision_mv); +            uncomp_writer.WriteBit(current_frame_info.interp_filter == 4); + +            if (current_frame_info.interp_filter != 4) { +                uncomp_writer.WriteU(current_frame_info.interp_filter, 2); +            } +        } +    } + +    if (!current_frame_info.error_resilient_mode) { +        uncomp_writer.WriteBit(true); // Refresh frame context. where do i get this info from? +        uncomp_writer.WriteBit(true); // Frame parallel decoding mode. +    } + +    int frame_ctx_idx = 0; +    if (!current_frame_info.show_frame) { +        frame_ctx_idx = 1; +    } + +    uncomp_writer.WriteU(frame_ctx_idx, 2); // Frame context index. 
+    prev_frame_probs = +        frame_ctxs[frame_ctx_idx].probs; // reference probabilities for compressed header +    frame_ctxs[frame_ctx_idx] = {current_frame_number, false, current_frame_info.entropy}; + +    uncomp_writer.WriteU(current_frame_info.first_level, 6); +    uncomp_writer.WriteU(current_frame_info.sharpness_level, 3); +    uncomp_writer.WriteBit(current_frame_info.mode_ref_delta_enabled); + +    if (current_frame_info.mode_ref_delta_enabled) { +        // check if ref deltas are different, update accordingly +        std::array<bool, 4> update_loop_filter_ref_deltas; +        std::array<bool, 2> update_loop_filter_mode_deltas; + +        bool loop_filter_delta_update = false; + +        for (std::size_t index = 0; index < current_frame_info.ref_deltas.size(); index++) { +            const s8 old_deltas = loop_filter_ref_deltas[index]; +            const s8 new_deltas = current_frame_info.ref_deltas[index]; + +            loop_filter_delta_update |= +                (update_loop_filter_ref_deltas[index] = old_deltas != new_deltas); +        } + +        for (std::size_t index = 0; index < current_frame_info.mode_deltas.size(); index++) { +            const s8 old_deltas = loop_filter_mode_deltas[index]; +            const s8 new_deltas = current_frame_info.mode_deltas[index]; + +            loop_filter_delta_update |= +                (update_loop_filter_mode_deltas[index] = old_deltas != new_deltas); +        } + +        uncomp_writer.WriteBit(loop_filter_delta_update); + +        if (loop_filter_delta_update) { +            for (std::size_t index = 0; index < current_frame_info.ref_deltas.size(); index++) { +                uncomp_writer.WriteBit(update_loop_filter_ref_deltas[index]); + +                if (update_loop_filter_ref_deltas[index]) { +                    uncomp_writer.WriteS(current_frame_info.ref_deltas[index], 6); +                } +            } + +            for (std::size_t index = 0; index < 
current_frame_info.mode_deltas.size(); index++) { +                uncomp_writer.WriteBit(update_loop_filter_mode_deltas[index]); + +                if (update_loop_filter_mode_deltas[index]) { +                    uncomp_writer.WriteS(current_frame_info.mode_deltas[index], 6); +                } +            } +            // save new deltas +            loop_filter_ref_deltas = current_frame_info.ref_deltas; +            loop_filter_mode_deltas = current_frame_info.mode_deltas; +        } +    } + +    uncomp_writer.WriteU(current_frame_info.base_q_index, 8); + +    uncomp_writer.WriteDeltaQ(current_frame_info.y_dc_delta_q); +    uncomp_writer.WriteDeltaQ(current_frame_info.uv_dc_delta_q); +    uncomp_writer.WriteDeltaQ(current_frame_info.uv_ac_delta_q); + +    uncomp_writer.WriteBit(false); // Segmentation enabled (TODO). + +    const s32 min_tile_cols_log2 = CalcMinLog2TileCols(current_frame_info.frame_size.width); +    const s32 max_tile_cols_log2 = CalcMaxLog2TileCols(current_frame_info.frame_size.width); + +    const s32 tile_cols_log2_diff = current_frame_info.log2_tile_cols - min_tile_cols_log2; +    const s32 tile_cols_log2_inc_mask = (1 << tile_cols_log2_diff) - 1; + +    // If it's less than the maximum, we need to add an extra 0 on the bitstream +    // to indicate that it should stop reading. 
+    if (current_frame_info.log2_tile_cols < max_tile_cols_log2) { +        uncomp_writer.WriteU(tile_cols_log2_inc_mask << 1, tile_cols_log2_diff + 1); +    } else { +        uncomp_writer.WriteU(tile_cols_log2_inc_mask, tile_cols_log2_diff); +    } + +    const bool tile_rows_log2_is_nonzero = current_frame_info.log2_tile_rows != 0; + +    uncomp_writer.WriteBit(tile_rows_log2_is_nonzero); + +    if (tile_rows_log2_is_nonzero) { +        uncomp_writer.WriteBit(current_frame_info.log2_tile_rows > 1); +    } + +    return uncomp_writer; +} + +std::vector<u8>& VP9::ComposeFrameHeader(NvdecCommon::NvdecRegisters& state) { +    std::vector<u8> bitstream; +    { +        Vp9FrameContainer curr_frame = GetCurrentFrame(state); +        current_frame_info = curr_frame.info; +        bitstream = curr_frame.bit_stream; +    } + +    // The uncompressed header routine sets PrevProb parameters needed for the compressed header +    auto uncomp_writer = ComposeUncompressedHeader(); +    std::vector<u8> compressed_header = ComposeCompressedHeader(); + +    uncomp_writer.WriteU(static_cast<s32>(compressed_header.size()), 16); +    uncomp_writer.Flush(); +    std::vector<u8> uncompressed_header = uncomp_writer.GetByteArray(); + +    // Write headers and frame to buffer +    frame.resize(uncompressed_header.size() + compressed_header.size() + bitstream.size()); +    std::memcpy(frame.data(), uncompressed_header.data(), uncompressed_header.size()); +    std::memcpy(frame.data() + uncompressed_header.size(), compressed_header.data(), +                compressed_header.size()); +    std::memcpy(frame.data() + uncompressed_header.size() + compressed_header.size(), +                bitstream.data(), bitstream.size()); + +    // keep track of frame number +    current_frame_number++; +    grace_period--; + +    // don't display hidden frames +    hidden = !current_frame_info.show_frame; +    return frame; +} + +VpxRangeEncoder::VpxRangeEncoder() { +    Write(false); +} + 
+VpxRangeEncoder::~VpxRangeEncoder() = default; + +void VpxRangeEncoder::Write(s32 value, s32 value_size) { +    for (s32 bit = value_size - 1; bit >= 0; bit--) { +        Write(((value >> bit) & 1) != 0); +    } +} + +void VpxRangeEncoder::Write(bool bit) { +    Write(bit, half_probability); +} + +void VpxRangeEncoder::Write(bool bit, s32 probability) { +    u32 local_range = range; +    const u32 split = 1 + (((local_range - 1) * static_cast<u32>(probability)) >> 8); +    local_range = split; + +    if (bit) { +        low_value += split; +        local_range = range - split; +    } + +    s32 shift = norm_lut[local_range]; +    local_range <<= shift; +    count += shift; + +    if (count >= 0) { +        const s32 offset = shift - count; + +        if (((low_value << (offset - 1)) >> 31) != 0) { +            const s32 current_pos = static_cast<s32>(base_stream.GetPosition()); +            base_stream.Seek(-1, Common::SeekOrigin::FromCurrentPos); +            while (base_stream.GetPosition() >= 0 && PeekByte() == 0xff) { +                base_stream.WriteByte(0); + +                base_stream.Seek(-2, Common::SeekOrigin::FromCurrentPos); +            } +            base_stream.WriteByte(static_cast<u8>((PeekByte() + 1))); +            base_stream.Seek(current_pos, Common::SeekOrigin::SetOrigin); +        } +        base_stream.WriteByte(static_cast<u8>((low_value >> (24 - offset)))); + +        low_value <<= offset; +        shift = count; +        low_value &= 0xffffff; +        count -= 8; +    } + +    low_value <<= shift; +    range = local_range; +} + +void VpxRangeEncoder::End() { +    for (std::size_t index = 0; index < 32; ++index) { +        Write(false); +    } +} + +u8 VpxRangeEncoder::PeekByte() { +    const u8 value = base_stream.ReadByte(); +    base_stream.Seek(-1, Common::SeekOrigin::FromCurrentPos); + +    return value; +} + +VpxBitStreamWriter::VpxBitStreamWriter() = default; + +VpxBitStreamWriter::~VpxBitStreamWriter() = default; + +void 
VpxBitStreamWriter::WriteU(u32 value, u32 value_size) { +    WriteBits(value, value_size); +} + +void VpxBitStreamWriter::WriteS(s32 value, u32 value_size) { +    const bool sign = value < 0; +    if (sign) { +        value = -value; +    } + +    WriteBits(static_cast<u32>(value << 1) | (sign ? 1 : 0), value_size + 1); +} + +void VpxBitStreamWriter::WriteDeltaQ(u32 value) { +    const bool delta_coded = value != 0; +    WriteBit(delta_coded); + +    if (delta_coded) { +        WriteBits(value, 4); +    } +} + +void VpxBitStreamWriter::WriteBits(u32 value, u32 bit_count) { +    s32 value_pos = 0; +    s32 remaining = bit_count; + +    while (remaining > 0) { +        s32 copy_size = remaining; + +        const s32 free = GetFreeBufferBits(); + +        if (copy_size > free) { +            copy_size = free; +        } + +        const s32 mask = (1 << copy_size) - 1; + +        const s32 src_shift = (bit_count - value_pos) - copy_size; +        const s32 dst_shift = (buffer_size - buffer_pos) - copy_size; + +        buffer |= ((value >> src_shift) & mask) << dst_shift; + +        value_pos += copy_size; +        buffer_pos += copy_size; +        remaining -= copy_size; +    } +} + +void VpxBitStreamWriter::WriteBit(bool state) { +    WriteBits(state ? 
1 : 0, 1); +} + +s32 VpxBitStreamWriter::GetFreeBufferBits() { +    if (buffer_pos == buffer_size) { +        Flush(); +    } + +    return buffer_size - buffer_pos; +} + +void VpxBitStreamWriter::Flush() { +    if (buffer_pos == 0) { +        return; +    } +    byte_array.push_back(static_cast<u8>(buffer)); +    buffer = 0; +    buffer_pos = 0; +} + +std::vector<u8>& VpxBitStreamWriter::GetByteArray() { +    return byte_array; +} + +const std::vector<u8>& VpxBitStreamWriter::GetByteArray() const { +    return byte_array; +} + +} // namespace Tegra::Decoder diff --git a/src/video_core/command_classes/codecs/vp9.h b/src/video_core/command_classes/codecs/vp9.h new file mode 100644 index 000000000..748e11bae --- /dev/null +++ b/src/video_core/command_classes/codecs/vp9.h @@ -0,0 +1,216 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <unordered_map> +#include <vector> +#include "common/common_funcs.h" +#include "common/common_types.h" +#include "common/stream.h" +#include "video_core/command_classes/codecs/vp9_types.h" +#include "video_core/command_classes/nvdec_common.h" + +namespace Tegra { +class GPU; +enum class FrameType { KeyFrame = 0, InterFrame = 1 }; +namespace Decoder { + +/// The VpxRangeEncoder, and VpxBitStreamWriter classes are used to compose the +/// VP9 header bitstreams. 
+ +class VpxRangeEncoder { +public: +    VpxRangeEncoder(); +    ~VpxRangeEncoder(); + +    /// Writes the rightmost value_size bits from value into the stream +    void Write(s32 value, s32 value_size); + +    /// Writes a single bit with half probability +    void Write(bool bit); + +    /// Writes a bit to the base_stream encoded with probability +    void Write(bool bit, s32 probability); + +    /// Signal the end of the bitstream +    void End(); + +    std::vector<u8>& GetBuffer() { +        return base_stream.GetBuffer(); +    } + +    const std::vector<u8>& GetBuffer() const { +        return base_stream.GetBuffer(); +    } + +private: +    u8 PeekByte(); +    Common::Stream base_stream{}; +    u32 low_value{}; +    u32 range{0xff}; +    s32 count{-24}; +    s32 half_probability{128}; +    static constexpr std::array<s32, 256> norm_lut{ +        0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, +        3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +        2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +    }; +}; + +class VpxBitStreamWriter { +public: +    VpxBitStreamWriter(); +    ~VpxBitStreamWriter(); + +    /// Write an unsigned integer value +    void WriteU(u32 value, u32 value_size); + +    /// Write a signed integer value +    void WriteS(s32 value, u32 value_size); + +    /// 
Based on 6.2.10 of VP9 Spec, writes a delta coded value +    void WriteDeltaQ(u32 value); + +    /// Write a single bit. +    void WriteBit(bool state); + +    /// Pushes current buffer into buffer_array, resets buffer +    void Flush(); + +    /// Returns byte_array +    std::vector<u8>& GetByteArray(); + +    /// Returns const byte_array +    const std::vector<u8>& GetByteArray() const; + +private: +    /// Write bit_count bits from value into buffer +    void WriteBits(u32 value, u32 bit_count); + +    /// Gets next available position in buffer, invokes Flush() if buffer is full +    s32 GetFreeBufferBits(); + +    s32 buffer_size{8}; + +    s32 buffer{}; +    s32 buffer_pos{}; +    std::vector<u8> byte_array; +}; + +class VP9 { +public: +    explicit VP9(GPU& gpu); +    ~VP9(); + +    /// Composes the VP9 frame from the GPU state information. Based on the official VP9 spec +    /// documentation +    std::vector<u8>& ComposeFrameHeader(NvdecCommon::NvdecRegisters& state); + +    /// Returns true if the most recent frame was a hidden frame. +    bool WasFrameHidden() const { +        return hidden; +    } + +private: +    /// Generates compressed header probability updates in the bitstream writer +    template <typename T, std::size_t N> +    void WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array<T, N>& new_prob, +                                const std::array<T, N>& old_prob); + +    /// Generates compressed header probability updates in the bitstream writer +    /// If probs are not equal, WriteProbabilityDelta is invoked +    void WriteProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob); + +    /// Generates compressed header probability deltas in the bitstream writer +    void WriteProbabilityDelta(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob); + +    /// Adjusts old_prob depending on new_prob. 
Based on section 6.3.5 of VP9 Specification +    s32 RemapProbability(s32 new_prob, s32 old_prob); + +    /// Recenters probability. Based on section 6.3.6 of VP9 Specification +    s32 RecenterNonNeg(s32 new_prob, s32 old_prob); + +    /// Inverse of 6.3.4 Decode term subexp +    void EncodeTermSubExp(VpxRangeEncoder& writer, s32 value); + +    /// Writes if the value is less than the test value +    bool WriteLessThan(VpxRangeEncoder& writer, s32 value, s32 test); + +    /// Writes probability updates for the Coef probabilities +    void WriteCoefProbabilityUpdate(VpxRangeEncoder& writer, s32 tx_mode, +                                    const std::array<u8, 2304>& new_prob, +                                    const std::array<u8, 2304>& old_prob); + +    /// Write probabilities for 4-byte aligned structures +    template <typename T, std::size_t N> +    void WriteProbabilityUpdateAligned4(VpxRangeEncoder& writer, const std::array<T, N>& new_prob, +                                        const std::array<T, N>& old_prob); + +    /// Write motion vector probability updates. 
6.3.17 in the spec +    void WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob); + +    /// 6.2.14 Tile size calculation +    s32 CalcMinLog2TileCols(s32 frame_width); +    s32 CalcMaxLog2TileCols(s32 frame_width); + +    /// Returns VP9 information from NVDEC provided offset and size +    Vp9PictureInfo GetVp9PictureInfo(const NvdecCommon::NvdecRegisters& state); + +    /// Read and convert NVDEC provided entropy probs to Vp9EntropyProbs struct +    void InsertEntropy(u64 offset, Vp9EntropyProbs& dst); + +    /// Returns frame to be decoded after buffering +    Vp9FrameContainer GetCurrentFrame(const NvdecCommon::NvdecRegisters& state); + +    /// Use NVDEC providied information to compose the headers for the current frame +    std::vector<u8> ComposeCompressedHeader(); +    VpxBitStreamWriter ComposeUncompressedHeader(); + +    GPU& gpu; +    std::vector<u8> frame; + +    std::array<s8, 4> loop_filter_ref_deltas{}; +    std::array<s8, 2> loop_filter_mode_deltas{}; + +    bool hidden; +    s64 current_frame_number = -2; // since we buffer 2 frames +    s32 grace_period = 6;          // frame offsets need to stabilize +    std::array<FrameContexts, 4> frame_ctxs{}; +    Vp9FrameContainer next_frame{}; +    Vp9FrameContainer next_next_frame{}; +    bool swap_next_golden{}; + +    Vp9PictureInfo current_frame_info{}; +    Vp9EntropyProbs prev_frame_probs{}; + +    s32 diff_update_probability = 252; +    s32 frame_sync_code = 0x498342; +    static constexpr std::array<s32, 254> map_lut = { +        20,  21,  22,  23,  24,  25,  0,   26,  27,  28,  29,  30,  31,  32,  33,  34,  35, +        36,  37,  1,   38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  2,   50, +        51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  3,   62,  63,  64,  65,  66, +        67,  68,  69,  70,  71,  72,  73,  4,   74,  75,  76,  77,  78,  79,  80,  81,  82, +        83,  84,  85,  5,   86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  
6, +        98,  99,  100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 7,   110, 111, 112, 113, +        114, 115, 116, 117, 118, 119, 120, 121, 8,   122, 123, 124, 125, 126, 127, 128, 129, +        130, 131, 132, 133, 9,   134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, +        10,  146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 11,  158, 159, 160, +        161, 162, 163, 164, 165, 166, 167, 168, 169, 12,  170, 171, 172, 173, 174, 175, 176, +        177, 178, 179, 180, 181, 13,  182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, +        193, 14,  194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 15,  206, 207, +        208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 16,  218, 219, 220, 221, 222, 223, +        224, 225, 226, 227, 228, 229, 17,  230, 231, 232, 233, 234, 235, 236, 237, 238, 239, +        240, 241, 18,  242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 19, +    }; +}; + +} // namespace Decoder +} // namespace Tegra diff --git a/src/video_core/command_classes/codecs/vp9_types.h b/src/video_core/command_classes/codecs/vp9_types.h new file mode 100644 index 000000000..8688fdac0 --- /dev/null +++ b/src/video_core/command_classes/codecs/vp9_types.h @@ -0,0 +1,369 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include <algorithm> +#include <list> +#include <vector> +#include "common/cityhash.h" +#include "common/common_funcs.h" +#include "common/common_types.h" +#include "video_core/command_classes/nvdec_common.h" + +namespace Tegra { +class GPU; + +namespace Decoder { +struct Vp9FrameDimensions { +    s16 width{}; +    s16 height{}; +    s16 luma_pitch{}; +    s16 chroma_pitch{}; +}; +static_assert(sizeof(Vp9FrameDimensions) == 0x8, "Vp9 Vp9FrameDimensions is an invalid size"); + +enum FrameFlags : u32 { +    IsKeyFrame = 1 << 0, +    LastFrameIsKeyFrame = 1 << 1, +    FrameSizeChanged = 1 << 2, +    ErrorResilientMode = 1 << 3, +    LastShowFrame = 1 << 4, +    IntraOnly = 1 << 5, +}; + +enum class MvJointType { +    MvJointZero = 0,   /* Zero vector */ +    MvJointHnzvz = 1,  /* Vert zero, hor nonzero */ +    MvJointHzvnz = 2,  /* Hor zero, vert nonzero */ +    MvJointHnzvnz = 3, /* Both components nonzero */ +}; +enum class MvClassType { +    MvClass0 = 0,   /* (0, 2]     integer pel */ +    MvClass1 = 1,   /* (2, 4]     integer pel */ +    MvClass2 = 2,   /* (4, 8]     integer pel */ +    MvClass3 = 3,   /* (8, 16]    integer pel */ +    MvClass4 = 4,   /* (16, 32]   integer pel */ +    MvClass5 = 5,   /* (32, 64]   integer pel */ +    MvClass6 = 6,   /* (64, 128]  integer pel */ +    MvClass7 = 7,   /* (128, 256] integer pel */ +    MvClass8 = 8,   /* (256, 512] integer pel */ +    MvClass9 = 9,   /* (512, 1024] integer pel */ +    MvClass10 = 10, /* (1024,2048] integer pel */ +}; + +enum class BlockSize { +    Block4x4 = 0, +    Block4x8 = 1, +    Block8x4 = 2, +    Block8x8 = 3, +    Block8x16 = 4, +    Block16x8 = 5, +    Block16x16 = 6, +    Block16x32 = 7, +    Block32x16 = 8, +    Block32x32 = 9, +    Block32x64 = 10, +    Block64x32 = 11, +    Block64x64 = 12, +    BlockSizes = 13, +    BlockInvalid = BlockSizes +}; + +enum class PredictionMode { +    DcPred = 0,   // Average of above and left pixels +    VPred = 1,    // Vertical +    
HPred = 2,    // Horizontal +    D45Pred = 3,  // Directional 45  deg = round(arctan(1 / 1) * 180 / pi) +    D135Pred = 4, // Directional 135 deg = 180 - 45 +    D117Pred = 5, // Directional 117 deg = 180 - 63 +    D153Pred = 6, // Directional 153 deg = 180 - 27 +    D207Pred = 7, // Directional 207 deg = 180 + 27 +    D63Pred = 8,  // Directional 63  deg = round(arctan(2 / 1) * 180 / pi) +    TmPred = 9,   // True-motion +    NearestMv = 10, +    NearMv = 11, +    ZeroMv = 12, +    NewMv = 13, +    MbModeCount = 14 +}; + +enum class TxSize { +    Tx4x4 = 0,   // 4x4 transform +    Tx8x8 = 1,   // 8x8 transform +    Tx16x16 = 2, // 16x16 transform +    Tx32x32 = 3, // 32x32 transform +    TxSizes = 4 +}; + +enum class TxMode { +    Only4X4 = 0,      // Only 4x4 transform used +    Allow8X8 = 1,     // Allow block transform size up to 8x8 +    Allow16X16 = 2,   // Allow block transform size up to 16x16 +    Allow32X32 = 3,   // Allow block transform size up to 32x32 +    TxModeSelect = 4, // Transform specified for each block +    TxModes = 5 +}; + +enum class reference_mode { +    SingleReference = 0, +    CompoundReference = 1, +    ReferenceModeSelect = 2, +    ReferenceModes = 3 +}; + +struct Segmentation { +    u8 enabled{}; +    u8 update_map{}; +    u8 temporal_update{}; +    u8 abs_delta{}; +    std::array<u32, 8> feature_mask{}; +    std::array<std::array<s16, 4>, 8> feature_data{}; +}; +static_assert(sizeof(Segmentation) == 0x64, "Segmentation is an invalid size"); + +struct LoopFilter { +    u8 mode_ref_delta_enabled{}; +    std::array<s8, 4> ref_deltas{}; +    std::array<s8, 2> mode_deltas{}; +}; +static_assert(sizeof(LoopFilter) == 0x7, "LoopFilter is an invalid size"); + +struct Vp9EntropyProbs { +    std::array<u8, 36> y_mode_prob{}; +    std::array<u8, 64> partition_prob{}; +    std::array<u8, 2304> coef_probs{}; +    std::array<u8, 8> switchable_interp_prob{}; +    std::array<u8, 28> inter_mode_prob{}; +    std::array<u8, 4> intra_inter_prob{}; +    
std::array<u8, 5> comp_inter_prob{}; +    std::array<u8, 10> single_ref_prob{}; +    std::array<u8, 5> comp_ref_prob{}; +    std::array<u8, 6> tx_32x32_prob{}; +    std::array<u8, 4> tx_16x16_prob{}; +    std::array<u8, 2> tx_8x8_prob{}; +    std::array<u8, 3> skip_probs{}; +    std::array<u8, 3> joints{}; +    std::array<u8, 2> sign{}; +    std::array<u8, 20> classes{}; +    std::array<u8, 2> class_0{}; +    std::array<u8, 20> prob_bits{}; +    std::array<u8, 12> class_0_fr{}; +    std::array<u8, 6> fr{}; +    std::array<u8, 2> class_0_hp{}; +    std::array<u8, 2> high_precision{}; +}; +static_assert(sizeof(Vp9EntropyProbs) == 0x9F4, "Vp9EntropyProbs is an invalid size"); + +struct Vp9PictureInfo { +    bool is_key_frame{}; +    bool intra_only{}; +    bool last_frame_was_key{}; +    bool frame_size_changed{}; +    bool error_resilient_mode{}; +    bool last_frame_shown{}; +    bool show_frame{}; +    std::array<s8, 4> ref_frame_sign_bias{}; +    s32 base_q_index{}; +    s32 y_dc_delta_q{}; +    s32 uv_dc_delta_q{}; +    s32 uv_ac_delta_q{}; +    bool lossless{}; +    s32 transform_mode{}; +    bool allow_high_precision_mv{}; +    s32 interp_filter{}; +    s32 reference_mode{}; +    s8 comp_fixed_ref{}; +    std::array<s8, 2> comp_var_ref{}; +    s32 log2_tile_cols{}; +    s32 log2_tile_rows{}; +    bool segment_enabled{}; +    bool segment_map_update{}; +    bool segment_map_temporal_update{}; +    s32 segment_abs_delta{}; +    std::array<u32, 8> segment_feature_enable{}; +    std::array<std::array<s16, 4>, 8> segment_feature_data{}; +    bool mode_ref_delta_enabled{}; +    bool use_prev_in_find_mv_refs{}; +    std::array<s8, 4> ref_deltas{}; +    std::array<s8, 2> mode_deltas{}; +    Vp9EntropyProbs entropy{}; +    Vp9FrameDimensions frame_size{}; +    u8 first_level{}; +    u8 sharpness_level{}; +    u32 bitstream_size{}; +    std::array<u64, 4> frame_offsets{}; +    std::array<bool, 4> refresh_frame{}; +}; + +struct Vp9FrameContainer { +    Vp9PictureInfo 
info{}; +    std::vector<u8> bit_stream; +}; + +struct PictureInfo { +    INSERT_PADDING_WORDS(12); +    u32 bitstream_size{}; +    INSERT_PADDING_WORDS(5); +    Vp9FrameDimensions last_frame_size{}; +    Vp9FrameDimensions golden_frame_size{}; +    Vp9FrameDimensions alt_frame_size{}; +    Vp9FrameDimensions current_frame_size{}; +    u32 vp9_flags{}; +    std::array<s8, 4> ref_frame_sign_bias{}; +    u8 first_level{}; +    u8 sharpness_level{}; +    u8 base_q_index{}; +    u8 y_dc_delta_q{}; +    u8 uv_ac_delta_q{}; +    u8 uv_dc_delta_q{}; +    u8 lossless{}; +    u8 tx_mode{}; +    u8 allow_high_precision_mv{}; +    u8 interp_filter{}; +    u8 reference_mode{}; +    s8 comp_fixed_ref{}; +    std::array<s8, 2> comp_var_ref{}; +    u8 log2_tile_cols{}; +    u8 log2_tile_rows{}; +    Segmentation segmentation{}; +    LoopFilter loop_filter{}; +    INSERT_PADDING_BYTES(5); +    u32 surface_params{}; +    INSERT_PADDING_WORDS(3); + +    Vp9PictureInfo Convert() const { + +        return Vp9PictureInfo{ +            .is_key_frame = (vp9_flags & FrameFlags::IsKeyFrame) != 0, +            .intra_only = (vp9_flags & FrameFlags::IntraOnly) != 0, +            .last_frame_was_key = (vp9_flags & FrameFlags::LastFrameIsKeyFrame) != 0, +            .frame_size_changed = (vp9_flags & FrameFlags::FrameSizeChanged) != 0, +            .error_resilient_mode = (vp9_flags & FrameFlags::ErrorResilientMode) != 0, +            .last_frame_shown = (vp9_flags & FrameFlags::LastShowFrame) != 0, +            .ref_frame_sign_bias = ref_frame_sign_bias, +            .base_q_index = base_q_index, +            .y_dc_delta_q = y_dc_delta_q, +            .uv_dc_delta_q = uv_dc_delta_q, +            .uv_ac_delta_q = uv_ac_delta_q, +            .lossless = lossless != 0, +            .transform_mode = tx_mode, +            .allow_high_precision_mv = allow_high_precision_mv != 0, +            .interp_filter = interp_filter, +            .reference_mode = reference_mode, +            .comp_fixed_ref 
= comp_fixed_ref, +            .comp_var_ref = comp_var_ref, +            .log2_tile_cols = log2_tile_cols, +            .log2_tile_rows = log2_tile_rows, +            .segment_enabled = segmentation.enabled != 0, +            .segment_map_update = segmentation.update_map != 0, +            .segment_map_temporal_update = segmentation.temporal_update != 0, +            .segment_abs_delta = segmentation.abs_delta, +            .segment_feature_enable = segmentation.feature_mask, +            .segment_feature_data = segmentation.feature_data, +            .mode_ref_delta_enabled = loop_filter.mode_ref_delta_enabled != 0, +            .use_prev_in_find_mv_refs = !(vp9_flags == (FrameFlags::ErrorResilientMode)) && +                                        !(vp9_flags == (FrameFlags::FrameSizeChanged)) && +                                        !(vp9_flags == (FrameFlags::IntraOnly)) && +                                        (vp9_flags == (FrameFlags::LastShowFrame)) && +                                        !(vp9_flags == (FrameFlags::LastFrameIsKeyFrame)), +            .ref_deltas = loop_filter.ref_deltas, +            .mode_deltas = loop_filter.mode_deltas, +            .frame_size = current_frame_size, +            .first_level = first_level, +            .sharpness_level = sharpness_level, +            .bitstream_size = bitstream_size, +        }; +    } +}; +static_assert(sizeof(PictureInfo) == 0x100, "PictureInfo is an invalid size"); + +struct EntropyProbs { +    INSERT_PADDING_BYTES(1024); +    std::array<std::array<u8, 4>, 7> inter_mode_prob{}; +    std::array<u8, 4> intra_inter_prob{}; +    INSERT_PADDING_BYTES(80); +    std::array<std::array<u8, 1>, 2> tx_8x8_prob{}; +    std::array<std::array<u8, 2>, 2> tx_16x16_prob{}; +    std::array<std::array<u8, 3>, 2> tx_32x32_prob{}; +    std::array<u8, 4> y_mode_prob_e8{}; +    std::array<std::array<u8, 8>, 4> y_mode_prob_e0e7{}; +    INSERT_PADDING_BYTES(64); +    std::array<std::array<u8, 4>, 16> 
partition_prob{}; +    INSERT_PADDING_BYTES(10); +    std::array<std::array<u8, 2>, 4> switchable_interp_prob{}; +    std::array<u8, 5> comp_inter_prob{}; +    std::array<u8, 4> skip_probs{}; +    std::array<u8, 3> joints{}; +    std::array<u8, 2> sign{}; +    std::array<std::array<u8, 1>, 2> class_0{}; +    std::array<std::array<u8, 3>, 2> fr{}; +    std::array<u8, 2> class_0_hp{}; +    std::array<u8, 2> high_precision{}; +    std::array<std::array<u8, 10>, 2> classes{}; +    std::array<std::array<std::array<u8, 3>, 2>, 2> class_0_fr{}; +    std::array<std::array<u8, 10>, 2> pred_bits{}; +    std::array<std::array<u8, 2>, 5> single_ref_prob{}; +    std::array<u8, 5> comp_ref_prob{}; +    INSERT_PADDING_BYTES(17); +    std::array<std::array<std::array<std::array<std::array<std::array<u8, 4>, 6>, 6>, 2>, 2>, 4> +        coef_probs{}; + +    void Convert(Vp9EntropyProbs& fc) { +        std::memcpy(fc.inter_mode_prob.data(), inter_mode_prob.data(), fc.inter_mode_prob.size()); + +        std::memcpy(fc.intra_inter_prob.data(), intra_inter_prob.data(), +                    fc.intra_inter_prob.size()); + +        std::memcpy(fc.tx_8x8_prob.data(), tx_8x8_prob.data(), fc.tx_8x8_prob.size()); +        std::memcpy(fc.tx_16x16_prob.data(), tx_16x16_prob.data(), fc.tx_16x16_prob.size()); +        std::memcpy(fc.tx_32x32_prob.data(), tx_32x32_prob.data(), fc.tx_32x32_prob.size()); + +        for (s32 i = 0; i < 4; i++) { +            for (s32 j = 0; j < 9; j++) { +                fc.y_mode_prob[j + 9 * i] = j < 8 ? 
y_mode_prob_e0e7[i][j] : y_mode_prob_e8[i]; +            } +        } + +        std::memcpy(fc.partition_prob.data(), partition_prob.data(), fc.partition_prob.size()); + +        std::memcpy(fc.switchable_interp_prob.data(), switchable_interp_prob.data(), +                    fc.switchable_interp_prob.size()); +        std::memcpy(fc.comp_inter_prob.data(), comp_inter_prob.data(), fc.comp_inter_prob.size()); +        std::memcpy(fc.skip_probs.data(), skip_probs.data(), fc.skip_probs.size()); + +        std::memcpy(fc.joints.data(), joints.data(), fc.joints.size()); + +        std::memcpy(fc.sign.data(), sign.data(), fc.sign.size()); +        std::memcpy(fc.class_0.data(), class_0.data(), fc.class_0.size()); +        std::memcpy(fc.fr.data(), fr.data(), fc.fr.size()); +        std::memcpy(fc.class_0_hp.data(), class_0_hp.data(), fc.class_0_hp.size()); +        std::memcpy(fc.high_precision.data(), high_precision.data(), fc.high_precision.size()); +        std::memcpy(fc.classes.data(), classes.data(), fc.classes.size()); +        std::memcpy(fc.class_0_fr.data(), class_0_fr.data(), fc.class_0_fr.size()); +        std::memcpy(fc.prob_bits.data(), pred_bits.data(), fc.prob_bits.size()); +        std::memcpy(fc.single_ref_prob.data(), single_ref_prob.data(), fc.single_ref_prob.size()); +        std::memcpy(fc.comp_ref_prob.data(), comp_ref_prob.data(), fc.comp_ref_prob.size()); + +        std::memcpy(fc.coef_probs.data(), coef_probs.data(), fc.coef_probs.size()); +    } +}; +static_assert(sizeof(EntropyProbs) == 0xEA0, "EntropyProbs is an invalid size"); + +enum class Ref { Last, Golden, AltRef }; + +struct RefPoolElement { +    s64 frame{}; +    Ref ref{}; +    bool refresh{}; +}; + +struct FrameContexts { +    s64 from{}; +    bool adapted{}; +    Vp9EntropyProbs probs{}; +}; + +}; // namespace Decoder +}; // namespace Tegra diff --git a/src/video_core/command_classes/host1x.cpp b/src/video_core/command_classes/host1x.cpp new file mode 100644 index 
000000000..a5234ee47 --- /dev/null +++ b/src/video_core/command_classes/host1x.cpp @@ -0,0 +1,39 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "common/assert.h" +#include "video_core/command_classes/host1x.h" +#include "video_core/gpu.h" + +Tegra::Host1x::Host1x(GPU& gpu_) : gpu(gpu_) {} + +Tegra::Host1x::~Host1x() = default; + +void Tegra::Host1x::StateWrite(u32 offset, u32 arguments) { +    u8* const state_offset = reinterpret_cast<u8*>(&state) + offset * sizeof(u32); +    std::memcpy(state_offset, &arguments, sizeof(u32)); +} + +void Tegra::Host1x::ProcessMethod(Host1x::Method method, const std::vector<u32>& arguments) { +    StateWrite(static_cast<u32>(method), arguments[0]); +    switch (method) { +    case Method::WaitSyncpt: +        Execute(arguments[0]); +        break; +    case Method::LoadSyncptPayload32: +        syncpoint_value = arguments[0]; +        break; +    case Method::WaitSyncpt32: +        Execute(arguments[0]); +        break; +    default: +        UNIMPLEMENTED_MSG("Host1x method 0x{:X}", static_cast<u32>(method)); +        break; +    } +} + +void Tegra::Host1x::Execute(u32 data) { +    // This method waits on a valid syncpoint. +    // TODO: Implement when proper Async is in place +} diff --git a/src/video_core/command_classes/host1x.h b/src/video_core/command_classes/host1x.h new file mode 100644 index 000000000..501a5ed2e --- /dev/null +++ b/src/video_core/command_classes/host1x.h @@ -0,0 +1,78 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include <vector> +#include "common/common_funcs.h" +#include "common/common_types.h" + +namespace Tegra { +class GPU; +class Nvdec; + +class Host1x { +public: +    struct Host1xClassRegisters { +        u32 incr_syncpt{}; +        u32 incr_syncpt_ctrl{}; +        u32 incr_syncpt_error{}; +        INSERT_PADDING_WORDS(5); +        u32 wait_syncpt{}; +        u32 wait_syncpt_base{}; +        u32 wait_syncpt_incr{}; +        u32 load_syncpt_base{}; +        u32 incr_syncpt_base{}; +        u32 clear{}; +        u32 wait{}; +        u32 wait_with_interrupt{}; +        u32 delay_use{}; +        u32 tick_count_high{}; +        u32 tick_count_low{}; +        u32 tick_ctrl{}; +        INSERT_PADDING_WORDS(23); +        u32 ind_ctrl{}; +        u32 ind_off2{}; +        u32 ind_off{}; +        std::array<u32, 31> ind_data{}; +        INSERT_PADDING_WORDS(1); +        u32 load_syncpoint_payload32{}; +        u32 stall_ctrl{}; +        u32 wait_syncpt32{}; +        u32 wait_syncpt_base32{}; +        u32 load_syncpt_base32{}; +        u32 incr_syncpt_base32{}; +        u32 stall_count_high{}; +        u32 stall_count_low{}; +        u32 xref_ctrl{}; +        u32 channel_xref_high{}; +        u32 channel_xref_low{}; +    }; +    static_assert(sizeof(Host1xClassRegisters) == 0x164, "Host1xClassRegisters is an invalid size"); + +    enum class Method : u32 { +        WaitSyncpt = offsetof(Host1xClassRegisters, wait_syncpt) / 4, +        LoadSyncptPayload32 = offsetof(Host1xClassRegisters, load_syncpoint_payload32) / 4, +        WaitSyncpt32 = offsetof(Host1xClassRegisters, wait_syncpt32) / 4, +    }; + +    explicit Host1x(GPU& gpu); +    ~Host1x(); + +    /// Writes the method into the state, Invoke Execute() if encountered +    void ProcessMethod(Host1x::Method method, const std::vector<u32>& arguments); + +private: +    /// For Host1x, execute is waiting on a syncpoint previously written into the state +    void Execute(u32 data); + +    /// Write argument 
into the provided offset +    void StateWrite(u32 offset, u32 arguments); + +    u32 syncpoint_value{}; +    Host1xClassRegisters state{}; +    GPU& gpu; +}; + +} // namespace Tegra diff --git a/src/video_core/command_classes/nvdec.cpp b/src/video_core/command_classes/nvdec.cpp new file mode 100644 index 000000000..ede9466eb --- /dev/null +++ b/src/video_core/command_classes/nvdec.cpp @@ -0,0 +1,56 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <bitset> +#include "common/assert.h" +#include "common/bit_util.h" +#include "core/memory.h" +#include "video_core/command_classes/nvdec.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" + +namespace Tegra { + +Nvdec::Nvdec(GPU& gpu_) : gpu(gpu_), codec(std::make_unique<Codec>(gpu)) {} + +Nvdec::~Nvdec() = default; + +void Nvdec::ProcessMethod(Nvdec::Method method, const std::vector<u32>& arguments) { +    if (method == Method::SetVideoCodec) { +        codec->StateWrite(static_cast<u32>(method), arguments[0]); +    } else { +        codec->StateWrite(static_cast<u32>(method), static_cast<u64>(arguments[0]) << 8); +    } + +    switch (method) { +    case Method::SetVideoCodec: +        codec->SetTargetCodec(static_cast<NvdecCommon::VideoCodec>(arguments[0])); +        break; +    case Method::Execute: +        Execute(); +        break; +    } +} + +AVFrame* Nvdec::GetFrame() { +    return codec->GetCurrentFrame(); +} + +const AVFrame* Nvdec::GetFrame() const { +    return codec->GetCurrentFrame(); +} + +void Nvdec::Execute() { +    switch (codec->GetCurrentCodec()) { +    case NvdecCommon::VideoCodec::H264: +    case NvdecCommon::VideoCodec::Vp9: +        codec->Decode(); +        break; +    default: +        UNIMPLEMENTED_MSG("Unknown codec {}", static_cast<u32>(codec->GetCurrentCodec())); +        break; +    } +} + +} // namespace Tegra diff --git a/src/video_core/command_classes/nvdec.h 
b/src/video_core/command_classes/nvdec.h new file mode 100644 index 000000000..c1a9d843e --- /dev/null +++ b/src/video_core/command_classes/nvdec.h @@ -0,0 +1,39 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <vector> +#include "common/common_funcs.h" +#include "common/common_types.h" +#include "video_core/command_classes/codecs/codec.h" + +namespace Tegra { +class GPU; + +class Nvdec { +public: +    enum class Method : u32 { +        SetVideoCodec = 0x80, +        Execute = 0xc0, +    }; + +    explicit Nvdec(GPU& gpu); +    ~Nvdec(); + +    /// Writes the method into the state, Invoke Execute() if encountered +    void ProcessMethod(Nvdec::Method method, const std::vector<u32>& arguments); + +    /// Return most recently decoded frame +    AVFrame* GetFrame(); +    const AVFrame* GetFrame() const; + +private: +    /// Invoke codec to decode a frame +    void Execute(); + +    GPU& gpu; +    std::unique_ptr<Tegra::Codec> codec; +}; +} // namespace Tegra diff --git a/src/video_core/command_classes/nvdec_common.h b/src/video_core/command_classes/nvdec_common.h new file mode 100644 index 000000000..01b5e086d --- /dev/null +++ b/src/video_core/command_classes/nvdec_common.h @@ -0,0 +1,48 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include "common/common_funcs.h" +#include "common/common_types.h" + +namespace Tegra::NvdecCommon { + +struct NvdecRegisters { +    INSERT_PADDING_WORDS(256); +    u64 set_codec_id{}; +    INSERT_PADDING_WORDS(254); +    u64 set_platform_id{}; +    u64 picture_info_offset{}; +    u64 frame_bitstream_offset{}; +    u64 frame_number{}; +    u64 h264_slice_data_offsets{}; +    u64 h264_mv_dump_offset{}; +    INSERT_PADDING_WORDS(6); +    u64 frame_stats_offset{}; +    u64 h264_last_surface_luma_offset{}; +    u64 h264_last_surface_chroma_offset{}; +    std::array<u64, 17> surface_luma_offset{}; +    std::array<u64, 17> surface_chroma_offset{}; +    INSERT_PADDING_WORDS(132); +    u64 vp9_entropy_probs_offset{}; +    u64 vp9_backward_updates_offset{}; +    u64 vp9_last_frame_segmap_offset{}; +    u64 vp9_curr_frame_segmap_offset{}; +    INSERT_PADDING_WORDS(2); +    u64 vp9_last_frame_mvs_offset{}; +    u64 vp9_curr_frame_mvs_offset{}; +    INSERT_PADDING_WORDS(2); +}; +static_assert(sizeof(NvdecRegisters) == (0xBC0), "NvdecRegisters is incorrect size"); + +enum class VideoCodec : u32 { +    None = 0x0, +    H264 = 0x3, +    Vp8 = 0x5, +    H265 = 0x7, +    Vp9 = 0x9, +}; + +} // namespace Tegra::NvdecCommon diff --git a/src/video_core/command_classes/sync_manager.cpp b/src/video_core/command_classes/sync_manager.cpp new file mode 100644 index 000000000..a0ab44855 --- /dev/null +++ b/src/video_core/command_classes/sync_manager.cpp @@ -0,0 +1,60 @@ +// MIT License +// +// Copyright (c) Ryujinx Team and Contributors +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +// associated documentation files (the "Software"), to deal in the Software without restriction, +// including without limitation the rights to use, copy, modify, merge, publish, distribute, +// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the 
following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// + +#include <algorithm> +#include "sync_manager.h" +#include "video_core/gpu.h" + +namespace Tegra { +SyncptIncrManager::SyncptIncrManager(GPU& gpu_) : gpu(gpu_) {} +SyncptIncrManager::~SyncptIncrManager() = default; + +void SyncptIncrManager::Increment(u32 id) { +    increments.push_back(SyncptIncr{0, id, true}); +    IncrementAllDone(); +} + +u32 SyncptIncrManager::IncrementWhenDone(u32 class_id, u32 id) { +    const u32 handle = current_id++; +    increments.push_back(SyncptIncr{handle, class_id, id}); +    return handle; +} + +void SyncptIncrManager::SignalDone(u32 handle) { +    auto done_incr = std::find_if(increments.begin(), increments.end(), +                                  [handle](SyncptIncr incr) { return incr.id == handle; }); +    if (done_incr != increments.end()) { +        const SyncptIncr incr = *done_incr; +        *done_incr = SyncptIncr{incr.id, incr.class_id, incr.syncpt_id, true}; +    } +    IncrementAllDone(); +} + +void SyncptIncrManager::IncrementAllDone() { +    std::size_t done_count = 0; +    for (; done_count < increments.size(); ++done_count) { +        if (!increments[done_count].complete) { +            break; +        } +        gpu.IncrementSyncPoint(increments[done_count].syncpt_id); +    } +    increments.erase(increments.begin(), increments.begin() + done_count); +} +} // namespace Tegra 
diff --git a/src/video_core/command_classes/sync_manager.h b/src/video_core/command_classes/sync_manager.h new file mode 100644 index 000000000..353b67573 --- /dev/null +++ b/src/video_core/command_classes/sync_manager.h @@ -0,0 +1,64 @@ +// MIT License +// +// Copyright (c) Ryujinx Team and Contributors +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +// associated documentation files (the "Software"), to deal in the Software without restriction, +// including without limitation the rights to use, copy, modify, merge, publish, distribute, +// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+//
+
+#pragma once
+
+#include <mutex>
+#include <vector>
+#include "common/common_types.h"
+
+namespace Tegra {
+class GPU;
+struct SyncptIncr {
+    u32 id;
+    u32 class_id;
+    u32 syncpt_id;
+    bool complete;
+
+    SyncptIncr(u32 id, u32 syncpt_id_, u32 class_id_, bool done = false)
+        : id(id), class_id(class_id_), syncpt_id(syncpt_id_), complete(done) {}
+};
+
+class SyncptIncrManager {
+public:
+    explicit SyncptIncrManager(GPU& gpu);
+    ~SyncptIncrManager();
+
+    /// Add syncpoint id and increment all
+    void Increment(u32 id);
+
+    /// Returns a handle to increment later
+    u32 IncrementWhenDone(u32 class_id, u32 id);
+
+    /// IncrementAllDone, including handle
+    void SignalDone(u32 handle);
+
+    /// Increment all sequential pending increments that are already done.
+    void IncrementAllDone();
+
+private:
+    std::vector<SyncptIncr> increments;
+    std::mutex increment_lock;
+    u32 current_id{};
+
+    GPU& gpu;
+};
+
+} // namespace Tegra
diff --git a/src/video_core/command_classes/vic.cpp b/src/video_core/command_classes/vic.cpp
new file mode 100644
index 000000000..66e15a1a8
--- /dev/null
+++ b/src/video_core/command_classes/vic.cpp
@@ -0,0 +1,211 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <array>
+#include <cstring>
+#include <memory>
+#include <utility>
+#include <vector>
+#include "common/assert.h"
+#include "video_core/command_classes/nvdec.h"
+#include "video_core/command_classes/vic.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/gpu.h"
+#include "video_core/memory_manager.h"
+#include "video_core/texture_cache/surface_params.h"
+
+extern "C" {
+#include <libavutil/mem.h>
+#include <libswscale/swscale.h>
+}
+
+namespace Tegra {
+
+Vic::Vic(GPU& gpu_, std::shared_ptr<Nvdec> nvdec_processor_)
+    : gpu(gpu_), nvdec_processor(std::move(nvdec_processor_)) {}
+
+Vic::~Vic() {
+    // The scaler context is held through a raw pointer, so release it here.
+    // sws_freeContext is a no-op for a null pointer.
+    sws_freeContext(scaler_ctx);
+}
+
+/// Stores one 32-bit argument in the register mirror at word index `offset`.
+void Vic::VicStateWrite(u32 offset, u32 arguments) {
+    // Ignore indices past the end of vic_state instead of writing out of bounds.
+    if (offset >= sizeof(VicRegisters) / sizeof(u32)) {
+        LOG_ERROR(HW_GPU, "Vic register write out of range, offset 0x{:X}", offset);
+        return;
+    }
+    u8* const state_offset = reinterpret_cast<u8*>(&vic_state) + offset * sizeof(u32);
+    std::memcpy(state_offset, &arguments, sizeof(u32));
+}
+
+/// Mirrors the first argument into the register state, then either starts execution or
+/// latches an address. Address arguments are shifted left by 8 bits before being stored.
+void Vic::ProcessMethod(Vic::Method method, const std::vector<u32>& arguments) {
+    LOG_DEBUG(HW_GPU, "Vic method 0x{:X}", static_cast<u32>(method));
+    if (arguments.empty()) {
+        LOG_ERROR(HW_GPU, "Vic method 0x{:X} has no arguments", static_cast<u32>(method));
+        return;
+    }
+    VicStateWrite(static_cast<u32>(method), arguments[0]);
+    const u64 arg = static_cast<u64>(arguments[0]) << 8;
+    switch (method) {
+    case Method::Execute:
+        Execute();
+        break;
+    case Method::SetConfigStructOffset:
+        config_struct_address = arg;
+        break;
+    case Method::SetOutputSurfaceLumaOffset:
+        output_surface_luma_address = arg;
+        break;
+    case Method::SetOutputSurfaceChromaUOffset:
+        output_surface_chroma_u_address = arg;
+        break;
+    case Method::SetOutputSurfaceChromaVOffset:
+        output_surface_chroma_v_address = arg;
+        break;
+    default:
+        break;
+    }
+}
+
+/// Takes the current frame from the NVDEC processor, converts it to the pixel format named in
+/// the config struct and writes the result to the latched output surface address(es).
+void Vic::Execute() {
+    if (output_surface_luma_address == 0) {
+        LOG_ERROR(Service_NVDRV, "VIC Luma address not set. Recieved 0x{:X}",
+                  vic_state.output_surface.luma_offset);
+        return;
+    }
+    const VicConfig config{gpu.MemoryManager().Read<u64>(config_struct_address + 0x20)};
+    const VideoPixelFormat pixel_format =
+        static_cast<VideoPixelFormat>(config.pixel_format.Value());
+    switch (pixel_format) {
+    case VideoPixelFormat::BGRA8:
+    case VideoPixelFormat::RGBA8: {
+        LOG_TRACE(Service_NVDRV, "Writing RGB Frame");
+        const auto* frame = nvdec_processor->GetFrame();
+
+        if (!frame || frame->width == 0 || frame->height == 0) {
+            return;
+        }
+        const AVPixelFormat target_format =
+            (pixel_format == VideoPixelFormat::RGBA8) ? AV_PIX_FMT_RGBA : AV_PIX_FMT_BGRA;
+
+        // FFmpeg returns all frames in YUV420, convert it into expected format.
+        // sws_getCachedContext keeps the context while size and formats are unchanged and
+        // rebuilds it otherwise, so a switch between RGBA8 and BGRA8 is honoured as well.
+        scaler_ctx = sws_getCachedContext(scaler_ctx, frame->width, frame->height,
+                                          AV_PIX_FMT_YUV420P, frame->width, frame->height,
+                                          target_format, 0, nullptr, nullptr, nullptr);
+        if (scaler_ctx == nullptr) {
+            LOG_ERROR(Service_NVDRV, "Failed to create VIC scaler context");
+            return;
+        }
+        scaler_width = frame->width;
+        scaler_height = frame->height;
+
+        // Get Converted frame
+        const std::size_t linear_size =
+            static_cast<std::size_t>(frame->width) * static_cast<std::size_t>(frame->height) * 4;
+
+        using AVMallocPtr = std::unique_ptr<u8, decltype(&av_free)>;
+        AVMallocPtr converted_frame_buffer{static_cast<u8*>(av_malloc(linear_size)), av_free};
+        if (!converted_frame_buffer) {
+            LOG_ERROR(Service_NVDRV, "Failed to allocate VIC conversion buffer");
+            return;
+        }
+
+        const int converted_stride{frame->width * 4};
+        u8* const converted_frame_buf_addr{converted_frame_buffer.get()};
+
+        sws_scale(scaler_ctx, frame->data, frame->linesize, 0, frame->height,
+                  &converted_frame_buf_addr, &converted_stride);
+
+        const u32 blk_kind = static_cast<u32>(config.block_linear_kind);
+        if (blk_kind != 0) {
+            // swizzle pitch linear to block linear
+            const u32 block_height = static_cast<u32>(config.block_linear_height_log2);
+            const auto size = Tegra::Texture::CalculateSize(true, 4, frame->width, frame->height, 1,
+                                                            block_height, 0);
+            std::vector<u8> swizzled_data(size);
+            Tegra::Texture::CopySwizzledData(frame->width, frame->height, 1, 4, 4,
+                                             swizzled_data.data(), converted_frame_buffer.get(),
+                                             false, block_height, 0, 1);
+
+            gpu.MemoryManager().WriteBlock(output_surface_luma_address, swizzled_data.data(), size);
+            gpu.Maxwell3D().OnMemoryWrite();
+        } else {
+            // send pitch linear frame
+            gpu.MemoryManager().WriteBlock(output_surface_luma_address, converted_frame_buf_addr,
+                                           linear_size);
+            gpu.Maxwell3D().OnMemoryWrite();
+        }
+        break;
+    }
+    case VideoPixelFormat::Yuv420: {
+        LOG_TRACE(Service_NVDRV, "Writing YUV420 Frame");
+
+        const auto* frame = nvdec_processor->GetFrame();
+
+        if (!frame || frame->width == 0 || frame->height == 0) {
+            return;
+        }
+
+        const std::size_t surface_width = config.surface_width_minus1 + 1;
+        const std::size_t surface_height = config.surface_height_minus1 + 1;
+        const std::size_t aligned_width = (surface_width + 0xff) & ~0xff;
+
+        // Never read more than the decoded frame provides, even if the surface is larger.
+        const std::size_t copy_width =
+            std::min(surface_width, static_cast<std::size_t>(frame->width));
+        const std::size_t copy_height =
+            std::min(surface_height, static_cast<std::size_t>(frame->height));
+        const std::size_t half_width = copy_width / 2;
+        const std::size_t half_height = copy_height / 2;
+
+        const auto* luma_ptr = frame->data[0];
+        const auto* chroma_b_ptr = frame->data[1];
+        const auto* chroma_r_ptr = frame->data[2];
+        const auto stride = frame->linesize[0];
+        const auto half_stride = frame->linesize[1];
+
+        std::vector<u8> luma_buffer(aligned_width * surface_height);
+        std::vector<u8> chroma_buffer(aligned_width * (surface_height / 2));
+
+        // Populate luma buffer row by row; source and destination pitches differ.
+        for (std::size_t y = 0; y < copy_height; ++y) {
+            const std::size_t src = y * stride;
+            const std::size_t dst = y * aligned_width;
+            std::memcpy(luma_buffer.data() + dst, luma_ptr + src, copy_width);
+        }
+        gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(),
+                                       luma_buffer.size());
+
+        // Populate chroma buffer from both channels with interleaving.
+        for (std::size_t y = 0; y < half_height; ++y) {
+            const std::size_t src = y * half_stride;
+            const std::size_t dst = y * aligned_width;
+
+            for (std::size_t x = 0; x < half_width; ++x) {
+                chroma_buffer[dst + x * 2] = chroma_b_ptr[src + x];
+                chroma_buffer[dst + x * 2 + 1] = chroma_r_ptr[src + x];
+            }
+        }
+        gpu.MemoryManager().WriteBlock(output_surface_chroma_u_address, chroma_buffer.data(),
+                                       chroma_buffer.size());
+        gpu.Maxwell3D().OnMemoryWrite();
+        break;
+    }
+    default:
+        UNIMPLEMENTED_MSG("Unknown video pixel format {}", config.pixel_format.Value());
+        break;
+    }
+}
+
+} // namespace Tegra
diff --git a/src/video_core/command_classes/vic.h b/src/video_core/command_classes/vic.h
new file mode 100644
index 000000000..dd0a2aed8
--- /dev/null
+++ b/src/video_core/command_classes/vic.h
@@ -0,0 +1,110 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+ +#pragma once + +#include <memory> +#include <vector> +#include "common/bit_field.h" +#include "common/common_types.h" + +struct SwsContext; + +namespace Tegra { +class GPU; +class Nvdec; + +struct PlaneOffsets { +    u32 luma_offset{}; +    u32 chroma_u_offset{}; +    u32 chroma_v_offset{}; +}; + +struct VicRegisters { +    INSERT_PADDING_WORDS(64); +    u32 nop{}; +    INSERT_PADDING_WORDS(15); +    u32 pm_trigger{}; +    INSERT_PADDING_WORDS(47); +    u32 set_application_id{}; +    u32 set_watchdog_timer{}; +    INSERT_PADDING_WORDS(17); +    u32 context_save_area{}; +    u32 context_switch{}; +    INSERT_PADDING_WORDS(43); +    u32 execute{}; +    INSERT_PADDING_WORDS(63); +    std::array<std::array<PlaneOffsets, 8>, 8> surfacex_slots{}; +    u32 picture_index{}; +    u32 control_params{}; +    u32 config_struct_offset{}; +    u32 filter_struct_offset{}; +    u32 palette_offset{}; +    u32 hist_offset{}; +    u32 context_id{}; +    u32 fce_ucode_size{}; +    PlaneOffsets output_surface{}; +    u32 fce_ucode_offset{}; +    INSERT_PADDING_WORDS(4); +    std::array<u32, 8> slot_context_id{}; +    INSERT_PADDING_WORDS(16); +}; +static_assert(sizeof(VicRegisters) == 0x7A0, "VicRegisters is an invalid size"); + +class Vic { +public: +    enum class Method : u32 { +        Execute = 0xc0, +        SetControlParams = 0x1c1, +        SetConfigStructOffset = 0x1c2, +        SetOutputSurfaceLumaOffset = 0x1c8, +        SetOutputSurfaceChromaUOffset = 0x1c9, +        SetOutputSurfaceChromaVOffset = 0x1ca +    }; + +    explicit Vic(GPU& gpu, std::shared_ptr<Tegra::Nvdec> nvdec_processor); +    ~Vic(); + +    /// Write to the device state. 
+    void ProcessMethod(Vic::Method method, const std::vector<u32>& arguments); + +private: +    void Execute(); + +    void VicStateWrite(u32 offset, u32 arguments); +    VicRegisters vic_state{}; + +    enum class VideoPixelFormat : u64_le { +        RGBA8 = 0x1f, +        BGRA8 = 0x20, +        Yuv420 = 0x44, +    }; + +    union VicConfig { +        u64_le raw{}; +        BitField<0, 7, u64_le> pixel_format; +        BitField<7, 2, u64_le> chroma_loc_horiz; +        BitField<9, 2, u64_le> chroma_loc_vert; +        BitField<11, 4, u64_le> block_linear_kind; +        BitField<15, 4, u64_le> block_linear_height_log2; +        BitField<19, 3, u64_le> reserved0; +        BitField<22, 10, u64_le> reserved1; +        BitField<32, 14, u64_le> surface_width_minus1; +        BitField<46, 14, u64_le> surface_height_minus1; +    }; + +    GPU& gpu; +    std::shared_ptr<Tegra::Nvdec> nvdec_processor; + +    GPUVAddr config_struct_address{}; +    GPUVAddr output_surface_luma_address{}; +    GPUVAddr output_surface_chroma_u_address{}; +    GPUVAddr output_surface_chroma_v_address{}; + +    SwsContext* scaler_ctx{}; +    s32 scaler_width{}; +    s32 scaler_height{}; +}; + +} // namespace Tegra diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 4bb9256e9..171f78183 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -27,9 +27,10 @@ namespace Tegra {  MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192)); -GPU::GPU(Core::System& system_, bool is_async_) +GPU::GPU(Core::System& system_, bool is_async_, bool use_nvdec_)      : system{system_}, memory_manager{std::make_unique<Tegra::MemoryManager>(system)},        dma_pusher{std::make_unique<Tegra::DmaPusher>(system, *this)}, +      cdma_pusher{std::make_unique<Tegra::CDmaPusher>(*this)}, use_nvdec{use_nvdec_},        maxwell_3d{std::make_unique<Engines::Maxwell3D>(system, *memory_manager)},        fermi_2d{std::make_unique<Engines::Fermi2D>()},        
kepler_compute{std::make_unique<Engines::KeplerCompute>(system, *memory_manager)}, @@ -77,10 +78,18 @@ DmaPusher& GPU::DmaPusher() {      return *dma_pusher;  } +Tegra::CDmaPusher& GPU::CDmaPusher() { +    return *cdma_pusher; +} +  const DmaPusher& GPU::DmaPusher() const {      return *dma_pusher;  } +const Tegra::CDmaPusher& GPU::CDmaPusher() const { +    return *cdma_pusher; +} +  void GPU::WaitFence(u32 syncpoint_id, u32 value) {      // Synced GPU, is always in sync      if (!is_async) { diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index 2d15d1c6f..b8c613b11 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -13,6 +13,7 @@  #include "common/common_types.h"  #include "core/hle/service/nvdrv/nvdata.h"  #include "core/hle/service/nvflinger/buffer_queue.h" +#include "video_core/cdma_pusher.h"  #include "video_core/dma_pusher.h"  using CacheAddr = std::uintptr_t; @@ -157,7 +158,7 @@ public:                method_count(method_count) {}      }; -    explicit GPU(Core::System& system, bool is_async); +    explicit GPU(Core::System& system, bool is_async, bool use_nvdec);      virtual ~GPU();      /// Binds a renderer to the GPU. @@ -209,6 +210,15 @@ public:      /// Returns a reference to the GPU DMA pusher.      Tegra::DmaPusher& DmaPusher(); +    /// Returns a const reference to the GPU DMA pusher. +    const Tegra::DmaPusher& DmaPusher() const; + +    /// Returns a reference to the GPU CDMA pusher. +    Tegra::CDmaPusher& CDmaPusher(); + +    /// Returns a const reference to the GPU CDMA pusher. +    const Tegra::CDmaPusher& CDmaPusher() const; +      VideoCore::RendererBase& Renderer() {          return *renderer;      } @@ -249,8 +259,9 @@ public:          return is_async;      } -    /// Returns a const reference to the GPU DMA pusher. 
-    const Tegra::DmaPusher& DmaPusher() const; +    bool UseNvdec() const { +        return use_nvdec; +    }      struct Regs {          static constexpr size_t NUM_REGS = 0x40; @@ -311,6 +322,9 @@ public:      /// Push GPU command entries to be processed      virtual void PushGPUEntries(Tegra::CommandList&& entries) = 0; +    /// Push GPU command buffer entries to be processed +    virtual void PushCommandBuffer(Tegra::ChCommandHeaderList& entries) = 0; +      /// Swap buffers (render frame)      virtual void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) = 0; @@ -349,7 +363,9 @@ protected:      Core::System& system;      std::unique_ptr<Tegra::MemoryManager> memory_manager;      std::unique_ptr<Tegra::DmaPusher> dma_pusher; +    std::unique_ptr<Tegra::CDmaPusher> cdma_pusher;      std::unique_ptr<VideoCore::RendererBase> renderer; +    const bool use_nvdec;  private:      /// Mapping of command subchannels to their bound engine ids @@ -372,6 +388,7 @@ private:      std::array<std::list<u32>, Service::Nvidia::MaxSyncPoints> syncpt_interrupts;      std::mutex sync_mutex; +    std::mutex device_mutex;      std::condition_variable sync_cv; diff --git a/src/video_core/gpu_asynch.cpp b/src/video_core/gpu_asynch.cpp index 70a3d5738..a9baaf7ef 100644 --- a/src/video_core/gpu_asynch.cpp +++ b/src/video_core/gpu_asynch.cpp @@ -10,12 +10,13 @@  namespace VideoCommon { -GPUAsynch::GPUAsynch(Core::System& system) : GPU{system, true}, gpu_thread{system} {} +GPUAsynch::GPUAsynch(Core::System& system, bool use_nvdec) +    : GPU{system, true, use_nvdec}, gpu_thread{system} {}  GPUAsynch::~GPUAsynch() = default;  void GPUAsynch::Start() { -    gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher); +    gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher, *cdma_pusher);      cpu_context = renderer->GetRenderWindow().CreateSharedContext();      cpu_context->MakeCurrent();  } @@ -32,6 +33,27 @@ void 
GPUAsynch::PushGPUEntries(Tegra::CommandList&& entries) {      gpu_thread.SubmitList(std::move(entries));  } +void GPUAsynch::PushCommandBuffer(Tegra::ChCommandHeaderList& entries) { +    if (!use_nvdec) { +        return; +    } +    // This condition fires when a video stream ends, clear all intermediary data +    if (entries[0].raw == 0xDEADB33F) { +        cdma_pusher.reset(); +        return; +    } +    if (!cdma_pusher) { +        cdma_pusher = std::make_unique<Tegra::CDmaPusher>(*this); +    } + +    // SubmitCommandBuffer would make the nvdec operations async, this is not currently working +    // TODO(ameerj): RE proper async nvdec operation +    // gpu_thread.SubmitCommandBuffer(std::move(entries)); + +    cdma_pusher->Push(std::move(entries)); +    cdma_pusher->DispatchCalls(); +} +  void GPUAsynch::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {      gpu_thread.SwapBuffers(framebuffer);  } diff --git a/src/video_core/gpu_asynch.h b/src/video_core/gpu_asynch.h index f89c855a5..0c0872e73 100644 --- a/src/video_core/gpu_asynch.h +++ b/src/video_core/gpu_asynch.h @@ -20,13 +20,14 @@ namespace VideoCommon {  /// Implementation of GPU interface that runs the GPU asynchronously  class GPUAsynch final : public Tegra::GPU {  public: -    explicit GPUAsynch(Core::System& system); +    explicit GPUAsynch(Core::System& system, bool use_nvdec);      ~GPUAsynch() override;      void Start() override;      void ObtainContext() override;      void ReleaseContext() override;      void PushGPUEntries(Tegra::CommandList&& entries) override; +    void PushCommandBuffer(Tegra::ChCommandHeaderList& entries) override;      void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;      void FlushRegion(VAddr addr, u64 size) override;      void InvalidateRegion(VAddr addr, u64 size) override; diff --git a/src/video_core/gpu_synch.cpp b/src/video_core/gpu_synch.cpp index 1ca47ddef..ecf7bbdf3 100644 --- a/src/video_core/gpu_synch.cpp +++ 
b/src/video_core/gpu_synch.cpp @@ -7,7 +7,7 @@  namespace VideoCommon { -GPUSynch::GPUSynch(Core::System& system) : GPU{system, false} {} +GPUSynch::GPUSynch(Core::System& system, bool use_nvdec) : GPU{system, false, use_nvdec} {}  GPUSynch::~GPUSynch() = default; @@ -26,6 +26,22 @@ void GPUSynch::PushGPUEntries(Tegra::CommandList&& entries) {      dma_pusher->DispatchCalls();  } +void GPUSynch::PushCommandBuffer(Tegra::ChCommandHeaderList& entries) { +    if (!use_nvdec) { +        return; +    } +    // This condition fires when a video stream ends, clears all intermediary data +    if (entries[0].raw == 0xDEADB33F) { +        cdma_pusher.reset(); +        return; +    } +    if (!cdma_pusher) { +        cdma_pusher = std::make_unique<Tegra::CDmaPusher>(*this); +    } +    cdma_pusher->Push(std::move(entries)); +    cdma_pusher->DispatchCalls(); +} +  void GPUSynch::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {      renderer->SwapBuffers(framebuffer);  } diff --git a/src/video_core/gpu_synch.h b/src/video_core/gpu_synch.h index 297258cb1..9d778c71a 100644 --- a/src/video_core/gpu_synch.h +++ b/src/video_core/gpu_synch.h @@ -19,13 +19,14 @@ namespace VideoCommon {  /// Implementation of GPU interface that runs the GPU synchronously  class GPUSynch final : public Tegra::GPU {  public: -    explicit GPUSynch(Core::System& system); +    explicit GPUSynch(Core::System& system, bool use_nvdec);      ~GPUSynch() override;      void Start() override;      void ObtainContext() override;      void ReleaseContext() override;      void PushGPUEntries(Tegra::CommandList&& entries) override; +    void PushCommandBuffer(Tegra::ChCommandHeaderList& entries) override;      void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;      void FlushRegion(VAddr addr, u64 size) override;      void InvalidateRegion(VAddr addr, u64 size) override; diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp index bf761abf2..4b8f58283 100644 --- 
a/src/video_core/gpu_thread.cpp +++ b/src/video_core/gpu_thread.cpp @@ -18,7 +18,7 @@ namespace VideoCommon::GPUThread {  /// Runs the GPU thread  static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,                        Core::Frontend::GraphicsContext& context, Tegra::DmaPusher& dma_pusher, -                      SynchState& state) { +                      SynchState& state, Tegra::CDmaPusher& cdma_pusher) {      std::string name = "yuzu:GPU";      MicroProfileOnThreadCreate(name.c_str());      Common::SetCurrentThreadName(name.c_str()); @@ -42,6 +42,10 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,          if (const auto submit_list = std::get_if<SubmitListCommand>(&next.data)) {              dma_pusher.Push(std::move(submit_list->entries));              dma_pusher.DispatchCalls(); +        } else if (const auto command_list = std::get_if<SubmitChCommandEntries>(&next.data)) { +            // NVDEC +            cdma_pusher.Push(std::move(command_list->entries)); +            cdma_pusher.DispatchCalls();          } else if (const auto data = std::get_if<SwapBuffersCommand>(&next.data)) {              renderer.SwapBuffers(data->framebuffer ? 
&*data->framebuffer : nullptr);          } else if (std::holds_alternative<OnCommandListEndCommand>(next.data)) { @@ -75,15 +79,19 @@ ThreadManager::~ThreadManager() {  void ThreadManager::StartThread(VideoCore::RendererBase& renderer,                                  Core::Frontend::GraphicsContext& context, -                                Tegra::DmaPusher& dma_pusher) { -    thread = std::thread{RunThread,         std::ref(system),     std::ref(renderer), -                         std::ref(context), std::ref(dma_pusher), std::ref(state)}; +                                Tegra::DmaPusher& dma_pusher, Tegra::CDmaPusher& cdma_pusher) { +    thread = std::thread(RunThread, std::ref(system), std::ref(renderer), std::ref(context), +                         std::ref(dma_pusher), std::ref(state), std::ref(cdma_pusher));  }  void ThreadManager::SubmitList(Tegra::CommandList&& entries) {      PushCommand(SubmitListCommand(std::move(entries)));  } +void ThreadManager::SubmitCommandBuffer(Tegra::ChCommandHeaderList&& entries) { +    PushCommand(SubmitChCommandEntries(std::move(entries))); +} +  void ThreadManager::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {      PushCommand(SwapBuffersCommand(framebuffer ? 
std::make_optional(*framebuffer) : std::nullopt));  } diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h index 5a28335d6..32a34e3a7 100644 --- a/src/video_core/gpu_thread.h +++ b/src/video_core/gpu_thread.h @@ -37,6 +37,14 @@ struct SubmitListCommand final {      Tegra::CommandList entries;  }; +/// Command to signal to the GPU thread that a cdma command list is ready for processing +struct SubmitChCommandEntries final { +    explicit SubmitChCommandEntries(Tegra::ChCommandHeaderList&& entries) +        : entries{std::move(entries)} {} + +    Tegra::ChCommandHeaderList entries; +}; +  /// Command to signal to the GPU thread that a swap buffers is pending  struct SwapBuffersCommand final {      explicit SwapBuffersCommand(std::optional<const Tegra::FramebufferConfig> framebuffer) @@ -77,9 +85,9 @@ struct OnCommandListEndCommand final {};  struct GPUTickCommand final {};  using CommandData = -    std::variant<EndProcessingCommand, SubmitListCommand, SwapBuffersCommand, FlushRegionCommand, -                 InvalidateRegionCommand, FlushAndInvalidateRegionCommand, OnCommandListEndCommand, -                 GPUTickCommand>; +    std::variant<EndProcessingCommand, SubmitListCommand, SubmitChCommandEntries, +                 SwapBuffersCommand, FlushRegionCommand, InvalidateRegionCommand, +                 FlushAndInvalidateRegionCommand, OnCommandListEndCommand, GPUTickCommand>;  struct CommandDataContainer {      CommandDataContainer() = default; @@ -109,11 +117,14 @@ public:      /// Creates and starts the GPU thread.      
void StartThread(VideoCore::RendererBase& renderer, Core::Frontend::GraphicsContext& context, -                     Tegra::DmaPusher& dma_pusher); +                     Tegra::DmaPusher& dma_pusher, Tegra::CDmaPusher& cdma_pusher);      /// Push GPU command entries to be processed      void SubmitList(Tegra::CommandList&& entries); +    /// Push GPU CDMA command buffer entries to be processed +    void SubmitCommandBuffer(Tegra::ChCommandHeaderList&& entries); +      /// Swap buffers (render frame)      void SwapBuffers(const Tegra::FramebufferConfig* framebuffer); diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index 02cf53d15..6e70bd362 100644 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -11,6 +11,7 @@  #include "video_core/gpu.h"  #include "video_core/memory_manager.h"  #include "video_core/rasterizer_interface.h" +#include "video_core/renderer_base.h"  namespace Tegra { @@ -44,6 +45,12 @@ GPUVAddr MemoryManager::MapAllocate(VAddr cpu_addr, std::size_t size, std::size_      return Map(cpu_addr, *FindFreeRange(size, align), size);  } +GPUVAddr MemoryManager::MapAllocate32(VAddr cpu_addr, std::size_t size) { +    const std::optional<GPUVAddr> gpu_addr = FindFreeRange(size, 1, true); +    ASSERT(gpu_addr); +    return Map(cpu_addr, *gpu_addr, size); +} +  void MemoryManager::Unmap(GPUVAddr gpu_addr, std::size_t size) {      if (!size) {          return; @@ -108,7 +115,8 @@ void MemoryManager::SetPageEntry(GPUVAddr gpu_addr, PageEntry page_entry, std::s      page_table[PageEntryIndex(gpu_addr)] = page_entry;  } -std::optional<GPUVAddr> MemoryManager::FindFreeRange(std::size_t size, std::size_t align) const { +std::optional<GPUVAddr> MemoryManager::FindFreeRange(std::size_t size, std::size_t align, +                                                     bool start_32bit_address) const {      if (!align) {          align = page_size;      } else { @@ -116,7 +124,7 @@ std::optional<GPUVAddr> 
MemoryManager::FindFreeRange(std::size_t size, std::size      }      u64 available_size{}; -    GPUVAddr gpu_addr{address_space_start}; +    GPUVAddr gpu_addr{start_32bit_address ? address_space_start_low : address_space_start};      while (gpu_addr + available_size < address_space_size) {          if (GetPageEntry(gpu_addr + available_size).IsUnmapped()) {              available_size += page_size; diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index 53c8d122a..c078193d9 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -116,6 +116,7 @@ public:      [[nodiscard]] GPUVAddr Map(VAddr cpu_addr, GPUVAddr gpu_addr, std::size_t size);      [[nodiscard]] GPUVAddr MapAllocate(VAddr cpu_addr, std::size_t size, std::size_t align); +    [[nodiscard]] GPUVAddr MapAllocate32(VAddr cpu_addr, std::size_t size);      [[nodiscard]] std::optional<GPUVAddr> AllocateFixed(GPUVAddr gpu_addr, std::size_t size);      [[nodiscard]] GPUVAddr Allocate(std::size_t size, std::size_t align);      void Unmap(GPUVAddr gpu_addr, std::size_t size); @@ -124,7 +125,8 @@ private:      [[nodiscard]] PageEntry GetPageEntry(GPUVAddr gpu_addr) const;      void SetPageEntry(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size = page_size);      GPUVAddr UpdateRange(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size); -    [[nodiscard]] std::optional<GPUVAddr> FindFreeRange(std::size_t size, std::size_t align) const; +    [[nodiscard]] std::optional<GPUVAddr> FindFreeRange(std::size_t size, std::size_t align, +                                                        bool start_32bit_address = false) const;      void TryLockPage(PageEntry page_entry, std::size_t size);      void TryUnlockPage(PageEntry page_entry, std::size_t size); @@ -135,6 +137,7 @@ private:      static constexpr u64 address_space_size = 1ULL << 40;      static constexpr u64 address_space_start = 1ULL << 32; +    static constexpr u64 address_space_start_low = 
1ULL << 16;      static constexpr u64 page_bits{16};      static constexpr u64 page_size{1 << page_bits};      static constexpr u64 page_mask{page_size - 1}; diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp index a14df06a3..dd5cee4a1 100644 --- a/src/video_core/video_core.cpp +++ b/src/video_core/video_core.cpp @@ -44,10 +44,11 @@ namespace VideoCore {  std::unique_ptr<Tegra::GPU> CreateGPU(Core::Frontend::EmuWindow& emu_window, Core::System& system) {      std::unique_ptr<Tegra::GPU> gpu; +    const bool use_nvdec = Settings::values.use_nvdec_emulation.GetValue();      if (Settings::values.use_asynchronous_gpu_emulation.GetValue()) { -        gpu = std::make_unique<VideoCommon::GPUAsynch>(system); +        gpu = std::make_unique<VideoCommon::GPUAsynch>(system, use_nvdec);      } else { -        gpu = std::make_unique<VideoCommon::GPUSynch>(system); +        gpu = std::make_unique<VideoCommon::GPUSynch>(system, use_nvdec);      }      auto context = emu_window.CreateSharedContext(); diff --git a/src/yuzu/CMakeLists.txt b/src/yuzu/CMakeLists.txt index cc0291b15..4659e1f89 100644 --- a/src/yuzu/CMakeLists.txt +++ b/src/yuzu/CMakeLists.txt @@ -265,9 +265,11 @@ if (MSVC)      include(CopyYuzuQt5Deps)      include(CopyYuzuSDLDeps)      include(CopyYuzuUnicornDeps) +    include(CopyYuzuFFmpegDeps)      copy_yuzu_Qt5_deps(yuzu)      copy_yuzu_SDL_deps(yuzu)      copy_yuzu_unicorn_deps(yuzu) +    copy_yuzu_FFmpeg_deps(yuzu)  endif()  if (NOT APPLE) diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp index d2913d613..abbc83929 100644 --- a/src/yuzu/configuration/config.cpp +++ b/src/yuzu/configuration/config.cpp @@ -717,6 +717,8 @@ void Config::ReadRendererValues() {      ReadSettingGlobal(Settings::values.gpu_accuracy, QStringLiteral("gpu_accuracy"), 0);      ReadSettingGlobal(Settings::values.use_asynchronous_gpu_emulation,                        QStringLiteral("use_asynchronous_gpu_emulation"), false); +    
ReadSettingGlobal(Settings::values.use_nvdec_emulation, QStringLiteral("use_nvdec_emulation"), +                      true);      ReadSettingGlobal(Settings::values.use_vsync, QStringLiteral("use_vsync"), true);      ReadSettingGlobal(Settings::values.use_assembly_shaders, QStringLiteral("use_assembly_shaders"),                        false); @@ -1265,6 +1267,8 @@ void Config::SaveRendererValues() {                         Settings::values.gpu_accuracy.UsingGlobal(), 0);      WriteSettingGlobal(QStringLiteral("use_asynchronous_gpu_emulation"),                         Settings::values.use_asynchronous_gpu_emulation, false); +    WriteSettingGlobal(QStringLiteral("use_nvdec_emulation"), Settings::values.use_nvdec_emulation, +                       true);      WriteSettingGlobal(QStringLiteral("use_vsync"), Settings::values.use_vsync, true);      WriteSettingGlobal(QStringLiteral("use_assembly_shaders"),                         Settings::values.use_assembly_shaders, false); diff --git a/src/yuzu/configuration/configure_graphics.cpp b/src/yuzu/configuration/configure_graphics.cpp index 07d818548..4f083ecda 100644 --- a/src/yuzu/configuration/configure_graphics.cpp +++ b/src/yuzu/configuration/configure_graphics.cpp @@ -70,9 +70,11 @@ void ConfigureGraphics::SetConfiguration() {      ui->api->setEnabled(runtime_lock);      ui->use_asynchronous_gpu_emulation->setEnabled(runtime_lock);      ui->use_disk_shader_cache->setEnabled(runtime_lock); +    ui->use_nvdec_emulation->setEnabled(runtime_lock);      ui->use_disk_shader_cache->setChecked(Settings::values.use_disk_shader_cache.GetValue());      ui->use_asynchronous_gpu_emulation->setChecked(          Settings::values.use_asynchronous_gpu_emulation.GetValue()); +    ui->use_nvdec_emulation->setChecked(Settings::values.use_nvdec_emulation.GetValue());      if (Settings::configuring_global) {          ui->api->setCurrentIndex(static_cast<int>(Settings::values.renderer_backend.GetValue())); @@ -116,6 +118,9 @@ void 
ConfigureGraphics::ApplyConfiguration() {              Settings::values.use_asynchronous_gpu_emulation.SetValue(                  ui->use_asynchronous_gpu_emulation->isChecked());          } +        if (Settings::values.use_nvdec_emulation.UsingGlobal()) { +            Settings::values.use_nvdec_emulation.SetValue(ui->use_nvdec_emulation->isChecked()); +        }          if (Settings::values.bg_red.UsingGlobal()) {              Settings::values.bg_red.SetValue(static_cast<float>(bg_color.redF()));              Settings::values.bg_green.SetValue(static_cast<float>(bg_color.greenF())); @@ -144,6 +149,8 @@ void ConfigureGraphics::ApplyConfiguration() {          ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_asynchronous_gpu_emulation,                                                   ui->use_asynchronous_gpu_emulation,                                                   use_asynchronous_gpu_emulation); +        ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_nvdec_emulation, +                                                 ui->use_nvdec_emulation, use_nvdec_emulation);          if (ui->bg_combobox->currentIndex() == ConfigurationShared::USE_GLOBAL_INDEX) {              Settings::values.bg_red.SetGlobal(true); @@ -240,6 +247,7 @@ void ConfigureGraphics::SetupPerGameUI() {          ui->aspect_ratio_combobox->setEnabled(Settings::values.aspect_ratio.UsingGlobal());          ui->use_asynchronous_gpu_emulation->setEnabled(              Settings::values.use_asynchronous_gpu_emulation.UsingGlobal()); +        ui->use_nvdec_emulation->setEnabled(Settings::values.use_nvdec_emulation.UsingGlobal());          ui->use_disk_shader_cache->setEnabled(Settings::values.use_disk_shader_cache.UsingGlobal());          ui->bg_button->setEnabled(Settings::values.bg_red.UsingGlobal()); @@ -253,6 +261,8 @@ void ConfigureGraphics::SetupPerGameUI() {      ConfigurationShared::SetColoredTristate(          ui->use_disk_shader_cache, 
Settings::values.use_disk_shader_cache, use_disk_shader_cache); +    ConfigurationShared::SetColoredTristate( +        ui->use_nvdec_emulation, Settings::values.use_nvdec_emulation, use_nvdec_emulation);      ConfigurationShared::SetColoredTristate(ui->use_asynchronous_gpu_emulation,                                              Settings::values.use_asynchronous_gpu_emulation,                                              use_asynchronous_gpu_emulation); diff --git a/src/yuzu/configuration/configure_graphics.h b/src/yuzu/configuration/configure_graphics.h index b4961f719..1fefc88eb 100644 --- a/src/yuzu/configuration/configure_graphics.h +++ b/src/yuzu/configuration/configure_graphics.h @@ -46,6 +46,7 @@ private:      std::unique_ptr<Ui::ConfigureGraphics> ui;      QColor bg_color; +    ConfigurationShared::CheckState use_nvdec_emulation;      ConfigurationShared::CheckState use_disk_shader_cache;      ConfigurationShared::CheckState use_asynchronous_gpu_emulation; diff --git a/src/yuzu/configuration/configure_graphics.ui b/src/yuzu/configuration/configure_graphics.ui index 62aa337e7..58486eb1e 100644 --- a/src/yuzu/configuration/configure_graphics.ui +++ b/src/yuzu/configuration/configure_graphics.ui @@ -98,6 +98,13 @@           </widget>          </item>          <item> +         <widget class="QCheckBox" name="use_nvdec_emulation"> +          <property name="text"> +           <string>Use NVDEC emulation</string> +          </property> +         </widget> +        </item> +        <item>           <widget class="QWidget" name="aspect_ratio_layout" native="true">            <layout class="QHBoxLayout" name="horizontalLayout_6">             <property name="leftMargin"> | 
