Diffstat (limited to 'src'): 208 files changed, 6146 insertions, 1795 deletions
diff --git a/src/common/logging/backend.cpp b/src/common/logging/backend.cpp index 8f2591d53..04bc3128f 100644 --- a/src/common/logging/backend.cpp +++ b/src/common/logging/backend.cpp @@ -120,7 +120,7 @@ private: duration_cast<std::chrono::microseconds>(steady_clock::now() - time_origin); entry.log_class = log_class; entry.log_level = log_level; - entry.filename = Common::TrimSourcePath(filename); + entry.filename = filename; entry.line_num = line_nr; entry.function = function; entry.message = std::move(message); diff --git a/src/common/logging/backend.h b/src/common/logging/backend.h index fca0267a1..fc338c70d 100644 --- a/src/common/logging/backend.h +++ b/src/common/logging/backend.h @@ -23,7 +23,7 @@ struct Entry { std::chrono::microseconds timestamp; Class log_class; Level log_level; - std::string filename; + const char* filename; unsigned int line_num; std::string function; std::string message; diff --git a/src/common/logging/log.h b/src/common/logging/log.h index 259708116..13a4f1e30 100644 --- a/src/common/logging/log.h +++ b/src/common/logging/log.h @@ -9,6 +9,15 @@ namespace Log { +// trims up to and including the last of ../, ..\, src/, src\ in a string +constexpr const char* TrimSourcePath(std::string_view source) { + const auto rfind = [source](const std::string_view match) { + return source.rfind(match) == source.npos ? 0 : (source.rfind(match) + match.size()); + }; + auto idx = std::max({rfind("src/"), rfind("src\\"), rfind("../"), rfind("..\\")}); + return source.data() + idx; +} + /// Specifies the severity or level of detail of the log message. enum class Level : u8 { Trace, ///< Extremely detailed and repetitive debugging information that is likely to @@ -141,24 +150,24 @@ void FmtLogMessage(Class log_class, Level log_level, const char* filename, unsig #ifdef _DEBUG #define LOG_TRACE(log_class, ...) \ - ::Log::FmtLogMessage(::Log::Class::log_class, ::Log::Level::Trace, __FILE__, __LINE__, \ - __func__, __VA_ARGS__) + ::Log::FmtLogMessage(::Log::Class::log_class, ::Log::Level::Trace, \ + ::Log::TrimSourcePath(__FILE__), __LINE__, __func__, __VA_ARGS__) #else #define LOG_TRACE(log_class, fmt, ...) (void(0)) #endif #define LOG_DEBUG(log_class, ...) \ - ::Log::FmtLogMessage(::Log::Class::log_class, ::Log::Level::Debug, __FILE__, __LINE__, \ - __func__, __VA_ARGS__) + ::Log::FmtLogMessage(::Log::Class::log_class, ::Log::Level::Debug, \ + ::Log::TrimSourcePath(__FILE__), __LINE__, __func__, __VA_ARGS__) #define LOG_INFO(log_class, ...) \ - ::Log::FmtLogMessage(::Log::Class::log_class, ::Log::Level::Info, __FILE__, __LINE__, \ - __func__, __VA_ARGS__) + ::Log::FmtLogMessage(::Log::Class::log_class, ::Log::Level::Info, \ + ::Log::TrimSourcePath(__FILE__), __LINE__, __func__, __VA_ARGS__) #define LOG_WARNING(log_class, ...) \ - ::Log::FmtLogMessage(::Log::Class::log_class, ::Log::Level::Warning, __FILE__, __LINE__, \ - __func__, __VA_ARGS__) + ::Log::FmtLogMessage(::Log::Class::log_class, ::Log::Level::Warning, \ + ::Log::TrimSourcePath(__FILE__), __LINE__, __func__, __VA_ARGS__) #define LOG_ERROR(log_class, ...) \ - ::Log::FmtLogMessage(::Log::Class::log_class, ::Log::Level::Error, __FILE__, __LINE__, \ - __func__, __VA_ARGS__) + ::Log::FmtLogMessage(::Log::Class::log_class, ::Log::Level::Error, \ + ::Log::TrimSourcePath(__FILE__), __LINE__, __func__, __VA_ARGS__) #define LOG_CRITICAL(log_class, ...) 
\ - ::Log::FmtLogMessage(::Log::Class::log_class, ::Log::Level::Critical, __FILE__, __LINE__, \ - __func__, __VA_ARGS__) + ::Log::FmtLogMessage(::Log::Class::log_class, ::Log::Level::Critical, \ + ::Log::TrimSourcePath(__FILE__), __LINE__, __func__, __VA_ARGS__) diff --git a/src/common/string_util.cpp b/src/common/string_util.cpp index 959f278aa..84883a1d3 100644 --- a/src/common/string_util.cpp +++ b/src/common/string_util.cpp @@ -223,26 +223,4 @@ std::u16string UTF16StringFromFixedZeroTerminatedBuffer(std::u16string_view buff return std::u16string(buffer.begin(), buffer.begin() + len); } -const char* TrimSourcePath(const char* path, const char* root) { - const char* p = path; - - while (*p != '\0') { - const char* next_slash = p; - while (*next_slash != '\0' && *next_slash != '/' && *next_slash != '\\') { - ++next_slash; - } - - bool is_src = Common::ComparePartialString(p, next_slash, root); - p = next_slash; - - if (*p != '\0') { - ++p; - } - if (is_src) { - path = p; - } - } - return path; -} - } // namespace Common diff --git a/src/common/thread.h b/src/common/thread.h index 0cfd98be6..2fc071685 100644 --- a/src/common/thread.h +++ b/src/common/thread.h @@ -28,6 +28,15 @@ public: is_set = false; } + template <class Duration> + bool WaitFor(const std::chrono::duration<Duration>& time) { + std::unique_lock lk{mutex}; + if (!condvar.wait_for(lk, time, [this] { return is_set; })) + return false; + is_set = false; + return true; + } + template <class Clock, class Duration> bool WaitUntil(const std::chrono::time_point<Clock, Duration>& time) { std::unique_lock lk{mutex}; diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 1a3647a67..26612e692 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -15,14 +15,14 @@ add_library(core STATIC constants.h core.cpp core.h - core_cpu.cpp - core_cpu.h + core_manager.cpp + core_manager.h core_timing.cpp core_timing.h core_timing_util.cpp core_timing_util.h - cpu_core_manager.cpp - cpu_core_manager.h + cpu_manager.cpp + cpu_manager.h crypto/aes_util.cpp crypto/aes_util.h crypto/encryption_layer.cpp @@ -158,6 +158,8 @@ add_library(core STATIC hle/kernel/mutex.h hle/kernel/object.cpp hle/kernel/object.h + hle/kernel/physical_core.cpp + hle/kernel/physical_core.h hle/kernel/process.cpp hle/kernel/process.h hle/kernel/process_capability.cpp @@ -179,14 +181,16 @@ add_library(core STATIC hle/kernel/svc.cpp hle/kernel/svc.h hle/kernel/svc_wrap.h + hle/kernel/synchronization_object.cpp + hle/kernel/synchronization_object.h + hle/kernel/synchronization.cpp + hle/kernel/synchronization.h hle/kernel/thread.cpp hle/kernel/thread.h hle/kernel/transfer_memory.cpp hle/kernel/transfer_memory.h hle/kernel/vm_manager.cpp hle/kernel/vm_manager.h - hle/kernel/wait_object.cpp - hle/kernel/wait_object.h hle/kernel/writable_event.cpp hle/kernel/writable_event.h hle/lock.cpp diff --git a/src/core/arm/dynarmic/arm_dynarmic.cpp b/src/core/arm/dynarmic/arm_dynarmic.cpp index e825c0526..29eaf74e5 100644 --- a/src/core/arm/dynarmic/arm_dynarmic.cpp +++ b/src/core/arm/dynarmic/arm_dynarmic.cpp @@ -10,11 +10,13 @@ #include "common/microprofile.h" #include "core/arm/dynarmic/arm_dynarmic.h" #include "core/core.h" -#include "core/core_cpu.h" +#include "core/core_manager.h" #include "core/core_timing.h" #include "core/core_timing_util.h" #include "core/gdbstub/gdbstub.h" +#include "core/hardware_properties.h" #include "core/hle/kernel/process.h" +#include "core/hle/kernel/scheduler.h" #include "core/hle/kernel/svc.h" #include 
"core/hle/kernel/vm_manager.h" #include "core/memory.h" @@ -87,7 +89,7 @@ public: if (GDBStub::IsServerEnabled()) { parent.jit->HaltExecution(); parent.SetPC(pc); - Kernel::Thread* thread = Kernel::GetCurrentThread(); + Kernel::Thread* const thread = parent.system.CurrentScheduler().GetCurrentThread(); parent.SaveContext(thread->GetContext()); GDBStub::Break(); GDBStub::SendTrap(thread, 5); @@ -152,7 +154,7 @@ std::unique_ptr<Dynarmic::A64::Jit> ARM_Dynarmic::MakeJit(Common::PageTable& pag config.tpidr_el0 = &cb->tpidr_el0; config.dczid_el0 = 4; config.ctr_el0 = 0x8444c004; - config.cntfrq_el0 = Timing::CNTFREQ; + config.cntfrq_el0 = Hardware::CNTFREQ; // Unpredictable instructions config.define_unpredictable_behaviour = true; diff --git a/src/core/arm/exclusive_monitor.cpp b/src/core/arm/exclusive_monitor.cpp index abd59ff4b..94570e520 100644 --- a/src/core/arm/exclusive_monitor.cpp +++ b/src/core/arm/exclusive_monitor.cpp @@ -2,10 +2,24 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#ifdef ARCHITECTURE_x86_64 +#include "core/arm/dynarmic/arm_dynarmic.h" +#endif #include "core/arm/exclusive_monitor.h" +#include "core/memory.h" namespace Core { ExclusiveMonitor::~ExclusiveMonitor() = default; +std::unique_ptr<Core::ExclusiveMonitor> MakeExclusiveMonitor(Memory::Memory& memory, + std::size_t num_cores) { +#ifdef ARCHITECTURE_x86_64 + return std::make_unique<Core::DynarmicExclusiveMonitor>(memory, num_cores); +#else + // TODO(merry): Passthrough exclusive monitor + return nullptr; +#endif +} + } // namespace Core diff --git a/src/core/arm/exclusive_monitor.h b/src/core/arm/exclusive_monitor.h index f59aca667..4ef418b90 100644 --- a/src/core/arm/exclusive_monitor.h +++ b/src/core/arm/exclusive_monitor.h @@ -4,8 +4,14 @@ #pragma once +#include <memory> + #include "common/common_types.h" +namespace Memory { +class Memory; +} + namespace Core { class ExclusiveMonitor { @@ -22,4 +28,7 @@ public: virtual bool ExclusiveWrite128(std::size_t core_index, VAddr vaddr, u128 value) = 0; }; +std::unique_ptr<Core::ExclusiveMonitor> MakeExclusiveMonitor(Memory::Memory& memory, + std::size_t num_cores); + } // namespace Core diff --git a/src/core/arm/unicorn/arm_unicorn.cpp b/src/core/arm/unicorn/arm_unicorn.cpp index 48182c99a..f99ad5802 100644 --- a/src/core/arm/unicorn/arm_unicorn.cpp +++ b/src/core/arm/unicorn/arm_unicorn.cpp @@ -9,6 +9,7 @@ #include "core/arm/unicorn/arm_unicorn.h" #include "core/core.h" #include "core/core_timing.h" +#include "core/hle/kernel/scheduler.h" #include "core/hle/kernel/svc.h" namespace Core { @@ -177,7 +178,7 @@ void ARM_Unicorn::ExecuteInstructions(std::size_t num_instructions) { uc_reg_write(uc, UC_ARM64_REG_PC, &last_bkpt.address); } - Kernel::Thread* thread = Kernel::GetCurrentThread(); + Kernel::Thread* const thread = system.CurrentScheduler().GetCurrentThread(); SaveContext(thread->GetContext()); if (last_bkpt_hit || GDBStub::IsMemoryBreak() || GDBStub::GetCpuStepFlag()) { last_bkpt_hit = false; diff --git a/src/core/core.cpp b/src/core/core.cpp index d697b80ef..0eb0c0dca 100644 --- a/src/core/core.cpp +++ b/src/core/core.cpp @@ -11,9 +11,9 @@ #include "common/string_util.h" #include "core/arm/exclusive_monitor.h" #include "core/core.h" -#include "core/core_cpu.h" +#include "core/core_manager.h" #include "core/core_timing.h" -#include "core/cpu_core_manager.h" +#include "core/cpu_manager.h" #include "core/file_sys/bis_factory.h" #include "core/file_sys/card_image.h" #include "core/file_sys/mode.h" @@ -28,6 +28,7 @@ #include 
"core/hardware_interrupt_manager.h" #include "core/hle/kernel/client_port.h" #include "core/hle/kernel/kernel.h" +#include "core/hle/kernel/physical_core.h" #include "core/hle/kernel/process.h" #include "core/hle/kernel/scheduler.h" #include "core/hle/kernel/thread.h" @@ -113,16 +114,25 @@ FileSys::VirtualFile GetGameFileFromPath(const FileSys::VirtualFilesystem& vfs, struct System::Impl { explicit Impl(System& system) : kernel{system}, fs_controller{system}, memory{system}, - cpu_core_manager{system}, reporter{system}, applet_manager{system} {} + cpu_manager{system}, reporter{system}, applet_manager{system} {} - Cpu& CurrentCpuCore() { - return cpu_core_manager.GetCurrentCore(); + CoreManager& CurrentCoreManager() { + return cpu_manager.GetCurrentCoreManager(); + } + + Kernel::PhysicalCore& CurrentPhysicalCore() { + const auto index = cpu_manager.GetActiveCoreIndex(); + return kernel.PhysicalCore(index); + } + + Kernel::PhysicalCore& GetPhysicalCore(std::size_t index) { + return kernel.PhysicalCore(index); } ResultStatus RunLoop(bool tight_loop) { status = ResultStatus::Success; - cpu_core_manager.RunLoop(tight_loop); + cpu_manager.RunLoop(tight_loop); return status; } @@ -131,8 +141,8 @@ struct System::Impl { LOG_DEBUG(HW_Memory, "initialized OK"); core_timing.Initialize(); - cpu_core_manager.Initialize(); kernel.Initialize(); + cpu_manager.Initialize(); const auto current_time = std::chrono::duration_cast<std::chrono::seconds>( std::chrono::system_clock::now().time_since_epoch()); @@ -205,7 +215,6 @@ struct System::Impl { // Main process has been loaded and been made current. // Begin GPU and CPU execution. gpu_core->Start(); - cpu_core_manager.StartThreads(); // Initialize cheat engine if (cheat_engine) { @@ -259,7 +268,9 @@ struct System::Impl { is_powered_on = false; exit_lock = false; - gpu_core->WaitIdle(); + if (gpu_core) { + gpu_core->WaitIdle(); + } // Shutdown emulation session renderer.reset(); @@ -272,7 +283,7 @@ struct System::Impl { gpu_core.reset(); // Close all CPU/threading state - cpu_core_manager.Shutdown(); + cpu_manager.Shutdown(); // Shutdown kernel and core timing kernel.Shutdown(); @@ -342,7 +353,7 @@ struct System::Impl { std::unique_ptr<Tegra::GPU> gpu_core; std::unique_ptr<Hardware::InterruptManager> interrupt_manager; Memory::Memory memory; - CpuCoreManager cpu_core_manager; + CpuManager cpu_manager; bool is_powered_on = false; bool exit_lock = false; @@ -377,12 +388,12 @@ struct System::Impl { System::System() : impl{std::make_unique<Impl>(*this)} {} System::~System() = default; -Cpu& System::CurrentCpuCore() { - return impl->CurrentCpuCore(); +CoreManager& System::CurrentCoreManager() { + return impl->CurrentCoreManager(); } -const Cpu& System::CurrentCpuCore() const { - return impl->CurrentCpuCore(); +const CoreManager& System::CurrentCoreManager() const { + return impl->CurrentCoreManager(); } System::ResultStatus System::RunLoop(bool tight_loop) { @@ -394,7 +405,7 @@ System::ResultStatus System::SingleStep() { } void System::InvalidateCpuInstructionCaches() { - impl->cpu_core_manager.InvalidateAllInstructionCaches(); + impl->kernel.InvalidateAllInstructionCaches(); } System::ResultStatus System::Load(Frontend::EmuWindow& emu_window, const std::string& filepath) { @@ -406,13 +417,11 @@ bool System::IsPoweredOn() const { } void System::PrepareReschedule() { - CurrentCpuCore().PrepareReschedule(); + impl->CurrentPhysicalCore().Stop(); } void System::PrepareReschedule(const u32 core_index) { - if (core_index < GlobalScheduler().CpuCoresCount()) { - 
CpuCore(core_index).PrepareReschedule(); - } + impl->kernel.PrepareReschedule(core_index); } PerfStatsResults System::GetAndResetPerfStats() { @@ -428,31 +437,31 @@ const TelemetrySession& System::TelemetrySession() const { } ARM_Interface& System::CurrentArmInterface() { - return CurrentCpuCore().ArmInterface(); + return impl->CurrentPhysicalCore().ArmInterface(); } const ARM_Interface& System::CurrentArmInterface() const { - return CurrentCpuCore().ArmInterface(); + return impl->CurrentPhysicalCore().ArmInterface(); } std::size_t System::CurrentCoreIndex() const { - return CurrentCpuCore().CoreIndex(); + return impl->cpu_manager.GetActiveCoreIndex(); } Kernel::Scheduler& System::CurrentScheduler() { - return CurrentCpuCore().Scheduler(); + return impl->CurrentPhysicalCore().Scheduler(); } const Kernel::Scheduler& System::CurrentScheduler() const { - return CurrentCpuCore().Scheduler(); + return impl->CurrentPhysicalCore().Scheduler(); } Kernel::Scheduler& System::Scheduler(std::size_t core_index) { - return CpuCore(core_index).Scheduler(); + return impl->GetPhysicalCore(core_index).Scheduler(); } const Kernel::Scheduler& System::Scheduler(std::size_t core_index) const { - return CpuCore(core_index).Scheduler(); + return impl->GetPhysicalCore(core_index).Scheduler(); } /// Gets the global scheduler @@ -474,28 +483,28 @@ const Kernel::Process* System::CurrentProcess() const { } ARM_Interface& System::ArmInterface(std::size_t core_index) { - return CpuCore(core_index).ArmInterface(); + return impl->GetPhysicalCore(core_index).ArmInterface(); } const ARM_Interface& System::ArmInterface(std::size_t core_index) const { - return CpuCore(core_index).ArmInterface(); + return impl->GetPhysicalCore(core_index).ArmInterface(); } -Cpu& System::CpuCore(std::size_t core_index) { - return impl->cpu_core_manager.GetCore(core_index); +CoreManager& System::GetCoreManager(std::size_t core_index) { + return impl->cpu_manager.GetCoreManager(core_index); } -const Cpu& System::CpuCore(std::size_t core_index) const { +const CoreManager& System::GetCoreManager(std::size_t core_index) const { ASSERT(core_index < NUM_CPU_CORES); - return impl->cpu_core_manager.GetCore(core_index); + return impl->cpu_manager.GetCoreManager(core_index); } ExclusiveMonitor& System::Monitor() { - return impl->cpu_core_manager.GetExclusiveMonitor(); + return impl->kernel.GetExclusiveMonitor(); } const ExclusiveMonitor& System::Monitor() const { - return impl->cpu_core_manager.GetExclusiveMonitor(); + return impl->kernel.GetExclusiveMonitor(); } Memory::Memory& System::Memory() { diff --git a/src/core/core.h b/src/core/core.h index e240c5c58..e69d68fcf 100644 --- a/src/core/core.h +++ b/src/core/core.h @@ -93,7 +93,7 @@ class Memory; namespace Core { class ARM_Interface; -class Cpu; +class CoreManager; class ExclusiveMonitor; class FrameLimiter; class PerfStats; @@ -218,10 +218,10 @@ public: const ARM_Interface& ArmInterface(std::size_t core_index) const; /// Gets a CPU interface to the CPU core with the specified index - Cpu& CpuCore(std::size_t core_index); + CoreManager& GetCoreManager(std::size_t core_index); /// Gets a CPU interface to the CPU core with the specified index - const Cpu& CpuCore(std::size_t core_index) const; + const CoreManager& GetCoreManager(std::size_t core_index) const; /// Gets a reference to the exclusive monitor ExclusiveMonitor& Monitor(); @@ -364,10 +364,10 @@ private: System(); /// Returns the currently running CPU core - Cpu& CurrentCpuCore(); + CoreManager& CurrentCoreManager(); /// Returns the currently 
running CPU core - const Cpu& CurrentCpuCore() const; + const CoreManager& CurrentCoreManager() const; /** * Initialize the emulated system. diff --git a/src/core/core_cpu.cpp b/src/core/core_cpu.cpp deleted file mode 100644 index 630cd4feb..000000000 --- a/src/core/core_cpu.cpp +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright 2018 yuzu emulator team -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include <condition_variable> -#include <mutex> - -#include "common/logging/log.h" -#ifdef ARCHITECTURE_x86_64 -#include "core/arm/dynarmic/arm_dynarmic.h" -#endif -#include "core/arm/exclusive_monitor.h" -#include "core/arm/unicorn/arm_unicorn.h" -#include "core/core.h" -#include "core/core_cpu.h" -#include "core/core_timing.h" -#include "core/hle/kernel/scheduler.h" -#include "core/hle/kernel/thread.h" -#include "core/hle/lock.h" -#include "core/settings.h" - -namespace Core { - -void CpuBarrier::NotifyEnd() { - std::unique_lock lock{mutex}; - end = true; - condition.notify_all(); -} - -bool CpuBarrier::Rendezvous() { - if (!Settings::values.use_multi_core) { - // Meaningless when running in single-core mode - return true; - } - - if (!end) { - std::unique_lock lock{mutex}; - - --cores_waiting; - if (!cores_waiting) { - cores_waiting = NUM_CPU_CORES; - condition.notify_all(); - return true; - } - - condition.wait(lock); - return true; - } - - return false; -} - -Cpu::Cpu(System& system, ExclusiveMonitor& exclusive_monitor, CpuBarrier& cpu_barrier, - std::size_t core_index) - : cpu_barrier{cpu_barrier}, global_scheduler{system.GlobalScheduler()}, - core_timing{system.CoreTiming()}, core_index{core_index} { -#ifdef ARCHITECTURE_x86_64 - arm_interface = std::make_unique<ARM_Dynarmic>(system, exclusive_monitor, core_index); -#else - arm_interface = std::make_unique<ARM_Unicorn>(system); - LOG_WARNING(Core, "CPU JIT requested, but Dynarmic not available"); -#endif - - scheduler = std::make_unique<Kernel::Scheduler>(system, *arm_interface, core_index); -} - -Cpu::~Cpu() = default; - -std::unique_ptr<ExclusiveMonitor> Cpu::MakeExclusiveMonitor( - [[maybe_unused]] Memory::Memory& memory, [[maybe_unused]] std::size_t num_cores) { -#ifdef ARCHITECTURE_x86_64 - return std::make_unique<DynarmicExclusiveMonitor>(memory, num_cores); -#else - // TODO(merry): Passthrough exclusive monitor - return nullptr; -#endif -} - -void Cpu::RunLoop(bool tight_loop) { - // Wait for all other CPU cores to complete the previous slice, such that they run in lock-step - if (!cpu_barrier.Rendezvous()) { - // If rendezvous failed, session has been killed - return; - } - - Reschedule(); - - // If we don't have a currently active thread then don't execute instructions, - // instead advance to the next event and try to yield to the next thread - if (Kernel::GetCurrentThread() == nullptr) { - LOG_TRACE(Core, "Core-{} idling", core_index); - core_timing.Idle(); - } else { - if (tight_loop) { - arm_interface->Run(); - } else { - arm_interface->Step(); - } - // We are stopping a run, exclusive state must be cleared - arm_interface->ClearExclusiveState(); - } - core_timing.Advance(); - - Reschedule(); -} - -void Cpu::SingleStep() { - return RunLoop(false); -} - -void Cpu::PrepareReschedule() { - arm_interface->PrepareReschedule(); -} - -void Cpu::Reschedule() { - // Lock the global kernel mutex when we manipulate the HLE state - std::lock_guard lock(HLE::g_hle_lock); - - global_scheduler.SelectThread(core_index); - scheduler->TryDoContextSwitch(); -} - -void Cpu::Shutdown() { - 
scheduler->Shutdown(); -} - -} // namespace Core diff --git a/src/core/core_cpu.h b/src/core/core_cpu.h deleted file mode 100644 index 78f5021a2..000000000 --- a/src/core/core_cpu.h +++ /dev/null @@ -1,120 +0,0 @@ -// Copyright 2018 yuzu emulator team -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <atomic> -#include <condition_variable> -#include <cstddef> -#include <memory> -#include <mutex> -#include "common/common_types.h" - -namespace Kernel { -class GlobalScheduler; -class Scheduler; -} // namespace Kernel - -namespace Core { -class System; -} - -namespace Core::Timing { -class CoreTiming; -} - -namespace Memory { -class Memory; -} - -namespace Core { - -class ARM_Interface; -class ExclusiveMonitor; - -constexpr unsigned NUM_CPU_CORES{4}; - -class CpuBarrier { -public: - bool IsAlive() const { - return !end; - } - - void NotifyEnd(); - - bool Rendezvous(); - -private: - unsigned cores_waiting{NUM_CPU_CORES}; - std::mutex mutex; - std::condition_variable condition; - std::atomic<bool> end{}; -}; - -class Cpu { -public: - Cpu(System& system, ExclusiveMonitor& exclusive_monitor, CpuBarrier& cpu_barrier, - std::size_t core_index); - ~Cpu(); - - void RunLoop(bool tight_loop = true); - - void SingleStep(); - - void PrepareReschedule(); - - ARM_Interface& ArmInterface() { - return *arm_interface; - } - - const ARM_Interface& ArmInterface() const { - return *arm_interface; - } - - Kernel::Scheduler& Scheduler() { - return *scheduler; - } - - const Kernel::Scheduler& Scheduler() const { - return *scheduler; - } - - bool IsMainCore() const { - return core_index == 0; - } - - std::size_t CoreIndex() const { - return core_index; - } - - void Shutdown(); - - /** - * Creates an exclusive monitor to handle exclusive reads/writes. - * - * @param memory The current memory subsystem that the monitor may wish - * to keep track of. - * - * @param num_cores The number of cores to assume about the CPU. - * - * @returns The constructed exclusive monitor instance, or nullptr if the current - * CPU backend is unable to use an exclusive monitor. - */ - static std::unique_ptr<ExclusiveMonitor> MakeExclusiveMonitor(Memory::Memory& memory, - std::size_t num_cores); - -private: - void Reschedule(); - - std::unique_ptr<ARM_Interface> arm_interface; - CpuBarrier& cpu_barrier; - Kernel::GlobalScheduler& global_scheduler; - std::unique_ptr<Kernel::Scheduler> scheduler; - Timing::CoreTiming& core_timing; - - std::atomic<bool> reschedule_pending = false; - std::size_t core_index; -}; - -} // namespace Core diff --git a/src/core/core_manager.cpp b/src/core/core_manager.cpp new file mode 100644 index 000000000..8eacf92dd --- /dev/null +++ b/src/core/core_manager.cpp @@ -0,0 +1,70 @@ +// Copyright 2018 yuzu emulator team +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include <condition_variable> +#include <mutex> + +#include "common/logging/log.h" +#ifdef ARCHITECTURE_x86_64 +#include "core/arm/dynarmic/arm_dynarmic.h" +#endif +#include "core/arm/exclusive_monitor.h" +#include "core/arm/unicorn/arm_unicorn.h" +#include "core/core.h" +#include "core/core_manager.h" +#include "core/core_timing.h" +#include "core/hle/kernel/kernel.h" +#include "core/hle/kernel/physical_core.h" +#include "core/hle/kernel/scheduler.h" +#include "core/hle/kernel/thread.h" +#include "core/hle/lock.h" +#include "core/settings.h" + +namespace Core { + +CoreManager::CoreManager(System& system, std::size_t core_index) + : global_scheduler{system.GlobalScheduler()}, physical_core{system.Kernel().PhysicalCore( + core_index)}, + core_timing{system.CoreTiming()}, core_index{core_index} {} + +CoreManager::~CoreManager() = default; + +void CoreManager::RunLoop(bool tight_loop) { + Reschedule(); + + // If we don't have a currently active thread then don't execute instructions, + // instead advance to the next event and try to yield to the next thread + if (Kernel::GetCurrentThread() == nullptr) { + LOG_TRACE(Core, "Core-{} idling", core_index); + core_timing.Idle(); + } else { + if (tight_loop) { + physical_core.Run(); + } else { + physical_core.Step(); + } + } + core_timing.Advance(); + + Reschedule(); +} + +void CoreManager::SingleStep() { + return RunLoop(false); +} + +void CoreManager::PrepareReschedule() { + physical_core.Stop(); +} + +void CoreManager::Reschedule() { + // Lock the global kernel mutex when we manipulate the HLE state + std::lock_guard lock(HLE::g_hle_lock); + + global_scheduler.SelectThread(core_index); + + physical_core.Scheduler().TryDoContextSwitch(); +} + +} // namespace Core diff --git a/src/core/core_manager.h b/src/core/core_manager.h new file mode 100644 index 000000000..b14e723d7 --- /dev/null +++ b/src/core/core_manager.h @@ -0,0 +1,63 @@ +// Copyright 2018 yuzu emulator team +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include <atomic> +#include <cstddef> +#include <memory> +#include "common/common_types.h" + +namespace Kernel { +class GlobalScheduler; +class PhysicalCore; +} // namespace Kernel + +namespace Core { +class System; +} + +namespace Core::Timing { +class CoreTiming; +} + +namespace Memory { +class Memory; +} + +namespace Core { + +constexpr unsigned NUM_CPU_CORES{4}; + +class CoreManager { +public: + CoreManager(System& system, std::size_t core_index); + ~CoreManager(); + + void RunLoop(bool tight_loop = true); + + void SingleStep(); + + void PrepareReschedule(); + + bool IsMainCore() const { + return core_index == 0; + } + + std::size_t CoreIndex() const { + return core_index; + } + +private: + void Reschedule(); + + Kernel::GlobalScheduler& global_scheduler; + Kernel::PhysicalCore& physical_core; + Timing::CoreTiming& core_timing; + + std::atomic<bool> reschedule_pending = false; + std::size_t core_index; +}; + +} // namespace Core diff --git a/src/core/core_timing.cpp b/src/core/core_timing.cpp index aa09fa453..46d4178c4 100644 --- a/src/core/core_timing.cpp +++ b/src/core/core_timing.cpp @@ -12,6 +12,7 @@ #include "common/assert.h" #include "common/thread.h" #include "core/core_timing_util.h" +#include "core/hardware_properties.h" namespace Core::Timing { @@ -215,7 +216,7 @@ void CoreTiming::Idle() { } std::chrono::microseconds CoreTiming::GetGlobalTimeUs() const { - return std::chrono::microseconds{GetTicks() * 1000000 / BASE_CLOCK_RATE}; + return std::chrono::microseconds{GetTicks() * 1000000 / Hardware::BASE_CLOCK_RATE}; } s64 CoreTiming::GetDowncount() const { diff --git a/src/core/core_timing_util.cpp b/src/core/core_timing_util.cpp index a10472a95..de50d3b14 100644 --- a/src/core/core_timing_util.cpp +++ b/src/core/core_timing_util.cpp @@ -11,7 +11,7 @@ namespace Core::Timing { -constexpr u64 MAX_VALUE_TO_MULTIPLY = std::numeric_limits<s64>::max() / BASE_CLOCK_RATE; +constexpr u64 MAX_VALUE_TO_MULTIPLY = std::numeric_limits<s64>::max() / Hardware::BASE_CLOCK_RATE; s64 msToCycles(std::chrono::milliseconds ms) { if (static_cast<u64>(ms.count() / 1000) > MAX_VALUE_TO_MULTIPLY) { @@ -20,9 +20,9 @@ s64 msToCycles(std::chrono::milliseconds ms) { } if (static_cast<u64>(ms.count()) > MAX_VALUE_TO_MULTIPLY) { LOG_DEBUG(Core_Timing, "Time very big, do rounding"); - return BASE_CLOCK_RATE * (ms.count() / 1000); + return Hardware::BASE_CLOCK_RATE * (ms.count() / 1000); } - return (BASE_CLOCK_RATE * ms.count()) / 1000; + return (Hardware::BASE_CLOCK_RATE * ms.count()) / 1000; } s64 usToCycles(std::chrono::microseconds us) { @@ -32,9 +32,9 @@ s64 usToCycles(std::chrono::microseconds us) { } if (static_cast<u64>(us.count()) > MAX_VALUE_TO_MULTIPLY) { LOG_DEBUG(Core_Timing, "Time very big, do rounding"); - return BASE_CLOCK_RATE * (us.count() / 1000000); + return Hardware::BASE_CLOCK_RATE * (us.count() / 1000000); } - return (BASE_CLOCK_RATE * us.count()) / 1000000; + return (Hardware::BASE_CLOCK_RATE * us.count()) / 1000000; } s64 nsToCycles(std::chrono::nanoseconds ns) { @@ -44,14 +44,14 @@ s64 nsToCycles(std::chrono::nanoseconds ns) { } if (static_cast<u64>(ns.count()) > MAX_VALUE_TO_MULTIPLY) { LOG_DEBUG(Core_Timing, "Time very big, do rounding"); - return BASE_CLOCK_RATE * (ns.count() / 1000000000); + return Hardware::BASE_CLOCK_RATE * (ns.count() / 1000000000); } - return (BASE_CLOCK_RATE * ns.count()) / 1000000000; + return (Hardware::BASE_CLOCK_RATE * ns.count()) / 1000000000; } u64 CpuCyclesToClockCycles(u64 ticks) { - const u128 temporal = 
Common::Multiply64Into128(ticks, CNTFREQ); - return Common::Divide128On32(temporal, static_cast<u32>(BASE_CLOCK_RATE)).first; + const u128 temporal = Common::Multiply64Into128(ticks, Hardware::CNTFREQ); + return Common::Divide128On32(temporal, static_cast<u32>(Hardware::BASE_CLOCK_RATE)).first; } } // namespace Core::Timing diff --git a/src/core/core_timing_util.h b/src/core/core_timing_util.h index cdd84d70f..addc72b19 100644 --- a/src/core/core_timing_util.h +++ b/src/core/core_timing_util.h @@ -6,28 +6,24 @@ #include <chrono> #include "common/common_types.h" +#include "core/hardware_properties.h" namespace Core::Timing { -// The below clock rate is based on Switch's clockspeed being widely known as 1.020GHz -// The exact value used is of course unverified. -constexpr u64 BASE_CLOCK_RATE = 1019215872; // Switch clock speed is 1020MHz un/docked -constexpr u64 CNTFREQ = 19200000; // Value from fusee. - s64 msToCycles(std::chrono::milliseconds ms); s64 usToCycles(std::chrono::microseconds us); s64 nsToCycles(std::chrono::nanoseconds ns); inline std::chrono::milliseconds CyclesToMs(s64 cycles) { - return std::chrono::milliseconds(cycles * 1000 / BASE_CLOCK_RATE); + return std::chrono::milliseconds(cycles * 1000 / Hardware::BASE_CLOCK_RATE); } inline std::chrono::nanoseconds CyclesToNs(s64 cycles) { - return std::chrono::nanoseconds(cycles * 1000000000 / BASE_CLOCK_RATE); + return std::chrono::nanoseconds(cycles * 1000000000 / Hardware::BASE_CLOCK_RATE); } inline std::chrono::microseconds CyclesToUs(s64 cycles) { - return std::chrono::microseconds(cycles * 1000000 / BASE_CLOCK_RATE); + return std::chrono::microseconds(cycles * 1000000 / Hardware::BASE_CLOCK_RATE); } u64 CpuCyclesToClockCycles(u64 ticks); diff --git a/src/core/cpu_core_manager.cpp b/src/core/cpu_core_manager.cpp deleted file mode 100644 index f04a34133..000000000 --- a/src/core/cpu_core_manager.cpp +++ /dev/null @@ -1,152 +0,0 @@ -// Copyright 2018 yuzu emulator team -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. 
- -#include "common/assert.h" -#include "core/arm/exclusive_monitor.h" -#include "core/core.h" -#include "core/core_cpu.h" -#include "core/core_timing.h" -#include "core/cpu_core_manager.h" -#include "core/gdbstub/gdbstub.h" -#include "core/settings.h" - -namespace Core { -namespace { -void RunCpuCore(const System& system, Cpu& cpu_state) { - while (system.IsPoweredOn()) { - cpu_state.RunLoop(true); - } -} -} // Anonymous namespace - -CpuCoreManager::CpuCoreManager(System& system) : system{system} {} -CpuCoreManager::~CpuCoreManager() = default; - -void CpuCoreManager::Initialize() { - barrier = std::make_unique<CpuBarrier>(); - exclusive_monitor = Cpu::MakeExclusiveMonitor(system.Memory(), cores.size()); - - for (std::size_t index = 0; index < cores.size(); ++index) { - cores[index] = std::make_unique<Cpu>(system, *exclusive_monitor, *barrier, index); - } -} - -void CpuCoreManager::StartThreads() { - // Create threads for CPU cores 1-3, and build thread_to_cpu map - // CPU core 0 is run on the main thread - thread_to_cpu[std::this_thread::get_id()] = cores[0].get(); - if (!Settings::values.use_multi_core) { - return; - } - - for (std::size_t index = 0; index < core_threads.size(); ++index) { - core_threads[index] = std::make_unique<std::thread>(RunCpuCore, std::cref(system), - std::ref(*cores[index + 1])); - thread_to_cpu[core_threads[index]->get_id()] = cores[index + 1].get(); - } -} - -void CpuCoreManager::Shutdown() { - barrier->NotifyEnd(); - if (Settings::values.use_multi_core) { - for (auto& thread : core_threads) { - thread->join(); - thread.reset(); - } - } - - thread_to_cpu.clear(); - for (auto& cpu_core : cores) { - cpu_core->Shutdown(); - cpu_core.reset(); - } - - exclusive_monitor.reset(); - barrier.reset(); -} - -Cpu& CpuCoreManager::GetCore(std::size_t index) { - return *cores.at(index); -} - -const Cpu& CpuCoreManager::GetCore(std::size_t index) const { - return *cores.at(index); -} - -ExclusiveMonitor& CpuCoreManager::GetExclusiveMonitor() { - return *exclusive_monitor; -} - -const ExclusiveMonitor& CpuCoreManager::GetExclusiveMonitor() const { - return *exclusive_monitor; -} - -Cpu& CpuCoreManager::GetCurrentCore() { - if (Settings::values.use_multi_core) { - const auto& search = thread_to_cpu.find(std::this_thread::get_id()); - ASSERT(search != thread_to_cpu.end()); - ASSERT(search->second); - return *search->second; - } - - // Otherwise, use single-threaded mode active_core variable - return *cores[active_core]; -} - -const Cpu& CpuCoreManager::GetCurrentCore() const { - if (Settings::values.use_multi_core) { - const auto& search = thread_to_cpu.find(std::this_thread::get_id()); - ASSERT(search != thread_to_cpu.end()); - ASSERT(search->second); - return *search->second; - } - - // Otherwise, use single-threaded mode active_core variable - return *cores[active_core]; -} - -void CpuCoreManager::RunLoop(bool tight_loop) { - // Update thread_to_cpu in case Core 0 is run from a different host thread - thread_to_cpu[std::this_thread::get_id()] = cores[0].get(); - - if (GDBStub::IsServerEnabled()) { - GDBStub::HandlePacket(); - - // If the loop is halted and we want to step, use a tiny (1) number of instructions to - // execute. Otherwise, get out of the loop function. 
- if (GDBStub::GetCpuHaltFlag()) { - if (GDBStub::GetCpuStepFlag()) { - tight_loop = false; - } else { - return; - } - } - } - - auto& core_timing = system.CoreTiming(); - core_timing.ResetRun(); - bool keep_running{}; - do { - keep_running = false; - for (active_core = 0; active_core < NUM_CPU_CORES; ++active_core) { - core_timing.SwitchContext(active_core); - if (core_timing.CanCurrentContextRun()) { - cores[active_core]->RunLoop(tight_loop); - } - keep_running |= core_timing.CanCurrentContextRun(); - } - } while (keep_running); - - if (GDBStub::IsServerEnabled()) { - GDBStub::SetCpuStepFlag(false); - } -} - -void CpuCoreManager::InvalidateAllInstructionCaches() { - for (auto& cpu : cores) { - cpu->ArmInterface().ClearInstructionCache(); - } -} - -} // namespace Core diff --git a/src/core/cpu_core_manager.h b/src/core/cpu_core_manager.h deleted file mode 100644 index 2cbbf8216..000000000 --- a/src/core/cpu_core_manager.h +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright 2018 yuzu emulator team -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <array> -#include <map> -#include <memory> -#include <thread> - -namespace Core { - -class Cpu; -class CpuBarrier; -class ExclusiveMonitor; -class System; - -class CpuCoreManager { -public: - explicit CpuCoreManager(System& system); - CpuCoreManager(const CpuCoreManager&) = delete; - CpuCoreManager(CpuCoreManager&&) = delete; - - ~CpuCoreManager(); - - CpuCoreManager& operator=(const CpuCoreManager&) = delete; - CpuCoreManager& operator=(CpuCoreManager&&) = delete; - - void Initialize(); - void StartThreads(); - void Shutdown(); - - Cpu& GetCore(std::size_t index); - const Cpu& GetCore(std::size_t index) const; - - Cpu& GetCurrentCore(); - const Cpu& GetCurrentCore() const; - - ExclusiveMonitor& GetExclusiveMonitor(); - const ExclusiveMonitor& GetExclusiveMonitor() const; - - void RunLoop(bool tight_loop); - - void InvalidateAllInstructionCaches(); - -private: - static constexpr std::size_t NUM_CPU_CORES = 4; - - std::unique_ptr<ExclusiveMonitor> exclusive_monitor; - std::unique_ptr<CpuBarrier> barrier; - std::array<std::unique_ptr<Cpu>, NUM_CPU_CORES> cores; - std::array<std::unique_ptr<std::thread>, NUM_CPU_CORES - 1> core_threads; - std::size_t active_core{}; ///< Active core, only used in single thread mode - - /// Map of guest threads to CPU cores - std::map<std::thread::id, Cpu*> thread_to_cpu; - - System& system; -}; - -} // namespace Core diff --git a/src/core/cpu_manager.cpp b/src/core/cpu_manager.cpp new file mode 100644 index 000000000..70ddbdcca --- /dev/null +++ b/src/core/cpu_manager.cpp @@ -0,0 +1,81 @@ +// Copyright 2018 yuzu emulator team +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include "core/arm/exclusive_monitor.h" +#include "core/core.h" +#include "core/core_manager.h" +#include "core/core_timing.h" +#include "core/cpu_manager.h" +#include "core/gdbstub/gdbstub.h" + +namespace Core { + +CpuManager::CpuManager(System& system) : system{system} {} +CpuManager::~CpuManager() = default; + +void CpuManager::Initialize() { + for (std::size_t index = 0; index < core_managers.size(); ++index) { + core_managers[index] = std::make_unique<CoreManager>(system, index); + } +} + +void CpuManager::Shutdown() { + for (auto& cpu_core : core_managers) { + cpu_core.reset(); + } +} + +CoreManager& CpuManager::GetCoreManager(std::size_t index) { + return *core_managers.at(index); +} + +const CoreManager& CpuManager::GetCoreManager(std::size_t index) const { + return *core_managers.at(index); +} + +CoreManager& CpuManager::GetCurrentCoreManager() { + // Otherwise, use single-threaded mode active_core variable + return *core_managers[active_core]; +} + +const CoreManager& CpuManager::GetCurrentCoreManager() const { + // Otherwise, use single-threaded mode active_core variable + return *core_managers[active_core]; +} + +void CpuManager::RunLoop(bool tight_loop) { + if (GDBStub::IsServerEnabled()) { + GDBStub::HandlePacket(); + + // If the loop is halted and we want to step, use a tiny (1) number of instructions to + // execute. Otherwise, get out of the loop function. + if (GDBStub::GetCpuHaltFlag()) { + if (GDBStub::GetCpuStepFlag()) { + tight_loop = false; + } else { + return; + } + } + } + + auto& core_timing = system.CoreTiming(); + core_timing.ResetRun(); + bool keep_running{}; + do { + keep_running = false; + for (active_core = 0; active_core < NUM_CPU_CORES; ++active_core) { + core_timing.SwitchContext(active_core); + if (core_timing.CanCurrentContextRun()) { + core_managers[active_core]->RunLoop(tight_loop); + } + keep_running |= core_timing.CanCurrentContextRun(); + } + } while (keep_running); + + if (GDBStub::IsServerEnabled()) { + GDBStub::SetCpuStepFlag(false); + } +} + +} // namespace Core diff --git a/src/core/cpu_manager.h b/src/core/cpu_manager.h new file mode 100644 index 000000000..97554d1bb --- /dev/null +++ b/src/core/cpu_manager.h @@ -0,0 +1,49 @@ +// Copyright 2018 yuzu emulator team +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include <array> +#include <memory> +#include "core/hardware_properties.h" + +namespace Core { + +class CoreManager; +class System; + +class CpuManager { +public: + explicit CpuManager(System& system); + CpuManager(const CpuManager&) = delete; + CpuManager(CpuManager&&) = delete; + + ~CpuManager(); + + CpuManager& operator=(const CpuManager&) = delete; + CpuManager& operator=(CpuManager&&) = delete; + + void Initialize(); + void Shutdown(); + + CoreManager& GetCoreManager(std::size_t index); + const CoreManager& GetCoreManager(std::size_t index) const; + + CoreManager& GetCurrentCoreManager(); + const CoreManager& GetCurrentCoreManager() const; + + std::size_t GetActiveCoreIndex() const { + return active_core; + } + + void RunLoop(bool tight_loop); + +private: + std::array<std::unique_ptr<CoreManager>, Hardware::NUM_CPU_CORES> core_managers; + std::size_t active_core{}; ///< Active core, only used in single thread mode + + System& system; +}; + +} // namespace Core diff --git a/src/core/frontend/emu_window.h b/src/core/frontend/emu_window.h index 4a9912641..3376eedc5 100644 --- a/src/core/frontend/emu_window.h +++ b/src/core/frontend/emu_window.h @@ -75,6 +75,13 @@ public: return nullptr; } + /// Returns if window is shown (not minimized) + virtual bool IsShown() const = 0; + + /// Retrieves Vulkan specific handlers from the window + virtual void RetrieveVulkanHandlers(void* get_instance_proc_addr, void* instance, + void* surface) const = 0; + /** * Signal that a touch pressed event has occurred (e.g. mouse click pressed) * @param framebuffer_x Framebuffer x-coordinate that was pressed diff --git a/src/core/frontend/framebuffer_layout.cpp b/src/core/frontend/framebuffer_layout.cpp index d6d2cf3f0..2dc795d56 100644 --- a/src/core/frontend/framebuffer_layout.cpp +++ b/src/core/frontend/framebuffer_layout.cpp @@ -27,9 +27,9 @@ FramebufferLayout DefaultFrameLayout(u32 width, u32 height) { // so just calculate them both even if the other isn't showing. 
FramebufferLayout res{width, height}; - const float emulation_aspect_ratio{static_cast<float>(ScreenUndocked::Height) / - ScreenUndocked::Width}; - const auto window_aspect_ratio = static_cast<float>(height) / width; + const float window_aspect_ratio = static_cast<float>(height) / width; + const float emulation_aspect_ratio = EmulationAspectRatio( + static_cast<AspectRatio>(Settings::values.aspect_ratio), window_aspect_ratio); const Common::Rectangle<u32> screen_window_area{0, 0, width, height}; Common::Rectangle<u32> screen = MaxRectangle(screen_window_area, emulation_aspect_ratio); @@ -58,4 +58,19 @@ FramebufferLayout FrameLayoutFromResolutionScale(u32 res_scale) { return DefaultFrameLayout(width, height); } +float EmulationAspectRatio(AspectRatio aspect, float window_aspect_ratio) { + switch (aspect) { + case AspectRatio::Default: + return static_cast<float>(ScreenUndocked::Height) / ScreenUndocked::Width; + case AspectRatio::R4_3: + return 3.0f / 4.0f; + case AspectRatio::R21_9: + return 9.0f / 21.0f; + case AspectRatio::StretchToWindow: + return window_aspect_ratio; + default: + return static_cast<float>(ScreenUndocked::Height) / ScreenUndocked::Width; + } +} + } // namespace Layout diff --git a/src/core/frontend/framebuffer_layout.h b/src/core/frontend/framebuffer_layout.h index d2370adde..1d39c1faf 100644 --- a/src/core/frontend/framebuffer_layout.h +++ b/src/core/frontend/framebuffer_layout.h @@ -18,6 +18,13 @@ enum ScreenDocked : u32 { HeightDocked = 1080, }; +enum class AspectRatio { + Default, + R4_3, + R21_9, + StretchToWindow, +}; + /// Describes the layout of the window framebuffer struct FramebufferLayout { u32 width{ScreenUndocked::Width}; @@ -48,4 +55,12 @@ FramebufferLayout DefaultFrameLayout(u32 width, u32 height); */ FramebufferLayout FrameLayoutFromResolutionScale(u32 res_scale); +/** + * Convenience method to determine emulation aspect ratio + * @param aspect Represents the index of aspect ratio stored in Settings::values.aspect_ratio + * @param window_aspect_ratio Current window aspect ratio + * @return Emulation render window aspect ratio + */ +float EmulationAspectRatio(AspectRatio aspect, float window_aspect_ratio); + } // namespace Layout diff --git a/src/core/frontend/input.h b/src/core/frontend/input.h index 7c11d7546..2b098b7c6 100644 --- a/src/core/frontend/input.h +++ b/src/core/frontend/input.h @@ -15,6 +15,13 @@ namespace Input { +enum class AnalogDirection : u8 { + RIGHT, + LEFT, + UP, + DOWN, +}; + /// An abstract class template for an input device (a button, an analog input, etc.). template <typename StatusType> class InputDevice { @@ -23,6 +30,9 @@ public: virtual StatusType GetStatus() const { return {}; } + virtual bool GetAnalogDirectionStatus(AnalogDirection direction) const { + return {}; + } }; /// An abstract class template for a factory that can create input devices. 
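The frontend changes above add a user-selectable aspect ratio: framebuffer_layout.h gains the Layout::AspectRatio enum, and DefaultFrameLayout() now derives its emulation ratio from the new EmulationAspectRatio() helper instead of hard-coding the undocked screen shape. As a rough, self-contained sketch of how that helper behaves (the enum values and ratios are copied from the diff; the main() harness, the sample window size, and the inlined 1280x720 undocked-screen numbers are illustrative only), consider:

// Standalone mirror of the new Layout::EmulationAspectRatio() logic, for illustration only.
#include <cstdio>

enum class AspectRatio { Default, R4_3, R21_9, StretchToWindow };

// Returns a height/width ratio, matching the convention used by DefaultFrameLayout().
float EmulationAspectRatio(AspectRatio aspect, float window_aspect_ratio) {
    switch (aspect) {
    case AspectRatio::Default:
        return 720.0f / 1280.0f; // handheld (undocked) screen shape
    case AspectRatio::R4_3:
        return 3.0f / 4.0f;
    case AspectRatio::R21_9:
        return 9.0f / 21.0f;
    case AspectRatio::StretchToWindow:
        return window_aspect_ratio; // follow whatever shape the host window has
    default:
        return 720.0f / 1280.0f;
    }
}

int main() {
    const float window_ratio = 1080.0f / 2560.0f; // e.g. a 2560x1080 ultrawide window
    std::printf("Default         -> %.4f\n", EmulationAspectRatio(AspectRatio::Default, window_ratio));
    std::printf("21:9            -> %.4f\n", EmulationAspectRatio(AspectRatio::R21_9, window_ratio));
    std::printf("StretchToWindow -> %.4f\n", EmulationAspectRatio(AspectRatio::StretchToWindow, window_ratio));
    return 0;
}

Only StretchToWindow depends on the window's own ratio; every other option yields a fixed height/width value, which DefaultFrameLayout() then fits into the window area via MaxRectangle().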
diff --git a/src/core/gdbstub/gdbstub.cpp b/src/core/gdbstub/gdbstub.cpp index 37cb28848..67e95999d 100644 --- a/src/core/gdbstub/gdbstub.cpp +++ b/src/core/gdbstub/gdbstub.cpp @@ -35,7 +35,7 @@ #include "common/swap.h" #include "core/arm/arm_interface.h" #include "core/core.h" -#include "core/core_cpu.h" +#include "core/core_manager.h" #include "core/gdbstub/gdbstub.h" #include "core/hle/kernel/process.h" #include "core/hle/kernel/scheduler.h" diff --git a/src/core/hardware_properties.h b/src/core/hardware_properties.h new file mode 100644 index 000000000..213461b6a --- /dev/null +++ b/src/core/hardware_properties.h @@ -0,0 +1,45 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <tuple> + +#include "common/common_types.h" + +namespace Core { + +namespace Hardware { + +// The below clock rate is based on Switch's clockspeed being widely known as 1.020GHz +// The exact value used is of course unverified. +constexpr u64 BASE_CLOCK_RATE = 1019215872; // Switch cpu frequency is 1020MHz un/docked +constexpr u64 CNTFREQ = 19200000; // Switch's hardware clock speed +constexpr u32 NUM_CPU_CORES = 4; // Number of CPU Cores + +} // namespace Hardware + +struct EmuThreadHandle { + u32 host_handle; + u32 guest_handle; + + u64 GetRaw() const { + return (static_cast<u64>(host_handle) << 32) | guest_handle; + } + + bool operator==(const EmuThreadHandle& rhs) const { + return std::tie(host_handle, guest_handle) == std::tie(rhs.host_handle, rhs.guest_handle); + } + + bool operator!=(const EmuThreadHandle& rhs) const { + return !operator==(rhs); + } + + static constexpr EmuThreadHandle InvalidHandle() { + constexpr u32 invalid_handle = 0xFFFFFFFF; + return {invalid_handle, invalid_handle}; + } +}; + +} // namespace Core diff --git a/src/core/hle/kernel/address_arbiter.cpp b/src/core/hle/kernel/address_arbiter.cpp index db189c8e3..8475b698c 100644 --- a/src/core/hle/kernel/address_arbiter.cpp +++ b/src/core/hle/kernel/address_arbiter.cpp @@ -8,7 +8,6 @@ #include "common/assert.h" #include "common/common_types.h" #include "core/core.h" -#include "core/core_cpu.h" #include "core/hle/kernel/address_arbiter.h" #include "core/hle/kernel/errors.h" #include "core/hle/kernel/scheduler.h" @@ -202,42 +201,39 @@ void AddressArbiter::HandleWakeupThread(std::shared_ptr<Thread> thread) { void AddressArbiter::InsertThread(std::shared_ptr<Thread> thread) { const VAddr arb_addr = thread->GetArbiterWaitAddress(); std::list<std::shared_ptr<Thread>>& thread_list = arb_threads[arb_addr]; - auto it = thread_list.begin(); - while (it != thread_list.end()) { - const std::shared_ptr<Thread>& current_thread = *it; - if (current_thread->GetPriority() >= thread->GetPriority()) { - thread_list.insert(it, thread); - return; - } - ++it; + + const auto iter = + std::find_if(thread_list.cbegin(), thread_list.cend(), [&thread](const auto& entry) { + return entry->GetPriority() >= thread->GetPriority(); + }); + + if (iter == thread_list.cend()) { + thread_list.push_back(std::move(thread)); + } else { + thread_list.insert(iter, std::move(thread)); } - thread_list.push_back(std::move(thread)); } void AddressArbiter::RemoveThread(std::shared_ptr<Thread> thread) { const VAddr arb_addr = thread->GetArbiterWaitAddress(); std::list<std::shared_ptr<Thread>>& thread_list = arb_threads[arb_addr]; - auto it = thread_list.begin(); - while (it != thread_list.end()) { - const std::shared_ptr<Thread>& current_thread = *it; - if 
(current_thread.get() == thread.get()) { - thread_list.erase(it); - return; - } - ++it; - } - UNREACHABLE(); + + const auto iter = std::find_if(thread_list.cbegin(), thread_list.cend(), + [&thread](const auto& entry) { return thread == entry; }); + + ASSERT(iter != thread_list.cend()); + + thread_list.erase(iter); } -std::vector<std::shared_ptr<Thread>> AddressArbiter::GetThreadsWaitingOnAddress(VAddr address) { - std::vector<std::shared_ptr<Thread>> result; - std::list<std::shared_ptr<Thread>>& thread_list = arb_threads[address]; - auto it = thread_list.begin(); - while (it != thread_list.end()) { - std::shared_ptr<Thread> current_thread = *it; - result.push_back(std::move(current_thread)); - ++it; +std::vector<std::shared_ptr<Thread>> AddressArbiter::GetThreadsWaitingOnAddress( + VAddr address) const { + const auto iter = arb_threads.find(address); + if (iter == arb_threads.cend()) { + return {}; } - return result; + + const std::list<std::shared_ptr<Thread>>& thread_list = iter->second; + return {thread_list.cbegin(), thread_list.cend()}; } } // namespace Kernel diff --git a/src/core/hle/kernel/address_arbiter.h b/src/core/hle/kernel/address_arbiter.h index 386983e54..f958eee5a 100644 --- a/src/core/hle/kernel/address_arbiter.h +++ b/src/core/hle/kernel/address_arbiter.h @@ -86,7 +86,7 @@ private: void RemoveThread(std::shared_ptr<Thread> thread); // Gets the threads waiting on an address. - std::vector<std::shared_ptr<Thread>> GetThreadsWaitingOnAddress(VAddr address); + std::vector<std::shared_ptr<Thread>> GetThreadsWaitingOnAddress(VAddr address) const; /// List of threads waiting for a address arbiter std::unordered_map<VAddr, std::list<std::shared_ptr<Thread>>> arb_threads; diff --git a/src/core/hle/kernel/client_session.cpp b/src/core/hle/kernel/client_session.cpp index 4669a14ad..6d66276bc 100644 --- a/src/core/hle/kernel/client_session.cpp +++ b/src/core/hle/kernel/client_session.cpp @@ -12,7 +12,7 @@ namespace Kernel { -ClientSession::ClientSession(KernelCore& kernel) : WaitObject{kernel} {} +ClientSession::ClientSession(KernelCore& kernel) : SynchronizationObject{kernel} {} ClientSession::~ClientSession() { // This destructor will be called automatically when the last ClientSession handle is closed by @@ -31,6 +31,11 @@ void ClientSession::Acquire(Thread* thread) { UNIMPLEMENTED(); } +bool ClientSession::IsSignaled() const { + UNIMPLEMENTED(); + return true; +} + ResultVal<std::shared_ptr<ClientSession>> ClientSession::Create(KernelCore& kernel, std::shared_ptr<Session> parent, std::string name) { diff --git a/src/core/hle/kernel/client_session.h b/src/core/hle/kernel/client_session.h index b4289a9a8..d15b09554 100644 --- a/src/core/hle/kernel/client_session.h +++ b/src/core/hle/kernel/client_session.h @@ -7,7 +7,7 @@ #include <memory> #include <string> -#include "core/hle/kernel/wait_object.h" +#include "core/hle/kernel/synchronization_object.h" #include "core/hle/result.h" union ResultCode; @@ -22,7 +22,7 @@ class KernelCore; class Session; class Thread; -class ClientSession final : public WaitObject { +class ClientSession final : public SynchronizationObject { public: explicit ClientSession(KernelCore& kernel); ~ClientSession() override; @@ -48,6 +48,8 @@ public: void Acquire(Thread* thread) override; + bool IsSignaled() const override; + private: static ResultVal<std::shared_ptr<ClientSession>> Create(KernelCore& kernel, std::shared_ptr<Session> parent, diff --git a/src/core/hle/kernel/hle_ipc.cpp b/src/core/hle/kernel/hle_ipc.cpp index 2db28dcf0..c558a2f33 100644 --- 
a/src/core/hle/kernel/hle_ipc.cpp +++ b/src/core/hle/kernel/hle_ipc.cpp @@ -47,15 +47,15 @@ std::shared_ptr<WritableEvent> HLERequestContext::SleepClientThread( const std::string& reason, u64 timeout, WakeupCallback&& callback, std::shared_ptr<WritableEvent> writable_event) { // Put the client thread to sleep until the wait event is signaled or the timeout expires. - thread->SetWakeupCallback([context = *this, callback](ThreadWakeupReason reason, - std::shared_ptr<Thread> thread, - std::shared_ptr<WaitObject> object, - std::size_t index) mutable -> bool { - ASSERT(thread->GetStatus() == ThreadStatus::WaitHLEEvent); - callback(thread, context, reason); - context.WriteToOutgoingCommandBuffer(*thread); - return true; - }); + thread->SetWakeupCallback( + [context = *this, callback](ThreadWakeupReason reason, std::shared_ptr<Thread> thread, + std::shared_ptr<SynchronizationObject> object, + std::size_t index) mutable -> bool { + ASSERT(thread->GetStatus() == ThreadStatus::WaitHLEEvent); + callback(thread, context, reason); + context.WriteToOutgoingCommandBuffer(*thread); + return true; + }); auto& kernel = Core::System::GetInstance().Kernel(); if (!writable_event) { @@ -67,7 +67,7 @@ std::shared_ptr<WritableEvent> HLERequestContext::SleepClientThread( const auto readable_event{writable_event->GetReadableEvent()}; writable_event->Clear(); thread->SetStatus(ThreadStatus::WaitHLEEvent); - thread->SetWaitObjects({readable_event}); + thread->SetSynchronizationObjects({readable_event}); readable_event->AddWaitingThread(thread); if (timeout > 0) { @@ -284,13 +284,18 @@ ResultCode HLERequestContext::WriteToOutgoingCommandBuffer(Thread& thread) { std::vector<u8> HLERequestContext::ReadBuffer(int buffer_index) const { std::vector<u8> buffer; - const bool is_buffer_a{BufferDescriptorA().size() && BufferDescriptorA()[buffer_index].Size()}; + const bool is_buffer_a{BufferDescriptorA().size() > buffer_index && + BufferDescriptorA()[buffer_index].Size()}; auto& memory = Core::System::GetInstance().Memory(); if (is_buffer_a) { + ASSERT_MSG(BufferDescriptorA().size() > buffer_index, + "BufferDescriptorA invalid buffer_index {}", buffer_index); buffer.resize(BufferDescriptorA()[buffer_index].Size()); memory.ReadBlock(BufferDescriptorA()[buffer_index].Address(), buffer.data(), buffer.size()); } else { + ASSERT_MSG(BufferDescriptorX().size() > buffer_index, + "BufferDescriptorX invalid buffer_index {}", buffer_index); buffer.resize(BufferDescriptorX()[buffer_index].Size()); memory.ReadBlock(BufferDescriptorX()[buffer_index].Address(), buffer.data(), buffer.size()); } @@ -305,7 +310,8 @@ std::size_t HLERequestContext::WriteBuffer(const void* buffer, std::size_t size, return 0; } - const bool is_buffer_b{BufferDescriptorB().size() && BufferDescriptorB()[buffer_index].Size()}; + const bool is_buffer_b{BufferDescriptorB().size() > buffer_index && + BufferDescriptorB()[buffer_index].Size()}; const std::size_t buffer_size{GetWriteBufferSize(buffer_index)}; if (size > buffer_size) { LOG_CRITICAL(Core, "size ({:016X}) is greater than buffer_size ({:016X})", size, @@ -315,8 +321,16 @@ std::size_t HLERequestContext::WriteBuffer(const void* buffer, std::size_t size, auto& memory = Core::System::GetInstance().Memory(); if (is_buffer_b) { + ASSERT_MSG(BufferDescriptorB().size() > buffer_index, + "BufferDescriptorB invalid buffer_index {}", buffer_index); + ASSERT_MSG(BufferDescriptorB()[buffer_index].Size() >= size, + "BufferDescriptorB buffer_index {} is not large enough", buffer_index); 
memory.WriteBlock(BufferDescriptorB()[buffer_index].Address(), buffer, size); } else { + ASSERT_MSG(BufferDescriptorC().size() > buffer_index, + "BufferDescriptorC invalid buffer_index {}", buffer_index); + ASSERT_MSG(BufferDescriptorC()[buffer_index].Size() >= size, + "BufferDescriptorC buffer_index {} is not large enough", buffer_index); memory.WriteBlock(BufferDescriptorC()[buffer_index].Address(), buffer, size); } @@ -324,15 +338,35 @@ std::size_t HLERequestContext::WriteBuffer(const void* buffer, std::size_t size, } std::size_t HLERequestContext::GetReadBufferSize(int buffer_index) const { - const bool is_buffer_a{BufferDescriptorA().size() && BufferDescriptorA()[buffer_index].Size()}; - return is_buffer_a ? BufferDescriptorA()[buffer_index].Size() - : BufferDescriptorX()[buffer_index].Size(); + const bool is_buffer_a{BufferDescriptorA().size() > buffer_index && + BufferDescriptorA()[buffer_index].Size()}; + if (is_buffer_a) { + ASSERT_MSG(BufferDescriptorA().size() > buffer_index, + "BufferDescriptorA invalid buffer_index {}", buffer_index); + ASSERT_MSG(BufferDescriptorA()[buffer_index].Size() > 0, + "BufferDescriptorA buffer_index {} is empty", buffer_index); + return BufferDescriptorA()[buffer_index].Size(); + } else { + ASSERT_MSG(BufferDescriptorX().size() > buffer_index, + "BufferDescriptorX invalid buffer_index {}", buffer_index); + ASSERT_MSG(BufferDescriptorX()[buffer_index].Size() > 0, + "BufferDescriptorX buffer_index {} is empty", buffer_index); + return BufferDescriptorX()[buffer_index].Size(); + } } std::size_t HLERequestContext::GetWriteBufferSize(int buffer_index) const { - const bool is_buffer_b{BufferDescriptorB().size() && BufferDescriptorB()[buffer_index].Size()}; - return is_buffer_b ? BufferDescriptorB()[buffer_index].Size() - : BufferDescriptorC()[buffer_index].Size(); + const bool is_buffer_b{BufferDescriptorB().size() > buffer_index && + BufferDescriptorB()[buffer_index].Size()}; + if (is_buffer_b) { + ASSERT_MSG(BufferDescriptorB().size() > buffer_index, + "BufferDescriptorB invalid buffer_index {}", buffer_index); + return BufferDescriptorB()[buffer_index].Size(); + } else { + ASSERT_MSG(BufferDescriptorC().size() > buffer_index, + "BufferDescriptorC invalid buffer_index {}", buffer_index); + return BufferDescriptorC()[buffer_index].Size(); + } } std::string HLERequestContext::Description() const { diff --git a/src/core/hle/kernel/kernel.cpp b/src/core/hle/kernel/kernel.cpp index 1d0783bd3..4eb1d8703 100644 --- a/src/core/hle/kernel/kernel.cpp +++ b/src/core/hle/kernel/kernel.cpp @@ -3,13 +3,15 @@ // Refer to the license.txt file included. 
#include <atomic> +#include <functional> #include <memory> #include <mutex> #include <utility> #include "common/assert.h" #include "common/logging/log.h" - +#include "core/arm/arm_interface.h" +#include "core/arm/exclusive_monitor.h" #include "core/core.h" #include "core/core_timing.h" #include "core/core_timing_util.h" @@ -17,9 +19,11 @@ #include "core/hle/kernel/errors.h" #include "core/hle/kernel/handle_table.h" #include "core/hle/kernel/kernel.h" +#include "core/hle/kernel/physical_core.h" #include "core/hle/kernel/process.h" #include "core/hle/kernel/resource_limit.h" #include "core/hle/kernel/scheduler.h" +#include "core/hle/kernel/synchronization.h" #include "core/hle/kernel/thread.h" #include "core/hle/lock.h" #include "core/hle/result.h" @@ -51,10 +55,10 @@ static void ThreadWakeupCallback(u64 thread_handle, [[maybe_unused]] s64 cycles_ if (thread->GetStatus() == ThreadStatus::WaitSynch || thread->GetStatus() == ThreadStatus::WaitHLEEvent) { // Remove the thread from each of its waiting objects' waitlists - for (const auto& object : thread->GetWaitObjects()) { + for (const auto& object : thread->GetSynchronizationObjects()) { object->RemoveWaitingThread(thread); } - thread->ClearWaitObjects(); + thread->ClearSynchronizationObjects(); // Invoke the wakeup callback before clearing the wait objects if (thread->HasWakeupCallback()) { @@ -93,11 +97,13 @@ static void ThreadWakeupCallback(u64 thread_handle, [[maybe_unused]] s64 cycles_ } struct KernelCore::Impl { - explicit Impl(Core::System& system) : system{system}, global_scheduler{system} {} + explicit Impl(Core::System& system) + : system{system}, global_scheduler{system}, synchronization{system} {} void Initialize(KernelCore& kernel) { Shutdown(); + InitializePhysicalCores(); InitializeSystemResourceLimit(kernel); InitializeThreads(); InitializePreemption(); @@ -121,6 +127,21 @@ struct KernelCore::Impl { global_scheduler.Shutdown(); named_ports.clear(); + + for (auto& core : cores) { + core.Shutdown(); + } + cores.clear(); + + exclusive_monitor.reset(); + } + + void InitializePhysicalCores() { + exclusive_monitor = + Core::MakeExclusiveMonitor(system.Memory(), global_scheduler.CpuCoresCount()); + for (std::size_t i = 0; i < global_scheduler.CpuCoresCount(); i++) { + cores.emplace_back(system, i, *exclusive_monitor); + } } // Creates the default system resource limit @@ -172,6 +193,7 @@ struct KernelCore::Impl { std::vector<std::shared_ptr<Process>> process_list; Process* current_process = nullptr; Kernel::GlobalScheduler global_scheduler; + Kernel::Synchronization synchronization; std::shared_ptr<ResourceLimit> system_resource_limit; @@ -186,6 +208,9 @@ struct KernelCore::Impl { /// the ConnectToPort SVC. 
NamedPortTable named_ports; + std::unique_ptr<Core::ExclusiveMonitor> exclusive_monitor; + std::vector<Kernel::PhysicalCore> cores; + // System context Core::System& system; }; @@ -240,6 +265,42 @@ const Kernel::GlobalScheduler& KernelCore::GlobalScheduler() const { return impl->global_scheduler; } +Kernel::PhysicalCore& KernelCore::PhysicalCore(std::size_t id) { + return impl->cores[id]; +} + +const Kernel::PhysicalCore& KernelCore::PhysicalCore(std::size_t id) const { + return impl->cores[id]; +} + +Kernel::Synchronization& KernelCore::Synchronization() { + return impl->synchronization; +} + +const Kernel::Synchronization& KernelCore::Synchronization() const { + return impl->synchronization; +} + +Core::ExclusiveMonitor& KernelCore::GetExclusiveMonitor() { + return *impl->exclusive_monitor; +} + +const Core::ExclusiveMonitor& KernelCore::GetExclusiveMonitor() const { + return *impl->exclusive_monitor; +} + +void KernelCore::InvalidateAllInstructionCaches() { + for (std::size_t i = 0; i < impl->global_scheduler.CpuCoresCount(); i++) { + PhysicalCore(i).ArmInterface().ClearInstructionCache(); + } +} + +void KernelCore::PrepareReschedule(std::size_t id) { + if (id < impl->global_scheduler.CpuCoresCount()) { + impl->cores[id].Stop(); + } +} + void KernelCore::AddNamedPort(std::string name, std::shared_ptr<ClientPort> port) { impl->named_ports.emplace(std::move(name), std::move(port)); } diff --git a/src/core/hle/kernel/kernel.h b/src/core/hle/kernel/kernel.h index 3bf0068ed..1eede3063 100644 --- a/src/core/hle/kernel/kernel.h +++ b/src/core/hle/kernel/kernel.h @@ -11,8 +11,9 @@ #include "core/hle/kernel/object.h" namespace Core { +class ExclusiveMonitor; class System; -} +} // namespace Core namespace Core::Timing { class CoreTiming; @@ -25,8 +26,10 @@ class AddressArbiter; class ClientPort; class GlobalScheduler; class HandleTable; +class PhysicalCore; class Process; class ResourceLimit; +class Synchronization; class Thread; /// Represents a single instance of the kernel. @@ -84,6 +87,27 @@ public: /// Gets the sole instance of the global scheduler const Kernel::GlobalScheduler& GlobalScheduler() const; + /// Gets an instance of the respective physical CPU core. + Kernel::PhysicalCore& PhysicalCore(std::size_t id); + + /// Gets an instance of the respective physical CPU core. + const Kernel::PhysicalCore& PhysicalCore(std::size_t id) const; + + /// Gets an instance of the Synchronization interface. + Kernel::Synchronization& Synchronization(); + + /// Gets an instance of the Synchronization interface. + const Kernel::Synchronization& Synchronization() const; + + /// Stops execution of core 'id', in order to reschedule a new thread. + void PrepareReschedule(std::size_t id); + + Core::ExclusiveMonitor& GetExclusiveMonitor(); + + const Core::ExclusiveMonitor& GetExclusiveMonitor() const; + + void InvalidateAllInstructionCaches(); + + /// Adds a port to the named port table void AddNamedPort(std::string name, std::shared_ptr<ClientPort> port); diff --git a/src/core/hle/kernel/physical_core.cpp b/src/core/hle/kernel/physical_core.cpp new file mode 100644 index 000000000..9303dd273 --- /dev/null +++ b/src/core/hle/kernel/physical_core.cpp @@ -0,0 +1,51 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included.
+ +#include "common/logging/log.h" +#include "core/arm/arm_interface.h" +#ifdef ARCHITECTURE_x86_64 +#include "core/arm/dynarmic/arm_dynarmic.h" +#endif +#include "core/arm/exclusive_monitor.h" +#include "core/arm/unicorn/arm_unicorn.h" +#include "core/core.h" +#include "core/hle/kernel/physical_core.h" +#include "core/hle/kernel/scheduler.h" +#include "core/hle/kernel/thread.h" + +namespace Kernel { + +PhysicalCore::PhysicalCore(Core::System& system, std::size_t id, + Core::ExclusiveMonitor& exclusive_monitor) + : core_index{id} { +#ifdef ARCHITECTURE_x86_64 + arm_interface = std::make_unique<Core::ARM_Dynarmic>(system, exclusive_monitor, core_index); +#else + arm_interface = std::make_shared<Core::ARM_Unicorn>(system); + LOG_WARNING(Core, "CPU JIT requested, but Dynarmic not available"); +#endif + + scheduler = std::make_unique<Kernel::Scheduler>(system, *arm_interface, core_index); +} + +PhysicalCore::~PhysicalCore() = default; + +void PhysicalCore::Run() { + arm_interface->Run(); + arm_interface->ClearExclusiveState(); +} + +void PhysicalCore::Step() { + arm_interface->Step(); +} + +void PhysicalCore::Stop() { + arm_interface->PrepareReschedule(); +} + +void PhysicalCore::Shutdown() { + scheduler->Shutdown(); +} + +} // namespace Kernel diff --git a/src/core/hle/kernel/physical_core.h b/src/core/hle/kernel/physical_core.h new file mode 100644 index 000000000..4c32c0f1b --- /dev/null +++ b/src/core/hle/kernel/physical_core.h @@ -0,0 +1,77 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <cstddef> +#include <memory> + +namespace Kernel { +class Scheduler; +} // namespace Kernel + +namespace Core { +class ARM_Interface; +class ExclusiveMonitor; +class System; +} // namespace Core + +namespace Kernel { + +class PhysicalCore { +public: + PhysicalCore(Core::System& system, std::size_t id, Core::ExclusiveMonitor& exclusive_monitor); + ~PhysicalCore(); + + PhysicalCore(const PhysicalCore&) = delete; + PhysicalCore& operator=(const PhysicalCore&) = delete; + + PhysicalCore(PhysicalCore&&) = default; + PhysicalCore& operator=(PhysicalCore&&) = default; + + /// Execute current jit state + void Run(); + /// Execute a single instruction in current jit. + void Step(); + /// Stop JIT execution/exit + void Stop(); + + // Shutdown this physical core. 
+ void Shutdown(); + + Core::ARM_Interface& ArmInterface() { + return *arm_interface; + } + + const Core::ARM_Interface& ArmInterface() const { + return *arm_interface; + } + + bool IsMainCore() const { + return core_index == 0; + } + + bool IsSystemCore() const { + return core_index == 3; + } + + std::size_t CoreIndex() const { + return core_index; + } + + Kernel::Scheduler& Scheduler() { + return *scheduler; + } + + const Kernel::Scheduler& Scheduler() const { + return *scheduler; + } + +private: + std::size_t core_index; + std::unique_ptr<Core::ARM_Interface> arm_interface; + std::unique_ptr<Kernel::Scheduler> scheduler; +}; + +} // namespace Kernel diff --git a/src/core/hle/kernel/process.cpp b/src/core/hle/kernel/process.cpp index b9035a0be..2fcb7326c 100644 --- a/src/core/hle/kernel/process.cpp +++ b/src/core/hle/kernel/process.cpp @@ -337,7 +337,7 @@ void Process::LoadModule(CodeSet module_, VAddr base_addr) { } Process::Process(Core::System& system) - : WaitObject{system.Kernel()}, vm_manager{system}, + : SynchronizationObject{system.Kernel()}, vm_manager{system}, address_arbiter{system}, mutex{system}, system{system} {} Process::~Process() = default; @@ -357,7 +357,7 @@ void Process::ChangeStatus(ProcessStatus new_status) { status = new_status; is_signaled = true; - WakeupAllWaitingThreads(); + Signal(); } void Process::AllocateMainThreadStack(u64 stack_size) { diff --git a/src/core/hle/kernel/process.h b/src/core/hle/kernel/process.h index 3483fa19d..4887132a7 100644 --- a/src/core/hle/kernel/process.h +++ b/src/core/hle/kernel/process.h @@ -15,8 +15,8 @@ #include "core/hle/kernel/handle_table.h" #include "core/hle/kernel/mutex.h" #include "core/hle/kernel/process_capability.h" +#include "core/hle/kernel/synchronization_object.h" #include "core/hle/kernel/vm_manager.h" -#include "core/hle/kernel/wait_object.h" #include "core/hle/result.h" namespace Core { @@ -60,7 +60,7 @@ enum class ProcessStatus { DebugBreak, }; -class Process final : public WaitObject { +class Process final : public SynchronizationObject { public: explicit Process(Core::System& system); ~Process() override; @@ -359,10 +359,6 @@ private: /// specified by metadata provided to the process during loading. bool is_64bit_process = true; - /// Whether or not this process is signaled. This occurs - /// upon the process changing to a different state. - bool is_signaled = false; - /// Total running time for the process in ticks. 
u64 total_process_running_time_ticks = 0; diff --git a/src/core/hle/kernel/readable_event.cpp b/src/core/hle/kernel/readable_event.cpp index d8ac97aa1..9d3d3a81b 100644 --- a/src/core/hle/kernel/readable_event.cpp +++ b/src/core/hle/kernel/readable_event.cpp @@ -11,30 +11,30 @@ namespace Kernel { -ReadableEvent::ReadableEvent(KernelCore& kernel) : WaitObject{kernel} {} +ReadableEvent::ReadableEvent(KernelCore& kernel) : SynchronizationObject{kernel} {} ReadableEvent::~ReadableEvent() = default; bool ReadableEvent::ShouldWait(const Thread* thread) const { - return !signaled; + return !is_signaled; } void ReadableEvent::Acquire(Thread* thread) { - ASSERT_MSG(!ShouldWait(thread), "object unavailable!"); + ASSERT_MSG(IsSignaled(), "object unavailable!"); } void ReadableEvent::Signal() { - if (!signaled) { - signaled = true; - WakeupAllWaitingThreads(); + if (!is_signaled) { + is_signaled = true; + SynchronizationObject::Signal(); }; } void ReadableEvent::Clear() { - signaled = false; + is_signaled = false; } ResultCode ReadableEvent::Reset() { - if (!signaled) { + if (!is_signaled) { return ERR_INVALID_STATE; } diff --git a/src/core/hle/kernel/readable_event.h b/src/core/hle/kernel/readable_event.h index 11ff71c3a..3264dd066 100644 --- a/src/core/hle/kernel/readable_event.h +++ b/src/core/hle/kernel/readable_event.h @@ -5,7 +5,7 @@ #pragma once #include "core/hle/kernel/object.h" -#include "core/hle/kernel/wait_object.h" +#include "core/hle/kernel/synchronization_object.h" union ResultCode; @@ -14,7 +14,7 @@ namespace Kernel { class KernelCore; class WritableEvent; -class ReadableEvent final : public WaitObject { +class ReadableEvent final : public SynchronizationObject { friend class WritableEvent; public: @@ -46,13 +46,11 @@ public: /// then ERR_INVALID_STATE will be returned. ResultCode Reset(); + void Signal() override; + private: explicit ReadableEvent(KernelCore& kernel); - void Signal(); - - bool signaled{}; - std::string name; ///< Name of event (optional) }; diff --git a/src/core/hle/kernel/scheduler.cpp b/src/core/hle/kernel/scheduler.cpp index d36fcd7d9..86f1421bf 100644 --- a/src/core/hle/kernel/scheduler.cpp +++ b/src/core/hle/kernel/scheduler.cpp @@ -14,7 +14,6 @@ #include "common/logging/log.h" #include "core/arm/arm_interface.h" #include "core/core.h" -#include "core/core_cpu.h" #include "core/core_timing.h" #include "core/hle/kernel/kernel.h" #include "core/hle/kernel/process.h" @@ -125,8 +124,8 @@ bool GlobalScheduler::YieldThreadAndBalanceLoad(Thread* yielding_thread) { "Thread yielding without being in front"); scheduled_queue[core_id].yield(priority); - std::array<Thread*, NUM_CPU_CORES> current_threads; - for (u32 i = 0; i < NUM_CPU_CORES; i++) { + std::array<Thread*, Core::Hardware::NUM_CPU_CORES> current_threads; + for (std::size_t i = 0; i < current_threads.size(); i++) { current_threads[i] = scheduled_queue[i].empty() ? nullptr : scheduled_queue[i].front(); } @@ -178,8 +177,8 @@ bool GlobalScheduler::YieldThreadAndWaitForLoadBalancing(Thread* yielding_thread // function... if (scheduled_queue[core_id].empty()) { // Here, "current_threads" is calculated after the ""yield"", unlike yield -1 - std::array<Thread*, NUM_CPU_CORES> current_threads; - for (u32 i = 0; i < NUM_CPU_CORES; i++) { + std::array<Thread*, Core::Hardware::NUM_CPU_CORES> current_threads; + for (std::size_t i = 0; i < current_threads.size(); i++) { current_threads[i] = scheduled_queue[i].empty() ? 
nullptr : scheduled_queue[i].front(); } for (auto& thread : suggested_queue[core_id]) { @@ -209,7 +208,7 @@ bool GlobalScheduler::YieldThreadAndWaitForLoadBalancing(Thread* yielding_thread } void GlobalScheduler::PreemptThreads() { - for (std::size_t core_id = 0; core_id < NUM_CPU_CORES; core_id++) { + for (std::size_t core_id = 0; core_id < Core::Hardware::NUM_CPU_CORES; core_id++) { const u32 priority = preemption_priorities[core_id]; if (scheduled_queue[core_id].size(priority) > 0) { @@ -350,7 +349,7 @@ bool GlobalScheduler::AskForReselectionOrMarkRedundant(Thread* current_thread, } void GlobalScheduler::Shutdown() { - for (std::size_t core = 0; core < NUM_CPU_CORES; core++) { + for (std::size_t core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) { scheduled_queue[core].clear(); suggested_queue[core].clear(); } diff --git a/src/core/hle/kernel/scheduler.h b/src/core/hle/kernel/scheduler.h index 14b77960a..96db049cb 100644 --- a/src/core/hle/kernel/scheduler.h +++ b/src/core/hle/kernel/scheduler.h @@ -10,6 +10,7 @@ #include "common/common_types.h" #include "common/multi_level_queue.h" +#include "core/hardware_properties.h" #include "core/hle/kernel/thread.h" namespace Core { @@ -23,8 +24,6 @@ class Process; class GlobalScheduler final { public: - static constexpr u32 NUM_CPU_CORES = 4; - explicit GlobalScheduler(Core::System& system); ~GlobalScheduler(); @@ -125,7 +124,7 @@ public: void PreemptThreads(); u32 CpuCoresCount() const { - return NUM_CPU_CORES; + return Core::Hardware::NUM_CPU_CORES; } void SetReselectionPending() { @@ -149,13 +148,15 @@ private: bool AskForReselectionOrMarkRedundant(Thread* current_thread, const Thread* winner); static constexpr u32 min_regular_priority = 2; - std::array<Common::MultiLevelQueue<Thread*, THREADPRIO_COUNT>, NUM_CPU_CORES> scheduled_queue; - std::array<Common::MultiLevelQueue<Thread*, THREADPRIO_COUNT>, NUM_CPU_CORES> suggested_queue; + std::array<Common::MultiLevelQueue<Thread*, THREADPRIO_COUNT>, Core::Hardware::NUM_CPU_CORES> + scheduled_queue; + std::array<Common::MultiLevelQueue<Thread*, THREADPRIO_COUNT>, Core::Hardware::NUM_CPU_CORES> + suggested_queue; std::atomic<bool> is_reselection_pending{false}; // The priority levels at which the global scheduler preempts threads every 10 ms. They are // ordered from Core 0 to Core 3. - std::array<u32, NUM_CPU_CORES> preemption_priorities = {59, 59, 59, 62}; + std::array<u32, Core::Hardware::NUM_CPU_CORES> preemption_priorities = {59, 59, 59, 62}; /// Lists all thread ids that aren't deleted/etc. 
std::vector<std::shared_ptr<Thread>> thread_list; diff --git a/src/core/hle/kernel/server_port.cpp b/src/core/hle/kernel/server_port.cpp index a4ccfa35e..a549ae9d7 100644 --- a/src/core/hle/kernel/server_port.cpp +++ b/src/core/hle/kernel/server_port.cpp @@ -13,7 +13,7 @@ namespace Kernel { -ServerPort::ServerPort(KernelCore& kernel) : WaitObject{kernel} {} +ServerPort::ServerPort(KernelCore& kernel) : SynchronizationObject{kernel} {} ServerPort::~ServerPort() = default; ResultVal<std::shared_ptr<ServerSession>> ServerPort::Accept() { @@ -39,6 +39,10 @@ void ServerPort::Acquire(Thread* thread) { ASSERT_MSG(!ShouldWait(thread), "object unavailable!"); } +bool ServerPort::IsSignaled() const { + return !pending_sessions.empty(); +} + ServerPort::PortPair ServerPort::CreatePortPair(KernelCore& kernel, u32 max_sessions, std::string name) { std::shared_ptr<ServerPort> server_port = std::make_shared<ServerPort>(kernel); diff --git a/src/core/hle/kernel/server_port.h b/src/core/hle/kernel/server_port.h index 8be8a75ea..41b191b86 100644 --- a/src/core/hle/kernel/server_port.h +++ b/src/core/hle/kernel/server_port.h @@ -10,7 +10,7 @@ #include <vector> #include "common/common_types.h" #include "core/hle/kernel/object.h" -#include "core/hle/kernel/wait_object.h" +#include "core/hle/kernel/synchronization_object.h" #include "core/hle/result.h" namespace Kernel { @@ -20,7 +20,7 @@ class KernelCore; class ServerSession; class SessionRequestHandler; -class ServerPort final : public WaitObject { +class ServerPort final : public SynchronizationObject { public: explicit ServerPort(KernelCore& kernel); ~ServerPort() override; @@ -82,6 +82,8 @@ public: bool ShouldWait(const Thread* thread) const override; void Acquire(Thread* thread) override; + bool IsSignaled() const override; + private: /// ServerSessions waiting to be accepted by the port std::vector<std::shared_ptr<ServerSession>> pending_sessions; diff --git a/src/core/hle/kernel/server_session.cpp b/src/core/hle/kernel/server_session.cpp index 7825e1ec4..4604e35c5 100644 --- a/src/core/hle/kernel/server_session.cpp +++ b/src/core/hle/kernel/server_session.cpp @@ -24,7 +24,7 @@ namespace Kernel { -ServerSession::ServerSession(KernelCore& kernel) : WaitObject{kernel} {} +ServerSession::ServerSession(KernelCore& kernel) : SynchronizationObject{kernel} {} ServerSession::~ServerSession() = default; ResultVal<std::shared_ptr<ServerSession>> ServerSession::Create(KernelCore& kernel, @@ -50,6 +50,16 @@ bool ServerSession::ShouldWait(const Thread* thread) const { return pending_requesting_threads.empty() || currently_handling != nullptr; } +bool ServerSession::IsSignaled() const { + // Closed sessions should never wait, an error will be returned from svcReplyAndReceive. + if (!parent->Client()) { + return true; + } + + // Wait if we have no pending requests, or if we're currently handling a request. + return !pending_requesting_threads.empty() && currently_handling == nullptr; +} + void ServerSession::Acquire(Thread* thread) { ASSERT_MSG(!ShouldWait(thread), "object unavailable!"); // We are now handling a request, pop it from the stack. 
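For context, the IsSignaled() overrides added above (ServerPort, ServerSession) feed into the Kernel::Synchronization::WaitFor interface introduced later in this changeset. The following is only an illustrative sketch of how a caller might wait on a set of synchronization objects through that interface; WaitOnObjects is a hypothetical helper, not part of this patch.

#include <memory>
#include <utility>
#include <vector>

#include "common/common_types.h"
#include "core/core.h"
#include "core/hle/kernel/kernel.h"
#include "core/hle/kernel/synchronization.h"
#include "core/hle/kernel/synchronization_object.h"

// Hypothetical helper (not part of this changeset): waits on a set of
// synchronization objects via the new Synchronization interface.
static std::pair<ResultCode, Kernel::Handle> WaitOnObjects(
    Core::System& system,
    std::vector<std::shared_ptr<Kernel::SynchronizationObject>>& objects, s64 timeout_ns) {
    // WaitFor acquires the first object whose IsSignaled() returns true, or
    // suspends the current thread until one signals or timeout_ns expires,
    // mirroring the rewritten svcWaitSynchronization path in this changeset.
    return system.Kernel().Synchronization().WaitFor(objects, timeout_ns);
}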
diff --git a/src/core/hle/kernel/server_session.h b/src/core/hle/kernel/server_session.h index d6e48109e..77e4f6721 100644 --- a/src/core/hle/kernel/server_session.h +++ b/src/core/hle/kernel/server_session.h @@ -10,7 +10,7 @@ #include <vector> #include "common/threadsafe_queue.h" -#include "core/hle/kernel/wait_object.h" +#include "core/hle/kernel/synchronization_object.h" #include "core/hle/result.h" namespace Memory { @@ -41,7 +41,7 @@ class Thread; * After the server replies to the request, the response is marshalled back to the caller's * TLS buffer and control is transferred back to it. */ -class ServerSession final : public WaitObject { +class ServerSession final : public SynchronizationObject { public: explicit ServerSession(KernelCore& kernel); ~ServerSession() override; @@ -73,6 +73,8 @@ public: return parent.get(); } + bool IsSignaled() const override; + /** * Sets the HLE handler for the session. This handler will be called to service IPC requests * instead of the regular IPC machinery. (The regular IPC machinery is currently not diff --git a/src/core/hle/kernel/session.cpp b/src/core/hle/kernel/session.cpp index dee6e2b72..e4dd53e24 100644 --- a/src/core/hle/kernel/session.cpp +++ b/src/core/hle/kernel/session.cpp @@ -9,7 +9,7 @@ namespace Kernel { -Session::Session(KernelCore& kernel) : WaitObject{kernel} {} +Session::Session(KernelCore& kernel) : SynchronizationObject{kernel} {} Session::~Session() = default; Session::SessionPair Session::Create(KernelCore& kernel, std::string name) { @@ -29,6 +29,11 @@ bool Session::ShouldWait(const Thread* thread) const { return {}; } +bool Session::IsSignaled() const { + UNIMPLEMENTED(); + return true; +} + void Session::Acquire(Thread* thread) { UNIMPLEMENTED(); } diff --git a/src/core/hle/kernel/session.h b/src/core/hle/kernel/session.h index 15a5ac15f..7cd9c0d77 100644 --- a/src/core/hle/kernel/session.h +++ b/src/core/hle/kernel/session.h @@ -8,7 +8,7 @@ #include <string> #include <utility> -#include "core/hle/kernel/wait_object.h" +#include "core/hle/kernel/synchronization_object.h" namespace Kernel { @@ -19,7 +19,7 @@ class ServerSession; * Parent structure to link the client and server endpoints of a session with their associated * client port. 
*/ -class Session final : public WaitObject { +class Session final : public SynchronizationObject { public: explicit Session(KernelCore& kernel); ~Session() override; @@ -39,6 +39,8 @@ public: bool ShouldWait(const Thread* thread) const override; + bool IsSignaled() const override; + void Acquire(Thread* thread) override; std::shared_ptr<ClientSession> Client() { diff --git a/src/core/hle/kernel/svc.cpp b/src/core/hle/kernel/svc.cpp index dbcdb0b88..fd91779a3 100644 --- a/src/core/hle/kernel/svc.cpp +++ b/src/core/hle/kernel/svc.cpp @@ -15,7 +15,7 @@ #include "common/string_util.h" #include "core/arm/exclusive_monitor.h" #include "core/core.h" -#include "core/core_cpu.h" +#include "core/core_manager.h" #include "core/core_timing.h" #include "core/core_timing_util.h" #include "core/hle/kernel/address_arbiter.h" @@ -32,6 +32,7 @@ #include "core/hle/kernel/shared_memory.h" #include "core/hle/kernel/svc.h" #include "core/hle/kernel/svc_wrap.h" +#include "core/hle/kernel/synchronization.h" #include "core/hle/kernel/thread.h" #include "core/hle/kernel/transfer_memory.h" #include "core/hle/kernel/writable_event.h" @@ -433,22 +434,6 @@ static ResultCode GetProcessId(Core::System& system, u64* process_id, Handle han return ERR_INVALID_HANDLE; } -/// Default thread wakeup callback for WaitSynchronization -static bool DefaultThreadWakeupCallback(ThreadWakeupReason reason, std::shared_ptr<Thread> thread, - std::shared_ptr<WaitObject> object, std::size_t index) { - ASSERT(thread->GetStatus() == ThreadStatus::WaitSynch); - - if (reason == ThreadWakeupReason::Timeout) { - thread->SetWaitSynchronizationResult(RESULT_TIMEOUT); - return true; - } - - ASSERT(reason == ThreadWakeupReason::Signal); - thread->SetWaitSynchronizationResult(RESULT_SUCCESS); - thread->SetWaitSynchronizationOutput(static_cast<u32>(index)); - return true; -}; - /// Wait for the given handles to synchronize, timeout after the specified nanoseconds static ResultCode WaitSynchronization(Core::System& system, Handle* index, VAddr handles_address, u64 handle_count, s64 nano_seconds) { @@ -472,14 +457,14 @@ static ResultCode WaitSynchronization(Core::System& system, Handle* index, VAddr } auto* const thread = system.CurrentScheduler().GetCurrentThread(); - - using ObjectPtr = Thread::ThreadWaitObjects::value_type; - Thread::ThreadWaitObjects objects(handle_count); - const auto& handle_table = system.Kernel().CurrentProcess()->GetHandleTable(); + auto& kernel = system.Kernel(); + using ObjectPtr = Thread::ThreadSynchronizationObjects::value_type; + Thread::ThreadSynchronizationObjects objects(handle_count); + const auto& handle_table = kernel.CurrentProcess()->GetHandleTable(); for (u64 i = 0; i < handle_count; ++i) { const Handle handle = memory.Read32(handles_address + i * sizeof(Handle)); - const auto object = handle_table.Get<WaitObject>(handle); + const auto object = handle_table.Get<SynchronizationObject>(handle); if (object == nullptr) { LOG_ERROR(Kernel_SVC, "Object is a nullptr"); @@ -488,47 +473,10 @@ static ResultCode WaitSynchronization(Core::System& system, Handle* index, VAddr objects[i] = object; } - - // Find the first object that is acquirable in the provided list of objects - auto itr = std::find_if(objects.begin(), objects.end(), [thread](const ObjectPtr& object) { - return !object->ShouldWait(thread); - }); - - if (itr != objects.end()) { - // We found a ready object, acquire it and set the result value - WaitObject* object = itr->get(); - object->Acquire(thread); - *index = 
static_cast<s32>(std::distance(objects.begin(), itr)); - return RESULT_SUCCESS; - } - - // No objects were ready to be acquired, prepare to suspend the thread. - - // If a timeout value of 0 was provided, just return the Timeout error code instead of - // suspending the thread. - if (nano_seconds == 0) { - return RESULT_TIMEOUT; - } - - if (thread->IsSyncCancelled()) { - thread->SetSyncCancelled(false); - return ERR_SYNCHRONIZATION_CANCELED; - } - - for (auto& object : objects) { - object->AddWaitingThread(SharedFrom(thread)); - } - - thread->SetWaitObjects(std::move(objects)); - thread->SetStatus(ThreadStatus::WaitSynch); - - // Create an event to wake the thread up after the specified nanosecond delay has passed - thread->WakeAfterDelay(nano_seconds); - thread->SetWakeupCallback(DefaultThreadWakeupCallback); - - system.PrepareReschedule(thread->GetProcessorID()); - - return RESULT_TIMEOUT; + auto& synchronization = kernel.Synchronization(); + const auto [result, handle_result] = synchronization.WaitFor(objects, nano_seconds); + *index = handle_result; + return result; } /// Resumes a thread waiting on WaitSynchronization @@ -1863,10 +1811,14 @@ static ResultCode CreateTransferMemory(Core::System& system, Handle* handle, VAd } auto& kernel = system.Kernel(); - auto transfer_mem_handle = TransferMemory::Create(kernel, addr, size, perms); + auto transfer_mem_handle = TransferMemory::Create(kernel, system.Memory(), addr, size, perms); + + if (const auto reserve_result{transfer_mem_handle->Reserve()}; reserve_result.IsError()) { + return reserve_result; + } auto& handle_table = kernel.CurrentProcess()->GetHandleTable(); - const auto result = handle_table.Create(std::move(transfer_mem_handle)); + const auto result{handle_table.Create(std::move(transfer_mem_handle))}; if (result.Failed()) { return result.Code(); } diff --git a/src/core/hle/kernel/synchronization.cpp b/src/core/hle/kernel/synchronization.cpp new file mode 100644 index 000000000..dc37fad1a --- /dev/null +++ b/src/core/hle/kernel/synchronization.cpp @@ -0,0 +1,87 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include "core/core.h" +#include "core/hle/kernel/errors.h" +#include "core/hle/kernel/handle_table.h" +#include "core/hle/kernel/kernel.h" +#include "core/hle/kernel/scheduler.h" +#include "core/hle/kernel/synchronization.h" +#include "core/hle/kernel/synchronization_object.h" +#include "core/hle/kernel/thread.h" + +namespace Kernel { + +/// Default thread wakeup callback for WaitSynchronization +static bool DefaultThreadWakeupCallback(ThreadWakeupReason reason, std::shared_ptr<Thread> thread, + std::shared_ptr<SynchronizationObject> object, + std::size_t index) { + ASSERT(thread->GetStatus() == ThreadStatus::WaitSynch); + + if (reason == ThreadWakeupReason::Timeout) { + thread->SetWaitSynchronizationResult(RESULT_TIMEOUT); + return true; + } + + ASSERT(reason == ThreadWakeupReason::Signal); + thread->SetWaitSynchronizationResult(RESULT_SUCCESS); + thread->SetWaitSynchronizationOutput(static_cast<u32>(index)); + return true; +} + +Synchronization::Synchronization(Core::System& system) : system{system} {} + +void Synchronization::SignalObject(SynchronizationObject& obj) const { + if (obj.IsSignaled()) { + obj.WakeupAllWaitingThreads(); + } +} + +std::pair<ResultCode, Handle> Synchronization::WaitFor( + std::vector<std::shared_ptr<SynchronizationObject>>& sync_objects, s64 nano_seconds) { + auto* const thread = system.CurrentScheduler().GetCurrentThread(); + // Find the first object that is acquirable in the provided list of objects + const auto itr = std::find_if(sync_objects.begin(), sync_objects.end(), + [thread](const std::shared_ptr<SynchronizationObject>& object) { + return object->IsSignaled(); + }); + + if (itr != sync_objects.end()) { + // We found a ready object, acquire it and set the result value + SynchronizationObject* object = itr->get(); + object->Acquire(thread); + const u32 index = static_cast<s32>(std::distance(sync_objects.begin(), itr)); + return {RESULT_SUCCESS, index}; + } + + // No objects were ready to be acquired, prepare to suspend the thread. + + // If a timeout value of 0 was provided, just return the Timeout error code instead of + // suspending the thread. + if (nano_seconds == 0) { + return {RESULT_TIMEOUT, InvalidHandle}; + } + + if (thread->IsSyncCancelled()) { + thread->SetSyncCancelled(false); + return {ERR_SYNCHRONIZATION_CANCELED, InvalidHandle}; + } + + for (auto& object : sync_objects) { + object->AddWaitingThread(SharedFrom(thread)); + } + + thread->SetSynchronizationObjects(std::move(sync_objects)); + thread->SetStatus(ThreadStatus::WaitSynch); + + // Create an event to wake the thread up after the specified nanosecond delay has passed + thread->WakeAfterDelay(nano_seconds); + thread->SetWakeupCallback(DefaultThreadWakeupCallback); + + system.PrepareReschedule(thread->GetProcessorID()); + + return {RESULT_TIMEOUT, InvalidHandle}; +} + +} // namespace Kernel diff --git a/src/core/hle/kernel/synchronization.h b/src/core/hle/kernel/synchronization.h new file mode 100644 index 000000000..379f4b1d3 --- /dev/null +++ b/src/core/hle/kernel/synchronization.h @@ -0,0 +1,44 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include <memory> +#include <utility> +#include <vector> + +#include "core/hle/kernel/object.h" +#include "core/hle/result.h" + +namespace Core { +class System; +} // namespace Core + +namespace Kernel { + +class SynchronizationObject; + +/** + * The 'Synchronization' class is an interface for handling synchronization methods + * used by Synchronization objects and synchronization SVCs. This centralizes the processing of + * such waits and signals. + */ +class Synchronization { +public: + explicit Synchronization(Core::System& system); + + /// Signals a synchronization object, waking up all its waiting threads + void SignalObject(SynchronizationObject& obj) const; + + /// Checks whether waiting on any of the sync_objects is necessary. If not, + /// it returns Success and the handle index of the signaled sync object. + /// Otherwise, the current thread is put to sleep until nano_seconds elapse or + /// one of the synchronization objects is signaled. + std::pair<ResultCode, Handle> WaitFor( + std::vector<std::shared_ptr<SynchronizationObject>>& sync_objects, s64 nano_seconds); + +private: + Core::System& system; +}; +} // namespace Kernel diff --git a/src/core/hle/kernel/wait_object.cpp b/src/core/hle/kernel/synchronization_object.cpp index 745f2c4e8..43f3eef18 100644 --- a/src/core/hle/kernel/wait_object.cpp +++ b/src/core/hle/kernel/synchronization_object.cpp @@ -7,24 +7,29 @@ #include "common/common_types.h" #include "common/logging/log.h" #include "core/core.h" -#include "core/core_cpu.h" #include "core/hle/kernel/kernel.h" #include "core/hle/kernel/object.h" #include "core/hle/kernel/process.h" +#include "core/hle/kernel/synchronization.h" +#include "core/hle/kernel/synchronization_object.h" #include "core/hle/kernel/thread.h" namespace Kernel { -WaitObject::WaitObject(KernelCore& kernel) : Object{kernel} {} -WaitObject::~WaitObject() = default; +SynchronizationObject::SynchronizationObject(KernelCore& kernel) : Object{kernel} {} +SynchronizationObject::~SynchronizationObject() = default; -void WaitObject::AddWaitingThread(std::shared_ptr<Thread> thread) { +void SynchronizationObject::Signal() { + kernel.Synchronization().SignalObject(*this); +} + +void SynchronizationObject::AddWaitingThread(std::shared_ptr<Thread> thread) { auto itr = std::find(waiting_threads.begin(), waiting_threads.end(), thread); if (itr == waiting_threads.end()) waiting_threads.push_back(std::move(thread)); } -void WaitObject::RemoveWaitingThread(std::shared_ptr<Thread> thread) { +void SynchronizationObject::RemoveWaitingThread(std::shared_ptr<Thread> thread) { auto itr = std::find(waiting_threads.begin(), waiting_threads.end(), thread); // If a thread passed multiple handles to the same object, // the kernel might attempt to remove the thread from the object's @@ -33,7 +38,7 @@ void WaitObject::RemoveWaitingThread(std::shared_ptr<Thread> thread) { waiting_threads.erase(itr); } -std::shared_ptr<Thread> WaitObject::GetHighestPriorityReadyThread() const { +std::shared_ptr<Thread> SynchronizationObject::GetHighestPriorityReadyThread() const { Thread* candidate = nullptr; u32 candidate_priority = THREADPRIO_LOWEST + 1; @@ -51,23 +56,14 @@ std::shared_ptr<Thread> WaitObject::GetHighestPriorityReadyThread() const { if (ShouldWait(thread.get())) continue; - // A thread is ready to run if it's either in ThreadStatus::WaitSynch - // and the rest of the objects it is waiting on are ready.
- bool ready_to_run = true; - if (thread_status == ThreadStatus::WaitSynch) { - ready_to_run = thread->AllWaitObjectsReady(); - } - - if (ready_to_run) { - candidate = thread.get(); - candidate_priority = thread->GetPriority(); - } + candidate = thread.get(); + candidate_priority = thread->GetPriority(); } return SharedFrom(candidate); } -void WaitObject::WakeupWaitingThread(std::shared_ptr<Thread> thread) { +void SynchronizationObject::WakeupWaitingThread(std::shared_ptr<Thread> thread) { ASSERT(!ShouldWait(thread.get())); if (!thread) { @@ -75,7 +71,7 @@ void WaitObject::WakeupWaitingThread(std::shared_ptr<Thread> thread) { } if (thread->IsSleepingOnWait()) { - for (const auto& object : thread->GetWaitObjects()) { + for (const auto& object : thread->GetSynchronizationObjects()) { ASSERT(!object->ShouldWait(thread.get())); object->Acquire(thread.get()); } @@ -83,9 +79,9 @@ void WaitObject::WakeupWaitingThread(std::shared_ptr<Thread> thread) { Acquire(thread.get()); } - const std::size_t index = thread->GetWaitObjectIndex(SharedFrom(this)); + const std::size_t index = thread->GetSynchronizationObjectIndex(SharedFrom(this)); - thread->ClearWaitObjects(); + thread->ClearSynchronizationObjects(); thread->CancelWakeupTimer(); @@ -96,17 +92,17 @@ void WaitObject::WakeupWaitingThread(std::shared_ptr<Thread> thread) { } if (resume) { thread->ResumeFromWait(); - Core::System::GetInstance().PrepareReschedule(thread->GetProcessorID()); + kernel.PrepareReschedule(thread->GetProcessorID()); } } -void WaitObject::WakeupAllWaitingThreads() { +void SynchronizationObject::WakeupAllWaitingThreads() { while (auto thread = GetHighestPriorityReadyThread()) { WakeupWaitingThread(thread); } } -const std::vector<std::shared_ptr<Thread>>& WaitObject::GetWaitingThreads() const { +const std::vector<std::shared_ptr<Thread>>& SynchronizationObject::GetWaitingThreads() const { return waiting_threads; } diff --git a/src/core/hle/kernel/wait_object.h b/src/core/hle/kernel/synchronization_object.h index 9a17958a4..741c31faf 100644 --- a/src/core/hle/kernel/wait_object.h +++ b/src/core/hle/kernel/synchronization_object.h @@ -15,10 +15,10 @@ class KernelCore; class Thread; /// Class that represents a Kernel object that a thread can be waiting on -class WaitObject : public Object { +class SynchronizationObject : public Object { public: - explicit WaitObject(KernelCore& kernel); - ~WaitObject() override; + explicit SynchronizationObject(KernelCore& kernel); + ~SynchronizationObject() override; /** * Check if the specified thread should wait until the object is available @@ -30,6 +30,13 @@ public: /// Acquire/lock the object for the specified thread if it is available virtual void Acquire(Thread* thread) = 0; + /// Signal this object + virtual void Signal(); + + virtual bool IsSignaled() const { + return is_signaled; + } + /** * Add a thread to wait on this object * @param thread Pointer to thread to add @@ -60,16 +67,20 @@ public: /// Get a const reference to the waiting threads list for debug use const std::vector<std::shared_ptr<Thread>>& GetWaitingThreads() const; +protected: + bool is_signaled{}; // Tells if this sync object is signalled; + private: /// Threads waiting for this object to become available std::vector<std::shared_ptr<Thread>> waiting_threads; }; -// Specialization of DynamicObjectCast for WaitObjects +// Specialization of DynamicObjectCast for SynchronizationObjects template <> -inline std::shared_ptr<WaitObject> DynamicObjectCast<WaitObject>(std::shared_ptr<Object> object) { +inline 
std::shared_ptr<SynchronizationObject> DynamicObjectCast<SynchronizationObject>( + std::shared_ptr<Object> object) { if (object != nullptr && object->IsWaitable()) { - return std::static_pointer_cast<WaitObject>(object); + return std::static_pointer_cast<SynchronizationObject>(object); } return nullptr; } diff --git a/src/core/hle/kernel/thread.cpp b/src/core/hle/kernel/thread.cpp index e84e5ce0d..ae5f2c8bd 100644 --- a/src/core/hle/kernel/thread.cpp +++ b/src/core/hle/kernel/thread.cpp @@ -13,9 +13,9 @@ #include "common/thread_queue_list.h" #include "core/arm/arm_interface.h" #include "core/core.h" -#include "core/core_cpu.h" #include "core/core_timing.h" #include "core/core_timing_util.h" +#include "core/hardware_properties.h" #include "core/hle/kernel/errors.h" #include "core/hle/kernel/handle_table.h" #include "core/hle/kernel/kernel.h" @@ -32,11 +32,15 @@ bool Thread::ShouldWait(const Thread* thread) const { return status != ThreadStatus::Dead; } +bool Thread::IsSignaled() const { + return status == ThreadStatus::Dead; +} + void Thread::Acquire(Thread* thread) { ASSERT_MSG(!ShouldWait(thread), "object unavailable!"); } -Thread::Thread(KernelCore& kernel) : WaitObject{kernel} {} +Thread::Thread(KernelCore& kernel) : SynchronizationObject{kernel} {} Thread::~Thread() = default; void Thread::Stop() { @@ -46,7 +50,7 @@ void Thread::Stop() { kernel.ThreadWakeupCallbackHandleTable().Close(callback_handle); callback_handle = 0; SetStatus(ThreadStatus::Dead); - WakeupAllWaitingThreads(); + Signal(); // Clean up any dangling references in objects that this thread was waiting for for (auto& wait_object : wait_objects) { @@ -216,7 +220,7 @@ void Thread::SetWaitSynchronizationOutput(s32 output) { context.cpu_registers[1] = output; } -s32 Thread::GetWaitObjectIndex(std::shared_ptr<WaitObject> object) const { +s32 Thread::GetSynchronizationObjectIndex(std::shared_ptr<SynchronizationObject> object) const { ASSERT_MSG(!wait_objects.empty(), "Thread is not waiting for anything"); const auto match = std::find(wait_objects.rbegin(), wait_objects.rend(), object); return static_cast<s32>(std::distance(match, wait_objects.rend()) - 1); @@ -337,14 +341,16 @@ void Thread::ChangeCore(u32 core, u64 mask) { SetCoreAndAffinityMask(core, mask); } -bool Thread::AllWaitObjectsReady() const { - return std::none_of( - wait_objects.begin(), wait_objects.end(), - [this](const std::shared_ptr<WaitObject>& object) { return object->ShouldWait(this); }); +bool Thread::AllSynchronizationObjectsReady() const { + return std::none_of(wait_objects.begin(), wait_objects.end(), + [this](const std::shared_ptr<SynchronizationObject>& object) { + return object->ShouldWait(this); + }); } bool Thread::InvokeWakeupCallback(ThreadWakeupReason reason, std::shared_ptr<Thread> thread, - std::shared_ptr<WaitObject> object, std::size_t index) { + std::shared_ptr<SynchronizationObject> object, + std::size_t index) { ASSERT(wakeup_callback); return wakeup_callback(reason, std::move(thread), std::move(object), index); } @@ -356,7 +362,7 @@ void Thread::SetActivity(ThreadActivity value) { // Set status if not waiting if (status == ThreadStatus::Ready || status == ThreadStatus::Running) { SetStatus(ThreadStatus::Paused); - Core::System::GetInstance().CpuCore(processor_id).PrepareReschedule(); + kernel.PrepareReschedule(processor_id); } } else if (status == ThreadStatus::Paused) { // Ready to reschedule @@ -426,7 +432,7 @@ ResultCode Thread::SetCoreAndAffinityMask(s32 new_core, u64 new_affinity_mask) { const s32 old_core = processor_id; if 
(processor_id >= 0 && ((affinity_mask >> processor_id) & 1) == 0) { if (static_cast<s32>(ideal_core) < 0) { - processor_id = HighestSetCore(affinity_mask, GlobalScheduler::NUM_CPU_CORES); + processor_id = HighestSetCore(affinity_mask, Core::Hardware::NUM_CPU_CORES); } else { processor_id = ideal_core; } @@ -450,7 +456,7 @@ void Thread::AdjustSchedulingOnStatus(u32 old_flags) { scheduler.Unschedule(current_priority, static_cast<u32>(processor_id), this); } - for (u32 core = 0; core < GlobalScheduler::NUM_CPU_CORES; core++) { + for (u32 core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) { if (core != static_cast<u32>(processor_id) && ((affinity_mask >> core) & 1) != 0) { scheduler.Unsuggest(current_priority, core, this); } @@ -461,7 +467,7 @@ void Thread::AdjustSchedulingOnStatus(u32 old_flags) { scheduler.Schedule(current_priority, static_cast<u32>(processor_id), this); } - for (u32 core = 0; core < GlobalScheduler::NUM_CPU_CORES; core++) { + for (u32 core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) { if (core != static_cast<u32>(processor_id) && ((affinity_mask >> core) & 1) != 0) { scheduler.Suggest(current_priority, core, this); } @@ -475,12 +481,12 @@ void Thread::AdjustSchedulingOnPriority(u32 old_priority) { if (GetSchedulingStatus() != ThreadSchedStatus::Runnable) { return; } - auto& scheduler = Core::System::GetInstance().GlobalScheduler(); + auto& scheduler = kernel.GlobalScheduler(); if (processor_id >= 0) { scheduler.Unschedule(old_priority, static_cast<u32>(processor_id), this); } - for (u32 core = 0; core < GlobalScheduler::NUM_CPU_CORES; core++) { + for (u32 core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) { if (core != static_cast<u32>(processor_id) && ((affinity_mask >> core) & 1) != 0) { scheduler.Unsuggest(old_priority, core, this); } @@ -497,7 +503,7 @@ void Thread::AdjustSchedulingOnPriority(u32 old_priority) { } } - for (u32 core = 0; core < GlobalScheduler::NUM_CPU_CORES; core++) { + for (u32 core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) { if (core != static_cast<u32>(processor_id) && ((affinity_mask >> core) & 1) != 0) { scheduler.Suggest(current_priority, core, this); } @@ -507,13 +513,13 @@ void Thread::AdjustSchedulingOnPriority(u32 old_priority) { } void Thread::AdjustSchedulingOnAffinity(u64 old_affinity_mask, s32 old_core) { - auto& scheduler = Core::System::GetInstance().GlobalScheduler(); + auto& scheduler = kernel.GlobalScheduler(); if (GetSchedulingStatus() != ThreadSchedStatus::Runnable || current_priority >= THREADPRIO_COUNT) { return; } - for (u32 core = 0; core < GlobalScheduler::NUM_CPU_CORES; core++) { + for (u32 core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) { if (((old_affinity_mask >> core) & 1) != 0) { if (core == static_cast<u32>(old_core)) { scheduler.Unschedule(current_priority, core, this); @@ -523,7 +529,7 @@ void Thread::AdjustSchedulingOnAffinity(u64 old_affinity_mask, s32 old_core) { } } - for (u32 core = 0; core < GlobalScheduler::NUM_CPU_CORES; core++) { + for (u32 core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) { if (((affinity_mask >> core) & 1) != 0) { if (core == static_cast<u32>(processor_id)) { scheduler.Schedule(current_priority, core, this); diff --git a/src/core/hle/kernel/thread.h b/src/core/hle/kernel/thread.h index 3bcf9e137..7a4916318 100644 --- a/src/core/hle/kernel/thread.h +++ b/src/core/hle/kernel/thread.h @@ -11,7 +11,7 @@ #include "common/common_types.h" #include "core/arm/arm_interface.h" #include "core/hle/kernel/object.h" -#include "core/hle/kernel/wait_object.h" 
+#include "core/hle/kernel/synchronization_object.h" #include "core/hle/result.h" namespace Kernel { @@ -95,7 +95,7 @@ enum class ThreadSchedMasks : u32 { ForcePauseMask = 0x0070, }; -class Thread final : public WaitObject { +class Thread final : public SynchronizationObject { public: explicit Thread(KernelCore& kernel); ~Thread() override; @@ -104,11 +104,11 @@ public: using ThreadContext = Core::ARM_Interface::ThreadContext; - using ThreadWaitObjects = std::vector<std::shared_ptr<WaitObject>>; + using ThreadSynchronizationObjects = std::vector<std::shared_ptr<SynchronizationObject>>; using WakeupCallback = std::function<bool(ThreadWakeupReason reason, std::shared_ptr<Thread> thread, - std::shared_ptr<WaitObject> object, std::size_t index)>; + std::shared_ptr<SynchronizationObject> object, std::size_t index)>; /** * Creates and returns a new thread. The new thread is immediately scheduled @@ -146,6 +146,7 @@ public: bool ShouldWait(const Thread* thread) const override; void Acquire(Thread* thread) override; + bool IsSignaled() const override; /** * Gets the thread's current priority @@ -233,7 +234,7 @@ public: * * @param object Object to query the index of. */ - s32 GetWaitObjectIndex(std::shared_ptr<WaitObject> object) const; + s32 GetSynchronizationObjectIndex(std::shared_ptr<SynchronizationObject> object) const; /** * Stops a thread, invalidating it from further use @@ -314,15 +315,15 @@ public: return owner_process; } - const ThreadWaitObjects& GetWaitObjects() const { + const ThreadSynchronizationObjects& GetSynchronizationObjects() const { return wait_objects; } - void SetWaitObjects(ThreadWaitObjects objects) { + void SetSynchronizationObjects(ThreadSynchronizationObjects objects) { wait_objects = std::move(objects); } - void ClearWaitObjects() { + void ClearSynchronizationObjects() { for (const auto& waiting_object : wait_objects) { waiting_object->RemoveWaitingThread(SharedFrom(this)); } @@ -330,7 +331,7 @@ public: } /// Determines whether all the objects this thread is waiting on are ready. - bool AllWaitObjectsReady() const; + bool AllSynchronizationObjectsReady() const; const MutexWaitingThreads& GetMutexWaitingThreads() const { return wait_mutex_threads; @@ -395,7 +396,7 @@ public: * will cause an assertion to trigger. */ bool InvokeWakeupCallback(ThreadWakeupReason reason, std::shared_ptr<Thread> thread, - std::shared_ptr<WaitObject> object, std::size_t index); + std::shared_ptr<SynchronizationObject> object, std::size_t index); u32 GetIdealCore() const { return ideal_core; @@ -494,7 +495,7 @@ private: /// Objects that the thread is waiting on, in the same order as they were /// passed to WaitSynchronization. - ThreadWaitObjects wait_objects; + ThreadSynchronizationObjects wait_objects; /// List of threads that are waiting for a mutex that is held by this thread. 
MutexWaitingThreads wait_mutex_threads; diff --git a/src/core/hle/kernel/transfer_memory.cpp b/src/core/hle/kernel/transfer_memory.cpp index f0e73f57b..f2d3f8b49 100644 --- a/src/core/hle/kernel/transfer_memory.cpp +++ b/src/core/hle/kernel/transfer_memory.cpp @@ -8,15 +8,23 @@ #include "core/hle/kernel/shared_memory.h" #include "core/hle/kernel/transfer_memory.h" #include "core/hle/result.h" +#include "core/memory.h" namespace Kernel { -TransferMemory::TransferMemory(KernelCore& kernel) : Object{kernel} {} -TransferMemory::~TransferMemory() = default; +TransferMemory::TransferMemory(KernelCore& kernel, Memory::Memory& memory) + : Object{kernel}, memory{memory} {} -std::shared_ptr<TransferMemory> TransferMemory::Create(KernelCore& kernel, VAddr base_address, - u64 size, MemoryPermission permissions) { - std::shared_ptr<TransferMemory> transfer_memory{std::make_shared<TransferMemory>(kernel)}; +TransferMemory::~TransferMemory() { + // Release memory region when transfer memory is destroyed + Reset(); +} + +std::shared_ptr<TransferMemory> TransferMemory::Create(KernelCore& kernel, Memory::Memory& memory, + VAddr base_address, u64 size, + MemoryPermission permissions) { + std::shared_ptr<TransferMemory> transfer_memory{ + std::make_shared<TransferMemory>(kernel, memory)}; transfer_memory->base_address = base_address; transfer_memory->memory_size = size; @@ -27,7 +35,7 @@ std::shared_ptr<TransferMemory> TransferMemory::Create(KernelCore& kernel, VAddr } const u8* TransferMemory::GetPointer() const { - return backing_block.get()->data(); + return memory.GetPointer(base_address); } u64 TransferMemory::GetSize() const { @@ -62,6 +70,52 @@ ResultCode TransferMemory::MapMemory(VAddr address, u64 size, MemoryPermission p return RESULT_SUCCESS; } +ResultCode TransferMemory::Reserve() { + auto& vm_manager{owner_process->VMManager()}; + const auto check_range_result{vm_manager.CheckRangeState( + base_address, memory_size, MemoryState::FlagTransfer | MemoryState::FlagMemoryPoolAllocated, + MemoryState::FlagTransfer | MemoryState::FlagMemoryPoolAllocated, VMAPermission::All, + VMAPermission::ReadWrite, MemoryAttribute::Mask, MemoryAttribute::None, + MemoryAttribute::IpcAndDeviceMapped)}; + + if (check_range_result.Failed()) { + return check_range_result.Code(); + } + + auto [state_, permissions_, attribute] = *check_range_result; + + if (const auto result{vm_manager.ReprotectRange( + base_address, memory_size, SharedMemory::ConvertPermissions(owner_permissions))}; + result.IsError()) { + return result; + } + + return vm_manager.SetMemoryAttribute(base_address, memory_size, MemoryAttribute::Mask, + attribute | MemoryAttribute::Locked); +} + +ResultCode TransferMemory::Reset() { + auto& vm_manager{owner_process->VMManager()}; + if (const auto result{vm_manager.CheckRangeState( + base_address, memory_size, + MemoryState::FlagTransfer | MemoryState::FlagMemoryPoolAllocated, + MemoryState::FlagTransfer | MemoryState::FlagMemoryPoolAllocated, VMAPermission::None, + VMAPermission::None, MemoryAttribute::Mask, MemoryAttribute::Locked, + MemoryAttribute::IpcAndDeviceMapped)}; + result.Failed()) { + return result.Code(); + } + + if (const auto result{ + vm_manager.ReprotectRange(base_address, memory_size, VMAPermission::ReadWrite)}; + result.IsError()) { + return result; + } + + return vm_manager.SetMemoryAttribute(base_address, memory_size, MemoryAttribute::Mask, + MemoryAttribute::None); +} + ResultCode TransferMemory::UnmapMemory(VAddr address, u64 size) { if (memory_size != size) { return ERR_INVALID_SIZE; 
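As an aside, the Reserve()/Reset() pair added to TransferMemory above is driven from the svcCreateTransferMemory change earlier in this diff. The sketch below shows roughly how that lifecycle fits together using only the interfaces shown in this changeset; MakeTransferMemory is a hypothetical helper, not part of this patch.

#include <memory>
#include <utility>

#include "common/common_types.h"
#include "core/core.h"
#include "core/hle/kernel/handle_table.h"
#include "core/hle/kernel/kernel.h"
#include "core/hle/kernel/process.h"
#include "core/hle/kernel/transfer_memory.h"

// Hypothetical helper (illustration only): create a transfer memory region,
// reserve its backing range, and publish a handle for it.
static ResultCode MakeTransferMemory(Core::System& system, Kernel::Handle* out_handle, VAddr addr,
                                     u64 size, Kernel::MemoryPermission perms) {
    auto& kernel = system.Kernel();
    auto transfer_mem = Kernel::TransferMemory::Create(kernel, system.Memory(), addr, size, perms);

    // Reserve() locks and reprotects the backing region; Reset() (invoked from
    // the destructor) releases it again once the last reference goes away.
    if (const auto reserve_result = transfer_mem->Reserve(); reserve_result.IsError()) {
        return reserve_result;
    }

    auto& handle_table = kernel.CurrentProcess()->GetHandleTable();
    const auto result = handle_table.Create(std::move(transfer_mem));
    if (result.Failed()) {
        return result.Code();
    }

    *out_handle = *result;
    return RESULT_SUCCESS;
}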
diff --git a/src/core/hle/kernel/transfer_memory.h b/src/core/hle/kernel/transfer_memory.h index 0a6e15d18..6e388536a 100644 --- a/src/core/hle/kernel/transfer_memory.h +++ b/src/core/hle/kernel/transfer_memory.h @@ -11,6 +11,10 @@ union ResultCode; +namespace Memory { +class Memory; +} + namespace Kernel { class KernelCore; @@ -26,12 +30,13 @@ enum class MemoryPermission : u32; /// class TransferMemory final : public Object { public: - explicit TransferMemory(KernelCore& kernel); + explicit TransferMemory(KernelCore& kernel, Memory::Memory& memory); ~TransferMemory() override; static constexpr HandleType HANDLE_TYPE = HandleType::TransferMemory; - static std::shared_ptr<TransferMemory> Create(KernelCore& kernel, VAddr base_address, u64 size, + static std::shared_ptr<TransferMemory> Create(KernelCore& kernel, Memory::Memory& memory, + VAddr base_address, u64 size, MemoryPermission permissions); TransferMemory(const TransferMemory&) = delete; @@ -80,6 +85,14 @@ public: /// ResultCode UnmapMemory(VAddr address, u64 size); + /// Reserves the region to be used for the transfer memory, called after the transfer memory is + /// created. + ResultCode Reserve(); + + /// Resets the region previously used for the transfer memory, called after the transfer memory + /// is closed. + ResultCode Reset(); + private: /// Memory block backing this instance. std::shared_ptr<PhysicalMemory> backing_block; @@ -98,6 +111,8 @@ private: /// Whether or not this transfer memory instance has mapped memory. bool is_mapped = false; + + Memory::Memory& memory; }; } // namespace Kernel diff --git a/src/core/hle/kernel/vm_manager.cpp b/src/core/hle/kernel/vm_manager.cpp index 0b3500fce..024c22901 100644 --- a/src/core/hle/kernel/vm_manager.cpp +++ b/src/core/hle/kernel/vm_manager.cpp @@ -544,7 +544,8 @@ MemoryInfo VMManager::QueryMemory(VAddr address) const { ResultCode VMManager::SetMemoryAttribute(VAddr address, u64 size, MemoryAttribute mask, MemoryAttribute attribute) { - constexpr auto ignore_mask = MemoryAttribute::Uncached | MemoryAttribute::DeviceMapped; + constexpr auto ignore_mask = + MemoryAttribute::Uncached | MemoryAttribute::DeviceMapped | MemoryAttribute::Locked; constexpr auto attribute_mask = ~ignore_mask; const auto result = CheckRangeState( diff --git a/src/core/hle/kernel/vm_manager.h b/src/core/hle/kernel/vm_manager.h index 850a7ebc3..90b4b006a 100644 --- a/src/core/hle/kernel/vm_manager.h +++ b/src/core/hle/kernel/vm_manager.h @@ -98,6 +98,8 @@ enum class MemoryAttribute : u32 { DeviceMapped = 4, /// Uncached memory Uncached = 8, + + IpcAndDeviceMapped = LockedForIPC | DeviceMapped, }; constexpr MemoryAttribute operator|(MemoryAttribute lhs, MemoryAttribute rhs) { @@ -654,6 +656,35 @@ public: /// is scheduled. Common::PageTable page_table{Memory::PAGE_BITS}; + using CheckResults = ResultVal<std::tuple<MemoryState, VMAPermission, MemoryAttribute>>; + + /// Checks if an address range adheres to the specified states provided. + /// + /// @param address The starting address of the address range. + /// @param size The size of the address range. + /// @param state_mask The memory state mask. + /// @param state The state to compare the individual VMA states against, + /// which is done in the form of: (vma.state & state_mask) != state. + /// @param permission_mask The memory permissions mask. + /// @param permissions The permission to compare the individual VMA permissions against, + /// which is done in the form of: + /// (vma.permission & permission_mask) != permission. 
+ /// @param attribute_mask The memory attribute mask. + /// @param attribute The memory attributes to compare the individual VMA attributes + /// against, which is done in the form of: + /// (vma.attributes & attribute_mask) != attribute. + /// @param ignore_mask The memory attributes to ignore during the check. + /// + /// @returns If successful, returns a tuple containing the memory attributes + /// (with ignored bits specified by ignore_mask unset), memory permissions, and + /// memory state across the memory range. + /// @returns If not successful, returns ERR_INVALID_ADDRESS_STATE. + /// + CheckResults CheckRangeState(VAddr address, u64 size, MemoryState state_mask, MemoryState state, + VMAPermission permission_mask, VMAPermission permissions, + MemoryAttribute attribute_mask, MemoryAttribute attribute, + MemoryAttribute ignore_mask) const; + private: using VMAIter = VMAMap::iterator; @@ -707,35 +738,6 @@ private: /// Clears out the page table void ClearPageTable(); - using CheckResults = ResultVal<std::tuple<MemoryState, VMAPermission, MemoryAttribute>>; - - /// Checks if an address range adheres to the specified states provided. - /// - /// @param address The starting address of the address range. - /// @param size The size of the address range. - /// @param state_mask The memory state mask. - /// @param state The state to compare the individual VMA states against, - /// which is done in the form of: (vma.state & state_mask) != state. - /// @param permission_mask The memory permissions mask. - /// @param permissions The permission to compare the individual VMA permissions against, - /// which is done in the form of: - /// (vma.permission & permission_mask) != permission. - /// @param attribute_mask The memory attribute mask. - /// @param attribute The memory attributes to compare the individual VMA attributes - /// against, which is done in the form of: - /// (vma.attributes & attribute_mask) != attribute. - /// @param ignore_mask The memory attributes to ignore during the check. - /// - /// @returns If successful, returns a tuple containing the memory attributes - /// (with ignored bits specified by ignore_mask unset), memory permissions, and - /// memory state across the memory range. - /// @returns If not successful, returns ERR_INVALID_ADDRESS_STATE. - /// - CheckResults CheckRangeState(VAddr address, u64 size, MemoryState state_mask, MemoryState state, - VMAPermission permission_mask, VMAPermission permissions, - MemoryAttribute attribute_mask, MemoryAttribute attribute, - MemoryAttribute ignore_mask) const; - /// Gets the amount of memory currently mapped (state != Unmapped) in a range. 
ResultVal<std::size_t> SizeOfAllocatedVMAsInRange(VAddr address, std::size_t size) const; diff --git a/src/core/hle/kernel/writable_event.cpp b/src/core/hle/kernel/writable_event.cpp index c9332e3e1..fc2f7c424 100644 --- a/src/core/hle/kernel/writable_event.cpp +++ b/src/core/hle/kernel/writable_event.cpp @@ -22,7 +22,6 @@ EventPair WritableEvent::CreateEventPair(KernelCore& kernel, std::string name) { writable_event->name = name + ":Writable"; writable_event->readable = readable_event; readable_event->name = name + ":Readable"; - readable_event->signaled = false; return {std::move(readable_event), std::move(writable_event)}; } @@ -40,7 +39,7 @@ void WritableEvent::Clear() { } bool WritableEvent::IsSignaled() const { - return readable->signaled; + return readable->IsSignaled(); } } // namespace Kernel diff --git a/src/core/hle/service/am/am.cpp b/src/core/hle/service/am/am.cpp index 95aa5d23d..cc978713b 100644 --- a/src/core/hle/service/am/am.cpp +++ b/src/core/hle/service/am/am.cpp @@ -709,8 +709,34 @@ void ICommonStateGetter::SetCpuBoostMode(Kernel::HLERequestContext& ctx) { apm_sys->SetCpuBoostMode(ctx); } -IStorage::IStorage(std::vector<u8> buffer) - : ServiceFramework("IStorage"), buffer(std::move(buffer)) { +IStorageImpl::~IStorageImpl() = default; + +class StorageDataImpl final : public IStorageImpl { +public: + explicit StorageDataImpl(std::vector<u8>&& buffer) : buffer{std::move(buffer)} {} + + std::vector<u8>& GetData() override { + return buffer; + } + + const std::vector<u8>& GetData() const override { + return buffer; + } + + std::size_t GetSize() const override { + return buffer.size(); + } + +private: + std::vector<u8> buffer; +}; + +IStorage::IStorage(std::vector<u8>&& buffer) + : ServiceFramework("IStorage"), impl{std::make_shared<StorageDataImpl>(std::move(buffer))} { + Register(); +} + +void IStorage::Register() { // clang-format off static const FunctionInfo functions[] = { {0, &IStorage::Open, "Open"}, @@ -723,8 +749,13 @@ IStorage::IStorage(std::vector<u8> buffer) IStorage::~IStorage() = default; -const std::vector<u8>& IStorage::GetData() const { - return buffer; +void IStorage::Open(Kernel::HLERequestContext& ctx) { + LOG_DEBUG(Service_AM, "called"); + + IPC::ResponseBuilder rb{ctx, 2, 0, 1}; + + rb.Push(RESULT_SUCCESS); + rb.PushIpcInterface<IStorageAccessor>(*this); } void ICommonStateGetter::GetOperationMode(Kernel::HLERequestContext& ctx) { @@ -816,7 +847,7 @@ private: LOG_DEBUG(Service_AM, "called"); IPC::RequestParser rp{ctx}; - applet->GetBroker().PushNormalDataFromGame(*rp.PopIpcInterface<IStorage>()); + applet->GetBroker().PushNormalDataFromGame(rp.PopIpcInterface<IStorage>()); IPC::ResponseBuilder rb{ctx, 2}; rb.Push(RESULT_SUCCESS); @@ -825,26 +856,25 @@ private: void PopOutData(Kernel::HLERequestContext& ctx) { LOG_DEBUG(Service_AM, "called"); - IPC::ResponseBuilder rb{ctx, 2, 0, 1}; - const auto storage = applet->GetBroker().PopNormalDataToGame(); if (storage == nullptr) { LOG_ERROR(Service_AM, "storage is a nullptr. 
There is no data in the current normal channel"); - + IPC::ResponseBuilder rb{ctx, 2}; rb.Push(ERR_NO_DATA_IN_CHANNEL); return; } + IPC::ResponseBuilder rb{ctx, 2, 0, 1}; rb.Push(RESULT_SUCCESS); - rb.PushIpcInterface<IStorage>(std::move(*storage)); + rb.PushIpcInterface<IStorage>(std::move(storage)); } void PushInteractiveInData(Kernel::HLERequestContext& ctx) { LOG_DEBUG(Service_AM, "called"); IPC::RequestParser rp{ctx}; - applet->GetBroker().PushInteractiveDataFromGame(*rp.PopIpcInterface<IStorage>()); + applet->GetBroker().PushInteractiveDataFromGame(rp.PopIpcInterface<IStorage>()); ASSERT(applet->IsInitialized()); applet->ExecuteInteractive(); @@ -857,19 +887,18 @@ private: void PopInteractiveOutData(Kernel::HLERequestContext& ctx) { LOG_DEBUG(Service_AM, "called"); - IPC::ResponseBuilder rb{ctx, 2, 0, 1}; - const auto storage = applet->GetBroker().PopInteractiveDataToGame(); if (storage == nullptr) { LOG_ERROR(Service_AM, "storage is a nullptr. There is no data in the current interactive channel"); - + IPC::ResponseBuilder rb{ctx, 2}; rb.Push(ERR_NO_DATA_IN_CHANNEL); return; } + IPC::ResponseBuilder rb{ctx, 2, 0, 1}; rb.Push(RESULT_SUCCESS); - rb.PushIpcInterface<IStorage>(std::move(*storage)); + rb.PushIpcInterface<IStorage>(std::move(storage)); } void GetPopOutDataEvent(Kernel::HLERequestContext& ctx) { @@ -891,15 +920,6 @@ private: std::shared_ptr<Applets::Applet> applet; }; -void IStorage::Open(Kernel::HLERequestContext& ctx) { - LOG_DEBUG(Service_AM, "called"); - - IPC::ResponseBuilder rb{ctx, 2, 0, 1}; - - rb.Push(RESULT_SUCCESS); - rb.PushIpcInterface<IStorageAccessor>(*this); -} - IStorageAccessor::IStorageAccessor(IStorage& storage) : ServiceFramework("IStorageAccessor"), backing(storage) { // clang-format off @@ -921,7 +941,7 @@ void IStorageAccessor::GetSize(Kernel::HLERequestContext& ctx) { IPC::ResponseBuilder rb{ctx, 4}; rb.Push(RESULT_SUCCESS); - rb.Push(static_cast<u64>(backing.buffer.size())); + rb.Push(static_cast<u64>(backing.GetSize())); } void IStorageAccessor::Write(Kernel::HLERequestContext& ctx) { @@ -932,17 +952,17 @@ void IStorageAccessor::Write(Kernel::HLERequestContext& ctx) { LOG_DEBUG(Service_AM, "called, offset={}, size={}", offset, data.size()); - if (data.size() > backing.buffer.size() - offset) { + if (data.size() > backing.GetSize() - offset) { LOG_ERROR(Service_AM, "offset is out of bounds, backing_buffer_sz={}, data_size={}, offset={}", - backing.buffer.size(), data.size(), offset); + backing.GetSize(), data.size(), offset); IPC::ResponseBuilder rb{ctx, 2}; rb.Push(ERR_SIZE_OUT_OF_BOUNDS); return; } - std::memcpy(backing.buffer.data() + offset, data.data(), data.size()); + std::memcpy(backing.GetData().data() + offset, data.data(), data.size()); IPC::ResponseBuilder rb{ctx, 2}; rb.Push(RESULT_SUCCESS); @@ -956,16 +976,16 @@ void IStorageAccessor::Read(Kernel::HLERequestContext& ctx) { LOG_DEBUG(Service_AM, "called, offset={}, size={}", offset, size); - if (size > backing.buffer.size() - offset) { + if (size > backing.GetSize() - offset) { LOG_ERROR(Service_AM, "offset is out of bounds, backing_buffer_sz={}, size={}, offset={}", - backing.buffer.size(), size, offset); + backing.GetSize(), size, offset); IPC::ResponseBuilder rb{ctx, 2}; rb.Push(ERR_SIZE_OUT_OF_BOUNDS); return; } - ctx.WriteBuffer(backing.buffer.data() + offset, size); + ctx.WriteBuffer(backing.GetData().data() + offset, size); IPC::ResponseBuilder rb{ctx, 2}; rb.Push(RESULT_SUCCESS); @@ -1031,7 +1051,7 @@ void 
ILibraryAppletCreator::CreateTransferMemoryStorage(Kernel::HLERequestContex rp.SetCurrentOffset(3); const auto handle{rp.Pop<Kernel::Handle>()}; - const auto transfer_mem = + auto transfer_mem = system.CurrentProcess()->GetHandleTable().Get<Kernel::TransferMemory>(handle); if (transfer_mem == nullptr) { @@ -1047,7 +1067,7 @@ void ILibraryAppletCreator::CreateTransferMemoryStorage(Kernel::HLERequestContex IPC::ResponseBuilder rb{ctx, 2, 0, 1}; rb.Push(RESULT_SUCCESS); - rb.PushIpcInterface(std::make_shared<IStorage>(std::move(memory))); + rb.PushIpcInterface<IStorage>(std::move(memory)); } IApplicationFunctions::IApplicationFunctions(Core::System& system_) @@ -1189,13 +1209,11 @@ void IApplicationFunctions::PopLaunchParameter(Kernel::HLERequestContext& ctx) { u64 build_id{}; std::memcpy(&build_id, build_id_full.data(), sizeof(u64)); - const auto data = - backend->GetLaunchParameter({system.CurrentProcess()->GetTitleID(), build_id}); - + auto data = backend->GetLaunchParameter({system.CurrentProcess()->GetTitleID(), build_id}); if (data.has_value()) { IPC::ResponseBuilder rb{ctx, 2, 0, 1}; rb.Push(RESULT_SUCCESS); - rb.PushIpcInterface<AM::IStorage>(*data); + rb.PushIpcInterface<IStorage>(std::move(*data)); launch_popped_application_specific = true; return; } @@ -1218,7 +1236,7 @@ void IApplicationFunctions::PopLaunchParameter(Kernel::HLERequestContext& ctx) { std::vector<u8> buffer(sizeof(LaunchParameterAccountPreselectedUser)); std::memcpy(buffer.data(), ¶ms, buffer.size()); - rb.PushIpcInterface<AM::IStorage>(buffer); + rb.PushIpcInterface<IStorage>(std::move(buffer)); launch_popped_account_preselect = true; return; } diff --git a/src/core/hle/service/am/am.h b/src/core/hle/service/am/am.h index 448817be9..0b9a4332d 100644 --- a/src/core/hle/service/am/am.h +++ b/src/core/hle/service/am/am.h @@ -12,7 +12,8 @@ namespace Kernel { class KernelCore; -} +class TransferMemory; +} // namespace Kernel namespace Service::NVFlinger { class NVFlinger; @@ -188,19 +189,36 @@ private: std::shared_ptr<AppletMessageQueue> msg_queue; }; +class IStorageImpl { +public: + virtual ~IStorageImpl(); + virtual std::vector<u8>& GetData() = 0; + virtual const std::vector<u8>& GetData() const = 0; + virtual std::size_t GetSize() const = 0; +}; + class IStorage final : public ServiceFramework<IStorage> { public: - explicit IStorage(std::vector<u8> buffer); + explicit IStorage(std::vector<u8>&& buffer); ~IStorage() override; - const std::vector<u8>& GetData() const; + std::vector<u8>& GetData() { + return impl->GetData(); + } + + const std::vector<u8>& GetData() const { + return impl->GetData(); + } + + std::size_t GetSize() const { + return impl->GetSize(); + } private: + void Register(); void Open(Kernel::HLERequestContext& ctx); - std::vector<u8> buffer; - - friend class IStorageAccessor; + std::shared_ptr<IStorageImpl> impl; }; class IStorageAccessor final : public ServiceFramework<IStorageAccessor> { diff --git a/src/core/hle/service/am/applets/applets.cpp b/src/core/hle/service/am/applets/applets.cpp index 92f995f8f..c3261f3e6 100644 --- a/src/core/hle/service/am/applets/applets.cpp +++ b/src/core/hle/service/am/applets/applets.cpp @@ -50,16 +50,17 @@ AppletDataBroker::RawChannelData AppletDataBroker::PeekDataToAppletForDebug() co return {std::move(out_normal), std::move(out_interactive)}; } -std::unique_ptr<IStorage> AppletDataBroker::PopNormalDataToGame() { +std::shared_ptr<IStorage> AppletDataBroker::PopNormalDataToGame() { if (out_channel.empty()) return nullptr; auto out = std::move(out_channel.front()); 
out_channel.pop_front(); + pop_out_data_event.writable->Clear(); return out; } -std::unique_ptr<IStorage> AppletDataBroker::PopNormalDataToApplet() { +std::shared_ptr<IStorage> AppletDataBroker::PopNormalDataToApplet() { if (in_channel.empty()) return nullptr; @@ -68,16 +69,17 @@ std::unique_ptr<IStorage> AppletDataBroker::PopNormalDataToApplet() { return out; } -std::unique_ptr<IStorage> AppletDataBroker::PopInteractiveDataToGame() { +std::shared_ptr<IStorage> AppletDataBroker::PopInteractiveDataToGame() { if (out_interactive_channel.empty()) return nullptr; auto out = std::move(out_interactive_channel.front()); out_interactive_channel.pop_front(); + pop_interactive_out_data_event.writable->Clear(); return out; } -std::unique_ptr<IStorage> AppletDataBroker::PopInteractiveDataToApplet() { +std::shared_ptr<IStorage> AppletDataBroker::PopInteractiveDataToApplet() { if (in_interactive_channel.empty()) return nullptr; @@ -86,21 +88,21 @@ std::unique_ptr<IStorage> AppletDataBroker::PopInteractiveDataToApplet() { return out; } -void AppletDataBroker::PushNormalDataFromGame(IStorage storage) { - in_channel.push_back(std::make_unique<IStorage>(storage)); +void AppletDataBroker::PushNormalDataFromGame(std::shared_ptr<IStorage>&& storage) { + in_channel.emplace_back(std::move(storage)); } -void AppletDataBroker::PushNormalDataFromApplet(IStorage storage) { - out_channel.push_back(std::make_unique<IStorage>(storage)); +void AppletDataBroker::PushNormalDataFromApplet(std::shared_ptr<IStorage>&& storage) { + out_channel.emplace_back(std::move(storage)); pop_out_data_event.writable->Signal(); } -void AppletDataBroker::PushInteractiveDataFromGame(IStorage storage) { - in_interactive_channel.push_back(std::make_unique<IStorage>(storage)); +void AppletDataBroker::PushInteractiveDataFromGame(std::shared_ptr<IStorage>&& storage) { + in_interactive_channel.emplace_back(std::move(storage)); } -void AppletDataBroker::PushInteractiveDataFromApplet(IStorage storage) { - out_interactive_channel.push_back(std::make_unique<IStorage>(storage)); +void AppletDataBroker::PushInteractiveDataFromApplet(std::shared_ptr<IStorage>&& storage) { + out_interactive_channel.emplace_back(std::move(storage)); pop_interactive_out_data_event.writable->Signal(); } diff --git a/src/core/hle/service/am/applets/applets.h b/src/core/hle/service/am/applets/applets.h index 16e61fc6f..e75be86a2 100644 --- a/src/core/hle/service/am/applets/applets.h +++ b/src/core/hle/service/am/applets/applets.h @@ -72,17 +72,17 @@ public: // Retrieves but does not pop the data sent to applet. 
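The AM changes above move the applet data broker from by-value IStorage copies to std::shared_ptr<IStorage>, and clear the matching pop-data event whenever an entry is taken off a channel. A simplified sketch of that queue pattern, using stand-in Event and Storage types in place of Kernel::EventPair and Service::AM::IStorage:

    #include <deque>
    #include <memory>
    #include <vector>

    // Stand-ins for the kernel event pair and the AM storage object.
    struct Event {
        bool signaled = false;
        void Signal() { signaled = true; }
        void Clear() { signaled = false; }
    };

    struct Storage {
        std::vector<unsigned char> data;
    };

    class Channel {
    public:
        // Shared ownership lets the same Storage be handed to an IPC session without
        // copying its buffer; pushing signals the consumer-side event.
        void Push(std::shared_ptr<Storage> storage) {
            queue.emplace_back(std::move(storage));
            event.Signal();
        }

        // Popping hands the front element back and clears the event, mirroring
        // PopNormalDataToGame above; a null result means the channel is empty.
        std::shared_ptr<Storage> Pop() {
            if (queue.empty()) {
                return nullptr;
            }
            auto out = std::move(queue.front());
            queue.pop_front();
            event.Clear();
            return out;
        }

    private:
        std::deque<std::shared_ptr<Storage>> queue;
        Event event;
    };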
RawChannelData PeekDataToAppletForDebug() const; - std::unique_ptr<IStorage> PopNormalDataToGame(); - std::unique_ptr<IStorage> PopNormalDataToApplet(); + std::shared_ptr<IStorage> PopNormalDataToGame(); + std::shared_ptr<IStorage> PopNormalDataToApplet(); - std::unique_ptr<IStorage> PopInteractiveDataToGame(); - std::unique_ptr<IStorage> PopInteractiveDataToApplet(); + std::shared_ptr<IStorage> PopInteractiveDataToGame(); + std::shared_ptr<IStorage> PopInteractiveDataToApplet(); - void PushNormalDataFromGame(IStorage storage); - void PushNormalDataFromApplet(IStorage storage); + void PushNormalDataFromGame(std::shared_ptr<IStorage>&& storage); + void PushNormalDataFromApplet(std::shared_ptr<IStorage>&& storage); - void PushInteractiveDataFromGame(IStorage storage); - void PushInteractiveDataFromApplet(IStorage storage); + void PushInteractiveDataFromGame(std::shared_ptr<IStorage>&& storage); + void PushInteractiveDataFromApplet(std::shared_ptr<IStorage>&& storage); void SignalStateChanged() const; @@ -94,16 +94,16 @@ private: // Queues are named from applet's perspective // PopNormalDataToApplet and PushNormalDataFromGame - std::deque<std::unique_ptr<IStorage>> in_channel; + std::deque<std::shared_ptr<IStorage>> in_channel; // PopNormalDataToGame and PushNormalDataFromApplet - std::deque<std::unique_ptr<IStorage>> out_channel; + std::deque<std::shared_ptr<IStorage>> out_channel; // PopInteractiveDataToApplet and PushInteractiveDataFromGame - std::deque<std::unique_ptr<IStorage>> in_interactive_channel; + std::deque<std::shared_ptr<IStorage>> in_interactive_channel; // PopInteractiveDataToGame and PushInteractiveDataFromApplet - std::deque<std::unique_ptr<IStorage>> out_interactive_channel; + std::deque<std::shared_ptr<IStorage>> out_interactive_channel; Kernel::EventPair state_changed_event; diff --git a/src/core/hle/service/am/applets/error.cpp b/src/core/hle/service/am/applets/error.cpp index eab0d42c9..f12fd7f89 100644 --- a/src/core/hle/service/am/applets/error.cpp +++ b/src/core/hle/service/am/applets/error.cpp @@ -186,7 +186,7 @@ void Error::Execute() { void Error::DisplayCompleted() { complete = true; - broker.PushNormalDataFromApplet(IStorage{{}}); + broker.PushNormalDataFromApplet(std::make_shared<IStorage>(std::vector<u8>{})); broker.SignalStateChanged(); } diff --git a/src/core/hle/service/am/applets/general_backend.cpp b/src/core/hle/service/am/applets/general_backend.cpp index 328438a1d..104501ac5 100644 --- a/src/core/hle/service/am/applets/general_backend.cpp +++ b/src/core/hle/service/am/applets/general_backend.cpp @@ -20,7 +20,7 @@ namespace Service::AM::Applets { constexpr ResultCode ERROR_INVALID_PIN{ErrorModule::PCTL, 221}; static void LogCurrentStorage(AppletDataBroker& broker, std::string_view prefix) { - std::unique_ptr<IStorage> storage = broker.PopNormalDataToApplet(); + std::shared_ptr<IStorage> storage = broker.PopNormalDataToApplet(); for (; storage != nullptr; storage = broker.PopNormalDataToApplet()) { const auto data = storage->GetData(); LOG_INFO(Service_AM, @@ -148,7 +148,7 @@ void Auth::AuthFinished(bool successful) { std::vector<u8> out(sizeof(Return)); std::memcpy(out.data(), &return_, sizeof(Return)); - broker.PushNormalDataFromApplet(IStorage{out}); + broker.PushNormalDataFromApplet(std::make_shared<IStorage>(std::move(out))); broker.SignalStateChanged(); } @@ -198,7 +198,7 @@ void PhotoViewer::Execute() { } void PhotoViewer::ViewFinished() { - broker.PushNormalDataFromApplet(IStorage{{}}); + 
broker.PushNormalDataFromApplet(std::make_shared<IStorage>(std::vector<u8>{})); broker.SignalStateChanged(); } @@ -234,8 +234,8 @@ void StubApplet::ExecuteInteractive() { LOG_WARNING(Service_AM, "called (STUBBED)"); LogCurrentStorage(broker, "ExecuteInteractive"); - broker.PushNormalDataFromApplet(IStorage{std::vector<u8>(0x1000)}); - broker.PushInteractiveDataFromApplet(IStorage{std::vector<u8>(0x1000)}); + broker.PushNormalDataFromApplet(std::make_shared<IStorage>(std::vector<u8>(0x1000))); + broker.PushInteractiveDataFromApplet(std::make_shared<IStorage>(std::vector<u8>(0x1000))); broker.SignalStateChanged(); } @@ -243,8 +243,8 @@ void StubApplet::Execute() { LOG_WARNING(Service_AM, "called (STUBBED)"); LogCurrentStorage(broker, "Execute"); - broker.PushNormalDataFromApplet(IStorage{std::vector<u8>(0x1000)}); - broker.PushInteractiveDataFromApplet(IStorage{std::vector<u8>(0x1000)}); + broker.PushNormalDataFromApplet(std::make_shared<IStorage>(std::vector<u8>(0x1000))); + broker.PushInteractiveDataFromApplet(std::make_shared<IStorage>(std::vector<u8>(0x1000))); broker.SignalStateChanged(); } diff --git a/src/core/hle/service/am/applets/profile_select.cpp b/src/core/hle/service/am/applets/profile_select.cpp index 3eba696ca..70cc23552 100644 --- a/src/core/hle/service/am/applets/profile_select.cpp +++ b/src/core/hle/service/am/applets/profile_select.cpp @@ -50,7 +50,7 @@ void ProfileSelect::ExecuteInteractive() { void ProfileSelect::Execute() { if (complete) { - broker.PushNormalDataFromApplet(IStorage{final_data}); + broker.PushNormalDataFromApplet(std::make_shared<IStorage>(std::move(final_data))); return; } @@ -71,7 +71,7 @@ void ProfileSelect::SelectionComplete(std::optional<Common::UUID> uuid) { final_data = std::vector<u8>(sizeof(UserSelectionOutput)); std::memcpy(final_data.data(), &output, final_data.size()); - broker.PushNormalDataFromApplet(IStorage{final_data}); + broker.PushNormalDataFromApplet(std::make_shared<IStorage>(std::move(final_data))); broker.SignalStateChanged(); } diff --git a/src/core/hle/service/am/applets/software_keyboard.cpp b/src/core/hle/service/am/applets/software_keyboard.cpp index 748559cd0..54e63c138 100644 --- a/src/core/hle/service/am/applets/software_keyboard.cpp +++ b/src/core/hle/service/am/applets/software_keyboard.cpp @@ -102,7 +102,8 @@ void SoftwareKeyboard::ExecuteInteractive() { void SoftwareKeyboard::Execute() { if (complete) { - broker.PushNormalDataFromApplet(IStorage{final_data}); + broker.PushNormalDataFromApplet(std::make_shared<IStorage>(std::move(final_data))); + broker.SignalStateChanged(); return; } @@ -119,7 +120,7 @@ void SoftwareKeyboard::WriteText(std::optional<std::u16string> text) { std::vector<u8> output_sub(SWKBD_OUTPUT_BUFFER_SIZE); if (config.utf_8) { - const u64 size = text->size() + 8; + const u64 size = text->size() + sizeof(u64); const auto new_text = Common::UTF16ToUTF8(*text); std::memcpy(output_sub.data(), &size, sizeof(u64)); @@ -130,7 +131,7 @@ void SoftwareKeyboard::WriteText(std::optional<std::u16string> text) { std::memcpy(output_main.data() + 4, new_text.data(), std::min(new_text.size(), SWKBD_OUTPUT_BUFFER_SIZE - 4)); } else { - const u64 size = text->size() * 2 + 8; + const u64 size = text->size() * 2 + sizeof(u64); std::memcpy(output_sub.data(), &size, sizeof(u64)); std::memcpy(output_sub.data() + 8, text->data(), std::min(text->size() * 2, SWKBD_OUTPUT_BUFFER_SIZE - 8)); @@ -144,15 +145,15 @@ void SoftwareKeyboard::WriteText(std::optional<std::u16string> text) { final_data = output_main; if (complete) { - 
broker.PushNormalDataFromApplet(IStorage{output_main}); + broker.PushNormalDataFromApplet(std::make_shared<IStorage>(std::move(output_main))); broker.SignalStateChanged(); } else { - broker.PushInteractiveDataFromApplet(IStorage{output_sub}); + broker.PushInteractiveDataFromApplet(std::make_shared<IStorage>(std::move(output_sub))); } } else { output_main[0] = 1; complete = true; - broker.PushNormalDataFromApplet(IStorage{output_main}); + broker.PushNormalDataFromApplet(std::make_shared<IStorage>(std::move(output_main))); broker.SignalStateChanged(); } } diff --git a/src/core/hle/service/am/applets/web_browser.cpp b/src/core/hle/service/am/applets/web_browser.cpp index 5546ef6e8..12443c910 100644 --- a/src/core/hle/service/am/applets/web_browser.cpp +++ b/src/core/hle/service/am/applets/web_browser.cpp @@ -284,7 +284,7 @@ void WebBrowser::Finalize() { std::vector<u8> data(sizeof(WebCommonReturnValue)); std::memcpy(data.data(), &out, sizeof(WebCommonReturnValue)); - broker.PushNormalDataFromApplet(IStorage{data}); + broker.PushNormalDataFromApplet(std::make_shared<IStorage>(std::move(data))); broker.SignalStateChanged(); if (!temporary_dir.empty() && FileUtil::IsDirectory(temporary_dir)) { diff --git a/src/core/hle/service/audio/hwopus.cpp b/src/core/hle/service/audio/hwopus.cpp index cb839e4a2..d19513cbb 100644 --- a/src/core/hle/service/audio/hwopus.cpp +++ b/src/core/hle/service/audio/hwopus.cpp @@ -170,8 +170,10 @@ public: {3, nullptr, "SetContextForMultiStream"}, {4, &IHardwareOpusDecoderManager::DecodeInterleavedWithPerfOld, "DecodeInterleavedWithPerfOld"}, {5, nullptr, "DecodeInterleavedForMultiStreamWithPerfOld"}, - {6, &IHardwareOpusDecoderManager::DecodeInterleaved, "DecodeInterleaved"}, - {7, nullptr, "DecodeInterleavedForMultiStream"}, + {6, &IHardwareOpusDecoderManager::DecodeInterleaved, "DecodeInterleavedWithPerfAndResetOld"}, + {7, nullptr, "DecodeInterleavedForMultiStreamWithPerfAndResetOld"}, + {8, &IHardwareOpusDecoderManager::DecodeInterleaved, "DecodeInterleaved"}, + {9, nullptr, "DecodeInterleavedForMultiStream"}, }; // clang-format on diff --git a/src/core/hle/service/bcat/backend/backend.cpp b/src/core/hle/service/bcat/backend/backend.cpp index 6f5ea095a..def3410cc 100644 --- a/src/core/hle/service/bcat/backend/backend.cpp +++ b/src/core/hle/service/bcat/backend/backend.cpp @@ -117,13 +117,13 @@ bool NullBackend::SynchronizeDirectory(TitleIDVersion title, std::string name, } bool NullBackend::Clear(u64 title_id) { - LOG_DEBUG(Service_BCAT, "called, title_id={:016X}"); + LOG_DEBUG(Service_BCAT, "called, title_id={:016X}", title_id); return true; } void NullBackend::SetPassphrase(u64 title_id, const Passphrase& passphrase) { - LOG_DEBUG(Service_BCAT, "called, title_id={:016X}, passphrase = {}", title_id, + LOG_DEBUG(Service_BCAT, "called, title_id={:016X}, passphrase={}", title_id, Common::HexToString(passphrase)); } diff --git a/src/core/hle/service/bcat/backend/boxcat.cpp b/src/core/hle/service/bcat/backend/boxcat.cpp index 67e39a5c4..f589864ee 100644 --- a/src/core/hle/service/bcat/backend/boxcat.cpp +++ b/src/core/hle/service/bcat/backend/boxcat.cpp @@ -200,7 +200,8 @@ private: DownloadResult DownloadInternal(const std::string& resolved_path, u32 timeout_seconds, const std::string& content_type_name) { if (client == nullptr) { - client = std::make_unique<httplib::SSLClient>(BOXCAT_HOSTNAME, PORT, timeout_seconds); + client = std::make_unique<httplib::SSLClient>(BOXCAT_HOSTNAME, PORT); + client->set_timeout_sec(timeout_seconds); } httplib::Headers headers{ @@ -448,8 
+449,8 @@ std::optional<std::vector<u8>> Boxcat::GetLaunchParameter(TitleIDVersion title) Boxcat::StatusResult Boxcat::GetStatus(std::optional<std::string>& global, std::map<std::string, EventStatus>& games) { - httplib::SSLClient client{BOXCAT_HOSTNAME, static_cast<int>(PORT), - static_cast<int>(TIMEOUT_SECONDS)}; + httplib::SSLClient client{BOXCAT_HOSTNAME, static_cast<int>(PORT)}; + client.set_timeout_sec(static_cast<int>(TIMEOUT_SECONDS)); httplib::Headers headers{ {std::string("Game-Assets-API-Version"), std::string(BOXCAT_API_VERSION)}, diff --git a/src/core/hle/service/filesystem/fsp_srv.cpp b/src/core/hle/service/filesystem/fsp_srv.cpp index 55d62fc5e..e6811d5b5 100644 --- a/src/core/hle/service/filesystem/fsp_srv.cpp +++ b/src/core/hle/service/filesystem/fsp_srv.cpp @@ -420,7 +420,7 @@ public: return; } - IFile file(result.Unwrap()); + auto file = std::make_shared<IFile>(result.Unwrap()); IPC::ResponseBuilder rb{ctx, 2, 0, 1}; rb.Push(RESULT_SUCCESS); @@ -445,7 +445,7 @@ public: return; } - IDirectory directory(result.Unwrap()); + auto directory = std::make_shared<IDirectory>(result.Unwrap()); IPC::ResponseBuilder rb{ctx, 2, 0, 1}; rb.Push(RESULT_SUCCESS); @@ -794,8 +794,8 @@ void FSP_SRV::OpenFileSystemWithPatch(Kernel::HLERequestContext& ctx) { void FSP_SRV::OpenSdCardFileSystem(Kernel::HLERequestContext& ctx) { LOG_DEBUG(Service_FS, "called"); - IFileSystem filesystem(fsc.OpenSDMC().Unwrap(), - SizeGetter::FromStorageId(fsc, FileSys::StorageId::SdCard)); + auto filesystem = std::make_shared<IFileSystem>( + fsc.OpenSDMC().Unwrap(), SizeGetter::FromStorageId(fsc, FileSys::StorageId::SdCard)); IPC::ResponseBuilder rb{ctx, 2, 0, 1}; rb.Push(RESULT_SUCCESS); @@ -846,7 +846,8 @@ void FSP_SRV::OpenSaveDataFileSystem(Kernel::HLERequestContext& ctx) { id = FileSys::StorageId::NandSystem; } - IFileSystem filesystem(std::move(dir.Unwrap()), SizeGetter::FromStorageId(fsc, id)); + auto filesystem = + std::make_shared<IFileSystem>(std::move(dir.Unwrap()), SizeGetter::FromStorageId(fsc, id)); IPC::ResponseBuilder rb{ctx, 2, 0, 1}; rb.Push(RESULT_SUCCESS); @@ -898,7 +899,7 @@ void FSP_SRV::OpenDataStorageByCurrentProcess(Kernel::HLERequestContext& ctx) { return; } - IStorage storage(std::move(romfs.Unwrap())); + auto storage = std::make_shared<IStorage>(std::move(romfs.Unwrap())); IPC::ResponseBuilder rb{ctx, 2, 0, 1}; rb.Push(RESULT_SUCCESS); @@ -937,7 +938,8 @@ void FSP_SRV::OpenDataStorageByDataId(Kernel::HLERequestContext& ctx) { FileSys::PatchManager pm{title_id}; - IStorage storage(pm.PatchRomFS(std::move(data.Unwrap()), 0, FileSys::ContentRecordType::Data)); + auto storage = std::make_shared<IStorage>( + pm.PatchRomFS(std::move(data.Unwrap()), 0, FileSys::ContentRecordType::Data)); IPC::ResponseBuilder rb{ctx, 2, 0, 1}; rb.Push(RESULT_SUCCESS); diff --git a/src/core/hle/service/hid/controllers/npad.cpp b/src/core/hle/service/hid/controllers/npad.cpp index 4d952adc0..15c09f04c 100644 --- a/src/core/hle/service/hid/controllers/npad.cpp +++ b/src/core/hle/service/hid/controllers/npad.cpp @@ -250,6 +250,10 @@ void Controller_NPad::RequestPadStateUpdate(u32 npad_id) { auto& rstick_entry = npad_pad_states[controller_idx].r_stick; const auto& button_state = buttons[controller_idx]; const auto& analog_state = sticks[controller_idx]; + const auto [stick_l_x_f, stick_l_y_f] = + analog_state[static_cast<std::size_t>(JoystickId::Joystick_Left)]->GetStatus(); + const auto [stick_r_x_f, stick_r_y_f] = + analog_state[static_cast<std::size_t>(JoystickId::Joystick_Right)]->GetStatus(); using namespace 
Settings::NativeButton; pad_state.a.Assign(button_state[A - BUTTON_HID_BEGIN]->GetStatus()); @@ -270,23 +274,32 @@ void Controller_NPad::RequestPadStateUpdate(u32 npad_id) { pad_state.d_right.Assign(button_state[DRight - BUTTON_HID_BEGIN]->GetStatus()); pad_state.d_down.Assign(button_state[DDown - BUTTON_HID_BEGIN]->GetStatus()); - pad_state.l_stick_left.Assign(button_state[LStick_Left - BUTTON_HID_BEGIN]->GetStatus()); - pad_state.l_stick_up.Assign(button_state[LStick_Up - BUTTON_HID_BEGIN]->GetStatus()); - pad_state.l_stick_right.Assign(button_state[LStick_Right - BUTTON_HID_BEGIN]->GetStatus()); - pad_state.l_stick_down.Assign(button_state[LStick_Down - BUTTON_HID_BEGIN]->GetStatus()); - - pad_state.r_stick_left.Assign(button_state[RStick_Left - BUTTON_HID_BEGIN]->GetStatus()); - pad_state.r_stick_up.Assign(button_state[RStick_Up - BUTTON_HID_BEGIN]->GetStatus()); - pad_state.r_stick_right.Assign(button_state[RStick_Right - BUTTON_HID_BEGIN]->GetStatus()); - pad_state.r_stick_down.Assign(button_state[RStick_Down - BUTTON_HID_BEGIN]->GetStatus()); + pad_state.l_stick_right.Assign( + analog_state[static_cast<std::size_t>(JoystickId::Joystick_Left)]->GetAnalogDirectionStatus( + Input::AnalogDirection::RIGHT)); + pad_state.l_stick_left.Assign( + analog_state[static_cast<std::size_t>(JoystickId::Joystick_Left)]->GetAnalogDirectionStatus( + Input::AnalogDirection::LEFT)); + pad_state.l_stick_up.Assign( + analog_state[static_cast<std::size_t>(JoystickId::Joystick_Left)]->GetAnalogDirectionStatus( + Input::AnalogDirection::UP)); + pad_state.l_stick_down.Assign( + analog_state[static_cast<std::size_t>(JoystickId::Joystick_Left)]->GetAnalogDirectionStatus( + Input::AnalogDirection::DOWN)); + + pad_state.r_stick_up.Assign(analog_state[static_cast<std::size_t>(JoystickId::Joystick_Right)] + ->GetAnalogDirectionStatus(Input::AnalogDirection::RIGHT)); + pad_state.r_stick_left.Assign(analog_state[static_cast<std::size_t>(JoystickId::Joystick_Right)] + ->GetAnalogDirectionStatus(Input::AnalogDirection::LEFT)); + pad_state.r_stick_right.Assign( + analog_state[static_cast<std::size_t>(JoystickId::Joystick_Right)] + ->GetAnalogDirectionStatus(Input::AnalogDirection::UP)); + pad_state.r_stick_down.Assign(analog_state[static_cast<std::size_t>(JoystickId::Joystick_Right)] + ->GetAnalogDirectionStatus(Input::AnalogDirection::DOWN)); pad_state.left_sl.Assign(button_state[SL - BUTTON_HID_BEGIN]->GetStatus()); pad_state.left_sr.Assign(button_state[SR - BUTTON_HID_BEGIN]->GetStatus()); - const auto [stick_l_x_f, stick_l_y_f] = - analog_state[static_cast<std::size_t>(JoystickId::Joystick_Left)]->GetStatus(); - const auto [stick_r_x_f, stick_r_y_f] = - analog_state[static_cast<std::size_t>(JoystickId::Joystick_Right)]->GetStatus(); lstick_entry.x = static_cast<s32>(stick_l_x_f * HID_JOYSTICK_MAX); lstick_entry.y = static_cast<s32>(stick_l_y_f * HID_JOYSTICK_MAX); rstick_entry.x = static_cast<s32>(stick_r_x_f * HID_JOYSTICK_MAX); diff --git a/src/core/hle/service/hid/hid.cpp b/src/core/hle/service/hid/hid.cpp index 89bf8b815..e6b56a9f9 100644 --- a/src/core/hle/service/hid/hid.cpp +++ b/src/core/hle/service/hid/hid.cpp @@ -10,6 +10,7 @@ #include "core/core_timing_util.h" #include "core/frontend/emu_window.h" #include "core/frontend/input.h" +#include "core/hardware_properties.h" #include "core/hle/ipc_helpers.h" #include "core/hle/kernel/client_port.h" #include "core/hle/kernel/client_session.h" @@ -37,11 +38,11 @@ namespace Service::HID { // Updating period for each HID device. 
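RequestPadStateUpdate above now derives the digital stick-direction bits from the analog devices via GetAnalogDirectionStatus instead of from separate button bindings; the SDL implementation added later in this patch uses a 0.4 dead zone. A minimal sketch of that thresholding, assuming a stick position normalized to [-1, 1]:

    enum class AnalogDirection { RIGHT, LEFT, UP, DOWN };

    // Converts an analog position into a digital direction flag using a fixed dead zone,
    // matching the 0.4 threshold of the SDL backend further down in this diff.
    constexpr bool GetAnalogDirectionStatus(float x, float y, AnalogDirection direction) {
        constexpr float directional_deadzone = 0.4f;
        switch (direction) {
        case AnalogDirection::RIGHT:
            return x > directional_deadzone;
        case AnalogDirection::LEFT:
            return x < -directional_deadzone;
        case AnalogDirection::UP:
            return y > directional_deadzone;
        case AnalogDirection::DOWN:
            return y < -directional_deadzone;
        }
        return false;
    }

    static_assert(GetAnalogDirectionStatus(0.9f, 0.0f, AnalogDirection::RIGHT));
    static_assert(!GetAnalogDirectionStatus(0.2f, 0.0f, AnalogDirection::RIGHT));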
// TODO(ogniK): Find actual polling rate of hid -constexpr s64 pad_update_ticks = static_cast<s64>(Core::Timing::BASE_CLOCK_RATE / 66); +constexpr s64 pad_update_ticks = static_cast<s64>(Core::Hardware::BASE_CLOCK_RATE / 66); [[maybe_unused]] constexpr s64 accelerometer_update_ticks = - static_cast<s64>(Core::Timing::BASE_CLOCK_RATE / 100); + static_cast<s64>(Core::Hardware::BASE_CLOCK_RATE / 100); [[maybe_unused]] constexpr s64 gyroscope_update_ticks = - static_cast<s64>(Core::Timing::BASE_CLOCK_RATE / 100); + static_cast<s64>(Core::Hardware::BASE_CLOCK_RATE / 100); constexpr std::size_t SHARED_MEMORY_SIZE = 0x40000; IAppletResource::IAppletResource(Core::System& system) diff --git a/src/core/hle/service/ldn/ldn.cpp b/src/core/hle/service/ldn/ldn.cpp index ed5059047..92adde6d4 100644 --- a/src/core/hle/service/ldn/ldn.cpp +++ b/src/core/hle/service/ldn/ldn.cpp @@ -129,12 +129,20 @@ public: {304, nullptr, "Disconnect"}, {400, nullptr, "Initialize"}, {401, nullptr, "Finalize"}, - {402, nullptr, "SetOperationMode"}, + {402, &IUserLocalCommunicationService::Initialize2, "Initialize2"}, // 7.0.0+ }; // clang-format on RegisterHandlers(functions); } + + void Initialize2(Kernel::HLERequestContext& ctx) { + LOG_WARNING(Service_LDN, "(STUBBED) called"); + // Result success seem make this services start network and continue. + // If we just pass result error then it will stop and maybe try again and again. + IPC::ResponseBuilder rb{ctx, 2}; + rb.Push(RESULT_UNKNOWN); + } }; class LDNS final : public ServiceFramework<LDNS> { diff --git a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp index 6d8bca8bb..f1966ac0e 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp @@ -44,6 +44,8 @@ u32 nvhost_gpu::ioctl(Ioctl command, const std::vector<u8>& input, const std::ve return GetWaitbase(input, output); case IoctlCommand::IocChannelSetTimeoutCommand: return ChannelSetTimeout(input, output); + case IoctlCommand::IocChannelSetTimeslice: + return ChannelSetTimeslice(input, output); default: break; } @@ -228,4 +230,14 @@ u32 nvhost_gpu::ChannelSetTimeout(const std::vector<u8>& input, std::vector<u8>& return 0; } +u32 nvhost_gpu::ChannelSetTimeslice(const std::vector<u8>& input, std::vector<u8>& output) { + IoctlSetTimeslice params{}; + std::memcpy(¶ms, input.data(), sizeof(IoctlSetTimeslice)); + LOG_INFO(Service_NVDRV, "called, timeslice=0x{:X}", params.timeslice); + + channel_timeslice = params.timeslice; + + return 0; +} + } // namespace Service::Nvidia::Devices diff --git a/src/core/hle/service/nvdrv/devices/nvhost_gpu.h b/src/core/hle/service/nvdrv/devices/nvhost_gpu.h index d056dd046..2ac74743f 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.h @@ -48,6 +48,7 @@ private: IocAllocObjCtxCommand = 0xC0104809, IocChannelGetWaitbaseCommand = 0xC0080003, IocChannelSetTimeoutCommand = 0x40044803, + IocChannelSetTimeslice = 0xC004481D, }; enum class CtxObjects : u32_le { @@ -101,6 +102,11 @@ private: static_assert(sizeof(IoctlChannelSetPriority) == 4, "IoctlChannelSetPriority is incorrect size"); + struct IoctlSetTimeslice { + u32_le timeslice; + }; + static_assert(sizeof(IoctlSetTimeslice) == 4, "IoctlSetTimeslice is incorrect size"); + struct IoctlEventIdControl { u32_le cmd; // 0=disable, 1=enable, 2=clear u32_le id; @@ -174,6 +180,7 @@ private: u64_le user_data{}; IoctlZCullBind zcull_params{}; u32_le channel_priority{}; + u32_le 
channel_timeslice{}; u32 SetNVMAPfd(const std::vector<u8>& input, std::vector<u8>& output); u32 SetClientData(const std::vector<u8>& input, std::vector<u8>& output); @@ -188,6 +195,7 @@ private: const std::vector<u8>& input2, IoctlVersion version); u32 GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output); u32 ChannelSetTimeout(const std::vector<u8>& input, std::vector<u8>& output); + u32 ChannelSetTimeslice(const std::vector<u8>& input, std::vector<u8>& output); std::shared_ptr<nvmap> nvmap_dev; u32 assigned_syncpoints{}; diff --git a/src/core/hle/service/nvflinger/nvflinger.cpp b/src/core/hle/service/nvflinger/nvflinger.cpp index 62752e419..134152210 100644 --- a/src/core/hle/service/nvflinger/nvflinger.cpp +++ b/src/core/hle/service/nvflinger/nvflinger.cpp @@ -12,6 +12,7 @@ #include "core/core.h" #include "core/core_timing.h" #include "core/core_timing_util.h" +#include "core/hardware_properties.h" #include "core/hle/kernel/kernel.h" #include "core/hle/kernel/readable_event.h" #include "core/hle/service/nvdrv/devices/nvdisp_disp0.h" @@ -26,8 +27,8 @@ namespace Service::NVFlinger { -constexpr s64 frame_ticks = static_cast<s64>(Core::Timing::BASE_CLOCK_RATE / 60); -constexpr s64 frame_ticks_30fps = static_cast<s64>(Core::Timing::BASE_CLOCK_RATE / 30); +constexpr s64 frame_ticks = static_cast<s64>(Core::Hardware::BASE_CLOCK_RATE / 60); +constexpr s64 frame_ticks_30fps = static_cast<s64>(Core::Hardware::BASE_CLOCK_RATE / 30); NVFlinger::NVFlinger(Core::System& system) : system(system) { displays.emplace_back(0, "Default", system); @@ -222,7 +223,7 @@ void NVFlinger::Compose() { s64 NVFlinger::GetNextTicks() const { constexpr s64 max_hertz = 120LL; - return (Core::Timing::BASE_CLOCK_RATE * (1LL << swap_interval)) / max_hertz; + return (Core::Hardware::BASE_CLOCK_RATE * (1LL << swap_interval)) / max_hertz; } } // namespace Service::NVFlinger diff --git a/src/core/hle/service/prepo/prepo.cpp b/src/core/hle/service/prepo/prepo.cpp index 5eb26caf8..8f1be0e48 100644 --- a/src/core/hle/service/prepo/prepo.cpp +++ b/src/core/hle/service/prepo/prepo.cpp @@ -50,16 +50,16 @@ private: IPC::RequestParser rp{ctx}; const auto process_id = rp.PopRaw<u64>(); - const auto data1 = ctx.ReadBuffer(0); - const auto data2 = ctx.ReadBuffer(1); + std::vector<std::vector<u8>> data{ctx.ReadBuffer(0)}; + if (Type == Core::Reporter::PlayReportType::New) { + data.emplace_back(ctx.ReadBuffer(1)); + } - LOG_DEBUG(Service_PREPO, - "called, type={:02X}, process_id={:016X}, data1_size={:016X}, data2_size={:016X}", - static_cast<u8>(Type), process_id, data1.size(), data2.size()); + LOG_DEBUG(Service_PREPO, "called, type={:02X}, process_id={:016X}, data1_size={:016X}", + static_cast<u8>(Type), process_id, data[0].size()); const auto& reporter{system.GetReporter()}; - reporter.SavePlayReport(Type, system.CurrentProcess()->GetTitleID(), {data1, data2}, - process_id); + reporter.SavePlayReport(Type, system.CurrentProcess()->GetTitleID(), data, process_id); IPC::ResponseBuilder rb{ctx, 2}; rb.Push(RESULT_SUCCESS); @@ -70,19 +70,19 @@ private: IPC::RequestParser rp{ctx}; const auto user_id = rp.PopRaw<u128>(); const auto process_id = rp.PopRaw<u64>(); - - const auto data1 = ctx.ReadBuffer(0); - const auto data2 = ctx.ReadBuffer(1); + std::vector<std::vector<u8>> data{ctx.ReadBuffer(0)}; + if (Type == Core::Reporter::PlayReportType::New) { + data.emplace_back(ctx.ReadBuffer(1)); + } LOG_DEBUG( Service_PREPO, - "called, type={:02X}, user_id={:016X}{:016X}, process_id={:016X}, data1_size={:016X}, " - "data2_size={:016X}", 
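The new IocChannelSetTimeslice value introduced above (0xC004481D) appears to follow the standard Linux _IOC ioctl encoding: an 8-bit command number, an 8-bit type (0x48, ASCII 'H', here), a 14-bit payload size and two direction bits. A small decoding sketch, assuming that layout:

    #include <cstdint>

    // asm-generic _IOC layout: nr[7:0], type[15:8], size[29:16], dir[31:30].
    constexpr std::uint32_t IocNr(std::uint32_t cmd)   { return cmd & 0xFF; }
    constexpr std::uint32_t IocType(std::uint32_t cmd) { return (cmd >> 8) & 0xFF; }
    constexpr std::uint32_t IocSize(std::uint32_t cmd) { return (cmd >> 16) & 0x3FFF; }
    constexpr std::uint32_t IocDir(std::uint32_t cmd)  { return cmd >> 30; }

    constexpr std::uint32_t IocChannelSetTimeslice = 0xC004481D;

    static_assert(IocDir(IocChannelSetTimeslice) == 3);      // _IOC_READ | _IOC_WRITE
    static_assert(IocType(IocChannelSetTimeslice) == 0x48);  // 'H'
    static_assert(IocNr(IocChannelSetTimeslice) == 0x1D);
    static_assert(IocSize(IocChannelSetTimeslice) == 4);     // sizeof(IoctlSetTimeslice)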
- static_cast<u8>(Type), user_id[1], user_id[0], process_id, data1.size(), data2.size()); + "called, type={:02X}, user_id={:016X}{:016X}, process_id={:016X}, data1_size={:016X}", + static_cast<u8>(Type), user_id[1], user_id[0], process_id, data[0].size()); const auto& reporter{system.GetReporter()}; - reporter.SavePlayReport(Type, system.CurrentProcess()->GetTitleID(), {data1, data2}, - process_id, user_id); + reporter.SavePlayReport(Type, system.CurrentProcess()->GetTitleID(), data, process_id, + user_id); IPC::ResponseBuilder rb{ctx, 2}; rb.Push(RESULT_SUCCESS); diff --git a/src/core/hle/service/sockets/bsd.cpp b/src/core/hle/service/sockets/bsd.cpp index 884ad173b..f67fab2f9 100644 --- a/src/core/hle/service/sockets/bsd.cpp +++ b/src/core/hle/service/sockets/bsd.cpp @@ -42,6 +42,26 @@ void BSD::Socket(Kernel::HLERequestContext& ctx) { rb.Push<u32>(0); // bsd errno } +void BSD::Select(Kernel::HLERequestContext& ctx) { + LOG_WARNING(Service, "(STUBBED) called"); + + IPC::ResponseBuilder rb{ctx, 4}; + + rb.Push(RESULT_SUCCESS); + rb.Push<u32>(0); // ret + rb.Push<u32>(0); // bsd errno +} + +void BSD::Bind(Kernel::HLERequestContext& ctx) { + LOG_WARNING(Service, "(STUBBED) called"); + + IPC::ResponseBuilder rb{ctx, 4}; + + rb.Push(RESULT_SUCCESS); + rb.Push<u32>(0); // ret + rb.Push<u32>(0); // bsd errno +} + void BSD::Connect(Kernel::HLERequestContext& ctx) { LOG_WARNING(Service, "(STUBBED) called"); @@ -52,6 +72,26 @@ void BSD::Connect(Kernel::HLERequestContext& ctx) { rb.Push<u32>(0); // bsd errno } +void BSD::Listen(Kernel::HLERequestContext& ctx) { + LOG_WARNING(Service, "(STUBBED) called"); + + IPC::ResponseBuilder rb{ctx, 4}; + + rb.Push(RESULT_SUCCESS); + rb.Push<u32>(0); // ret + rb.Push<u32>(0); // bsd errno +} + +void BSD::SetSockOpt(Kernel::HLERequestContext& ctx) { + LOG_WARNING(Service, "(STUBBED) called"); + + IPC::ResponseBuilder rb{ctx, 4}; + + rb.Push(RESULT_SUCCESS); + rb.Push<u32>(0); // ret + rb.Push<u32>(0); // bsd errno +} + void BSD::SendTo(Kernel::HLERequestContext& ctx) { LOG_WARNING(Service, "(STUBBED) called"); @@ -80,7 +120,7 @@ BSD::BSD(const char* name) : ServiceFramework(name) { {2, &BSD::Socket, "Socket"}, {3, nullptr, "SocketExempt"}, {4, nullptr, "Open"}, - {5, nullptr, "Select"}, + {5, &BSD::Select, "Select"}, {6, nullptr, "Poll"}, {7, nullptr, "Sysctl"}, {8, nullptr, "Recv"}, @@ -88,15 +128,15 @@ BSD::BSD(const char* name) : ServiceFramework(name) { {10, nullptr, "Send"}, {11, &BSD::SendTo, "SendTo"}, {12, nullptr, "Accept"}, - {13, nullptr, "Bind"}, + {13, &BSD::Bind, "Bind"}, {14, &BSD::Connect, "Connect"}, {15, nullptr, "GetPeerName"}, {16, nullptr, "GetSockName"}, {17, nullptr, "GetSockOpt"}, - {18, nullptr, "Listen"}, + {18, &BSD::Listen, "Listen"}, {19, nullptr, "Ioctl"}, {20, nullptr, "Fcntl"}, - {21, nullptr, "SetSockOpt"}, + {21, &BSD::SetSockOpt, "SetSockOpt"}, {22, nullptr, "Shutdown"}, {23, nullptr, "ShutdownAllSockets"}, {24, nullptr, "Write"}, diff --git a/src/core/hle/service/sockets/bsd.h b/src/core/hle/service/sockets/bsd.h index 0fe0e65c6..3098e3baf 100644 --- a/src/core/hle/service/sockets/bsd.h +++ b/src/core/hle/service/sockets/bsd.h @@ -18,7 +18,11 @@ private: void RegisterClient(Kernel::HLERequestContext& ctx); void StartMonitoring(Kernel::HLERequestContext& ctx); void Socket(Kernel::HLERequestContext& ctx); + void Select(Kernel::HLERequestContext& ctx); + void Bind(Kernel::HLERequestContext& ctx); void Connect(Kernel::HLERequestContext& ctx); + void Listen(Kernel::HLERequestContext& ctx); + void 
SetSockOpt(Kernel::HLERequestContext& ctx); void SendTo(Kernel::HLERequestContext& ctx); void Close(Kernel::HLERequestContext& ctx); diff --git a/src/core/hle/service/time/standard_steady_clock_core.cpp b/src/core/hle/service/time/standard_steady_clock_core.cpp index ca1a783fc..1575f0b49 100644 --- a/src/core/hle/service/time/standard_steady_clock_core.cpp +++ b/src/core/hle/service/time/standard_steady_clock_core.cpp @@ -5,6 +5,7 @@ #include "core/core.h" #include "core/core_timing.h" #include "core/core_timing_util.h" +#include "core/hardware_properties.h" #include "core/hle/service/time/standard_steady_clock_core.h" namespace Service::Time::Clock { @@ -12,7 +13,7 @@ namespace Service::Time::Clock { TimeSpanType StandardSteadyClockCore::GetCurrentRawTimePoint(Core::System& system) { const TimeSpanType ticks_time_span{TimeSpanType::FromTicks( Core::Timing::CpuCyclesToClockCycles(system.CoreTiming().GetTicks()), - Core::Timing::CNTFREQ)}; + Core::Hardware::CNTFREQ)}; TimeSpanType raw_time_point{setup_value.nanoseconds + ticks_time_span.nanoseconds}; if (raw_time_point.nanoseconds < cached_raw_time_point.nanoseconds) { diff --git a/src/core/hle/service/time/tick_based_steady_clock_core.cpp b/src/core/hle/service/time/tick_based_steady_clock_core.cpp index c77b98189..44d5bc651 100644 --- a/src/core/hle/service/time/tick_based_steady_clock_core.cpp +++ b/src/core/hle/service/time/tick_based_steady_clock_core.cpp @@ -5,6 +5,7 @@ #include "core/core.h" #include "core/core_timing.h" #include "core/core_timing_util.h" +#include "core/hardware_properties.h" #include "core/hle/service/time/tick_based_steady_clock_core.h" namespace Service::Time::Clock { @@ -12,7 +13,7 @@ namespace Service::Time::Clock { SteadyClockTimePoint TickBasedSteadyClockCore::GetTimePoint(Core::System& system) { const TimeSpanType ticks_time_span{TimeSpanType::FromTicks( Core::Timing::CpuCyclesToClockCycles(system.CoreTiming().GetTicks()), - Core::Timing::CNTFREQ)}; + Core::Hardware::CNTFREQ)}; return {ticks_time_span.ToSeconds(), GetClockSourceId()}; } diff --git a/src/core/hle/service/time/time.cpp b/src/core/hle/service/time/time.cpp index 8ef4efcef..749b7be70 100644 --- a/src/core/hle/service/time/time.cpp +++ b/src/core/hle/service/time/time.cpp @@ -6,6 +6,7 @@ #include "core/core.h" #include "core/core_timing.h" #include "core/core_timing_util.h" +#include "core/hardware_properties.h" #include "core/hle/ipc_helpers.h" #include "core/hle/kernel/client_port.h" #include "core/hle/kernel/client_session.h" @@ -233,7 +234,7 @@ void Module::Interface::CalculateMonotonicSystemClockBaseTimePoint(Kernel::HLERe if (current_time_point.clock_source_id == context.steady_time_point.clock_source_id) { const auto ticks{Clock::TimeSpanType::FromTicks( Core::Timing::CpuCyclesToClockCycles(system.CoreTiming().GetTicks()), - Core::Timing::CNTFREQ)}; + Core::Hardware::CNTFREQ)}; const s64 base_time_point{context.offset + current_time_point.time_point - ticks.ToSeconds()}; IPC::ResponseBuilder rb{ctx, (sizeof(s64) / 4) + 2}; diff --git a/src/core/hle/service/time/time_sharedmemory.cpp b/src/core/hle/service/time/time_sharedmemory.cpp index 9b03191bf..fdaef233f 100644 --- a/src/core/hle/service/time/time_sharedmemory.cpp +++ b/src/core/hle/service/time/time_sharedmemory.cpp @@ -5,6 +5,7 @@ #include "core/core.h" #include "core/core_timing.h" #include "core/core_timing_util.h" +#include "core/hardware_properties.h" #include "core/hle/service/time/clock_types.h" #include "core/hle/service/time/steady_clock_core.h" #include 
"core/hle/service/time/time_sharedmemory.h" @@ -31,7 +32,7 @@ void SharedMemory::SetupStandardSteadyClock(Core::System& system, Clock::TimeSpanType current_time_point) { const Clock::TimeSpanType ticks_time_span{Clock::TimeSpanType::FromTicks( Core::Timing::CpuCyclesToClockCycles(system.CoreTiming().GetTicks()), - Core::Timing::CNTFREQ)}; + Core::Hardware::CNTFREQ)}; const Clock::SteadyClockContext context{ static_cast<u64>(current_time_point.nanoseconds - ticks_time_span.nanoseconds), clock_source_id}; diff --git a/src/core/loader/nso.cpp b/src/core/loader/nso.cpp index 515c5accb..044067a5b 100644 --- a/src/core/loader/nso.cpp +++ b/src/core/loader/nso.cpp @@ -97,7 +97,8 @@ std::optional<VAddr> AppLoader_NSO::LoadModule(Kernel::Process& process, if (nso_header.IsSegmentCompressed(i)) { data = DecompressSegment(data, nso_header.segments[i]); } - program_image.resize(nso_header.segments[i].location + data.size()); + program_image.resize(nso_header.segments[i].location + + PageAlignSize(static_cast<u32>(data.size()))); std::memcpy(program_image.data() + nso_header.segments[i].location, data.data(), data.size()); codeset.segments[i].addr = nso_header.segments[i].location; @@ -105,8 +106,12 @@ std::optional<VAddr> AppLoader_NSO::LoadModule(Kernel::Process& process, codeset.segments[i].size = PageAlignSize(static_cast<u32>(data.size())); } - if (should_pass_arguments && !Settings::values.program_args.empty()) { - const auto arg_data = Settings::values.program_args; + if (should_pass_arguments) { + std::vector<u8> arg_data{Settings::values.program_args.begin(), + Settings::values.program_args.end()}; + if (arg_data.empty()) { + arg_data.resize(NSO_ARGUMENT_DEFAULT_SIZE); + } codeset.DataSegment().size += NSO_ARGUMENT_DATA_ALLOCATION_SIZE; NSOArgumentHeader args_header{ NSO_ARGUMENT_DATA_ALLOCATION_SIZE, static_cast<u32_le>(arg_data.size()), {}}; diff --git a/src/core/loader/nso.h b/src/core/loader/nso.h index 58cbe162d..d2d600cd9 100644 --- a/src/core/loader/nso.h +++ b/src/core/loader/nso.h @@ -56,6 +56,8 @@ static_assert(sizeof(NSOHeader) == 0x100, "NSOHeader has incorrect size."); static_assert(std::is_trivially_copyable_v<NSOHeader>, "NSOHeader must be trivially copyable."); constexpr u64 NSO_ARGUMENT_DATA_ALLOCATION_SIZE = 0x9000; +// NOTE: Official software default argument state is unverified. 
+constexpr u64 NSO_ARGUMENT_DEFAULT_SIZE = 1; struct NSOArgumentHeader { u32_le allocated_size; diff --git a/src/core/memory/cheat_engine.cpp b/src/core/memory/cheat_engine.cpp index d1e6bed93..4472500d2 100644 --- a/src/core/memory/cheat_engine.cpp +++ b/src/core/memory/cheat_engine.cpp @@ -9,6 +9,7 @@ #include "core/core.h" #include "core/core_timing.h" #include "core/core_timing_util.h" +#include "core/hardware_properties.h" #include "core/hle/kernel/process.h" #include "core/hle/service/hid/controllers/npad.h" #include "core/hle/service/hid/hid.h" @@ -17,7 +18,7 @@ namespace Memory { -constexpr s64 CHEAT_ENGINE_TICKS = static_cast<s64>(Core::Timing::BASE_CLOCK_RATE / 12); +constexpr s64 CHEAT_ENGINE_TICKS = static_cast<s64>(Core::Hardware::BASE_CLOCK_RATE / 12); constexpr u32 KEYPAD_BITMASK = 0x3FFFFFF; StandardVmCallbacks::StandardVmCallbacks(Core::System& system, const CheatProcessMetadata& metadata) diff --git a/src/core/settings.h b/src/core/settings.h index 9c98a9287..f837d3fbc 100644 --- a/src/core/settings.h +++ b/src/core/settings.h @@ -371,6 +371,11 @@ enum class SDMCSize : u64 { S1TB = 0x10000000000ULL, }; +enum class RendererBackend { + OpenGL = 0, + Vulkan = 1, +}; + struct Values { // System bool use_docked_mode; @@ -401,6 +406,9 @@ struct Values { std::string motion_device; TouchscreenInput touchscreen; std::atomic_bool is_device_reload_pending{true}; + std::string udp_input_address; + u16 udp_input_port; + u8 udp_pad_index; // Core bool use_multi_core; @@ -416,7 +424,12 @@ struct Values { SDMCSize sdmc_size; // Renderer + RendererBackend renderer_backend; + bool renderer_debug; + int vulkan_device; + float resolution_factor; + int aspect_ratio; bool use_frame_limit; u16 frame_limit; bool use_disk_shader_cache; diff --git a/src/core/telemetry_session.cpp b/src/core/telemetry_session.cpp index 320e8ad73..0e72d31cd 100644 --- a/src/core/telemetry_session.cpp +++ b/src/core/telemetry_session.cpp @@ -46,6 +46,16 @@ static u64 GenerateTelemetryId() { return telemetry_id; } +static const char* TranslateRenderer(Settings::RendererBackend backend) { + switch (backend) { + case Settings::RendererBackend::OpenGL: + return "OpenGL"; + case Settings::RendererBackend::Vulkan: + return "Vulkan"; + } + return "Unknown"; +} + u64 GetTelemetryId() { u64 telemetry_id{}; const std::string filename{FileUtil::GetUserPath(FileUtil::UserPath::ConfigDir) + @@ -169,7 +179,7 @@ void TelemetrySession::AddInitialInfo(Loader::AppLoader& app_loader) { AddField(field_type, "Audio_SinkId", Settings::values.sink_id); AddField(field_type, "Audio_EnableAudioStretching", Settings::values.enable_audio_stretching); AddField(field_type, "Core_UseMultiCore", Settings::values.use_multi_core); - AddField(field_type, "Renderer_Backend", "OpenGL"); + AddField(field_type, "Renderer_Backend", TranslateRenderer(Settings::values.renderer_backend)); AddField(field_type, "Renderer_ResolutionFactor", Settings::values.resolution_factor); AddField(field_type, "Renderer_UseFrameLimit", Settings::values.use_frame_limit); AddField(field_type, "Renderer_FrameLimit", Settings::values.frame_limit); diff --git a/src/core/tools/freezer.cpp b/src/core/tools/freezer.cpp index 55e0dbc49..1e060f009 100644 --- a/src/core/tools/freezer.cpp +++ b/src/core/tools/freezer.cpp @@ -7,13 +7,14 @@ #include "core/core.h" #include "core/core_timing.h" #include "core/core_timing_util.h" +#include "core/hardware_properties.h" #include "core/memory.h" #include "core/tools/freezer.h" namespace Tools { namespace { -constexpr s64 MEMORY_FREEZER_TICKS = 
static_cast<s64>(Core::Timing::BASE_CLOCK_RATE / 60); +constexpr s64 MEMORY_FREEZER_TICKS = static_cast<s64>(Core::Hardware::BASE_CLOCK_RATE / 60); u64 MemoryReadWidth(Memory::Memory& memory, u32 width, VAddr addr) { switch (width) { diff --git a/src/input_common/CMakeLists.txt b/src/input_common/CMakeLists.txt index 5b4e032bd..2520ba321 100644 --- a/src/input_common/CMakeLists.txt +++ b/src/input_common/CMakeLists.txt @@ -9,6 +9,12 @@ add_library(input_common STATIC motion_emu.h sdl/sdl.cpp sdl/sdl.h + udp/client.cpp + udp/client.h + udp/protocol.cpp + udp/protocol.h + udp/udp.cpp + udp/udp.h ) if(SDL2_FOUND) @@ -21,4 +27,4 @@ if(SDL2_FOUND) endif() create_target_directory_groups(input_common) -target_link_libraries(input_common PUBLIC core PRIVATE common) +target_link_libraries(input_common PUBLIC core PRIVATE common ${Boost_LIBRARIES}) diff --git a/src/input_common/main.cpp b/src/input_common/main.cpp index 8e66c1b15..c98c848cf 100644 --- a/src/input_common/main.cpp +++ b/src/input_common/main.cpp @@ -9,6 +9,7 @@ #include "input_common/keyboard.h" #include "input_common/main.h" #include "input_common/motion_emu.h" +#include "input_common/udp/udp.h" #ifdef HAVE_SDL2 #include "input_common/sdl/sdl.h" #endif @@ -18,6 +19,7 @@ namespace InputCommon { static std::shared_ptr<Keyboard> keyboard; static std::shared_ptr<MotionEmu> motion_emu; static std::unique_ptr<SDL::State> sdl; +static std::unique_ptr<CemuhookUDP::State> udp; void Init() { keyboard = std::make_shared<Keyboard>(); @@ -28,6 +30,8 @@ void Init() { Input::RegisterFactory<Input::MotionDevice>("motion_emu", motion_emu); sdl = SDL::Init(); + + udp = CemuhookUDP::Init(); } void Shutdown() { @@ -37,6 +41,7 @@ void Shutdown() { Input::UnregisterFactory<Input::MotionDevice>("motion_emu"); motion_emu.reset(); sdl.reset(); + udp.reset(); } Keyboard* GetKeyboard() { @@ -72,11 +77,13 @@ std::string GenerateAnalogParamFromKeys(int key_up, int key_down, int key_left, namespace Polling { std::vector<std::unique_ptr<DevicePoller>> GetPollers(DeviceType type) { + std::vector<std::unique_ptr<DevicePoller>> pollers; + #ifdef HAVE_SDL2 - return sdl->GetPollers(type); -#else - return {}; + pollers = sdl->GetPollers(type); #endif + + return pollers; } } // namespace Polling diff --git a/src/input_common/sdl/sdl_impl.cpp b/src/input_common/sdl/sdl_impl.cpp index d2e9d278f..a2e0c0bd2 100644 --- a/src/input_common/sdl/sdl_impl.cpp +++ b/src/input_common/sdl/sdl_impl.cpp @@ -342,6 +342,22 @@ public: return std::make_tuple<float, float>(0.0f, 0.0f); } + bool GetAnalogDirectionStatus(Input::AnalogDirection direction) const override { + const auto [x, y] = GetStatus(); + const float directional_deadzone = 0.4f; + switch (direction) { + case Input::AnalogDirection::RIGHT: + return x > directional_deadzone; + case Input::AnalogDirection::LEFT: + return x < -directional_deadzone; + case Input::AnalogDirection::UP: + return y > directional_deadzone; + case Input::AnalogDirection::DOWN: + return y < -directional_deadzone; + } + return false; + } + private: std::shared_ptr<SDLJoystick> joystick; const int axis_x; diff --git a/src/input_common/udp/client.cpp b/src/input_common/udp/client.cpp new file mode 100644 index 000000000..2228571a6 --- /dev/null +++ b/src/input_common/udp/client.cpp @@ -0,0 +1,286 @@ +// Copyright 2018 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
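input_common now builds and registers the Cemuhook UDP backend, and settings.h (earlier in this diff) gains udp_input_address, udp_input_port and udp_pad_index for it. Purely as a hypothetical illustration (this wiring is not part of the patch, and DeviceStatus is assumed to live in the same namespace as Client), a frontend could construct the Client defined further below from those settings:

    #include <memory>
    #include <utility>

    #include "core/settings.h"
    #include "input_common/udp/client.h"

    // Hypothetical helper, not in this patch: ties the new Settings fields to the
    // CemuhookUDP::Client constructor that appears further down in this diff.
    std::unique_ptr<InputCommon::CemuhookUDP::Client> MakeUdpClient(
        std::shared_ptr<InputCommon::CemuhookUDP::DeviceStatus> status) {
        return std::make_unique<InputCommon::CemuhookUDP::Client>(
            std::move(status), Settings::values.udp_input_address,
            Settings::values.udp_input_port, Settings::values.udp_pad_index,
            /*client_id=*/0); // placeholder id
    }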
+ +#include <algorithm> +#include <array> +#include <chrono> +#include <cstring> +#include <functional> +#include <thread> +#include <boost/asio.hpp> +#include <boost/bind.hpp> +#include "common/logging/log.h" +#include "input_common/udp/client.h" +#include "input_common/udp/protocol.h" + +using boost::asio::ip::udp; + +namespace InputCommon::CemuhookUDP { + +struct SocketCallback { + std::function<void(Response::Version)> version; + std::function<void(Response::PortInfo)> port_info; + std::function<void(Response::PadData)> pad_data; +}; + +class Socket { +public: + using clock = std::chrono::system_clock; + + explicit Socket(const std::string& host, u16 port, u8 pad_index, u32 client_id, + SocketCallback callback) + : callback(std::move(callback)), timer(io_service), + socket(io_service, udp::endpoint(udp::v4(), 0)), client_id(client_id), + pad_index(pad_index), + send_endpoint(udp::endpoint(boost::asio::ip::make_address_v4(host), port)) {} + + void Stop() { + io_service.stop(); + } + + void Loop() { + io_service.run(); + } + + void StartSend(const clock::time_point& from) { + timer.expires_at(from + std::chrono::seconds(3)); + timer.async_wait([this](const boost::system::error_code& error) { HandleSend(error); }); + } + + void StartReceive() { + socket.async_receive_from( + boost::asio::buffer(receive_buffer), receive_endpoint, + [this](const boost::system::error_code& error, std::size_t bytes_transferred) { + HandleReceive(error, bytes_transferred); + }); + } + +private: + void HandleReceive(const boost::system::error_code& error, std::size_t bytes_transferred) { + if (auto type = Response::Validate(receive_buffer.data(), bytes_transferred)) { + switch (*type) { + case Type::Version: { + Response::Version version; + std::memcpy(&version, &receive_buffer[sizeof(Header)], sizeof(Response::Version)); + callback.version(std::move(version)); + break; + } + case Type::PortInfo: { + Response::PortInfo port_info; + std::memcpy(&port_info, &receive_buffer[sizeof(Header)], + sizeof(Response::PortInfo)); + callback.port_info(std::move(port_info)); + break; + } + case Type::PadData: { + Response::PadData pad_data; + std::memcpy(&pad_data, &receive_buffer[sizeof(Header)], sizeof(Response::PadData)); + callback.pad_data(std::move(pad_data)); + break; + } + } + } + StartReceive(); + } + + void HandleSend(const boost::system::error_code& error) { + // Send a request for getting port info for the pad + Request::PortInfo port_info{1, {pad_index, 0, 0, 0}}; + const auto port_message = Request::Create(port_info, client_id); + std::memcpy(&send_buffer1, &port_message, PORT_INFO_SIZE); + socket.send_to(boost::asio::buffer(send_buffer1), send_endpoint); + + // Send a request for getting pad data for the pad + Request::PadData pad_data{Request::PadData::Flags::Id, pad_index, EMPTY_MAC_ADDRESS}; + const auto pad_message = Request::Create(pad_data, client_id); + std::memcpy(send_buffer2.data(), &pad_message, PAD_DATA_SIZE); + socket.send_to(boost::asio::buffer(send_buffer2), send_endpoint); + StartSend(timer.expiry()); + } + + SocketCallback callback; + boost::asio::io_service io_service; + boost::asio::basic_waitable_timer<clock> timer; + udp::socket socket; + + u32 client_id{}; + u8 pad_index{}; + + static constexpr std::size_t PORT_INFO_SIZE = sizeof(Message<Request::PortInfo>); + static constexpr std::size_t PAD_DATA_SIZE = sizeof(Message<Request::PadData>); + std::array<u8, PORT_INFO_SIZE> send_buffer1; + std::array<u8, PAD_DATA_SIZE> send_buffer2; + udp::endpoint send_endpoint; + + std::array<u8, 
MAX_PACKET_SIZE> receive_buffer; + udp::endpoint receive_endpoint; +}; + +static void SocketLoop(Socket* socket) { + socket->StartReceive(); + socket->StartSend(Socket::clock::now()); + socket->Loop(); +} + +Client::Client(std::shared_ptr<DeviceStatus> status, const std::string& host, u16 port, + u8 pad_index, u32 client_id) + : status(std::move(status)) { + StartCommunication(host, port, pad_index, client_id); +} + +Client::~Client() { + socket->Stop(); + thread.join(); +} + +void Client::ReloadSocket(const std::string& host, u16 port, u8 pad_index, u32 client_id) { + socket->Stop(); + thread.join(); + StartCommunication(host, port, pad_index, client_id); +} + +void Client::OnVersion(Response::Version data) { + LOG_TRACE(Input, "Version packet received: {}", data.version); +} + +void Client::OnPortInfo(Response::PortInfo data) { + LOG_TRACE(Input, "PortInfo packet received: {}", data.model); +} + +void Client::OnPadData(Response::PadData data) { + LOG_TRACE(Input, "PadData packet received"); + if (data.packet_counter <= packet_sequence) { + LOG_WARNING( + Input, + "PadData packet dropped because its stale info. Current count: {} Packet count: {}", + packet_sequence, data.packet_counter); + return; + } + packet_sequence = data.packet_counter; + // TODO: Check how the Switch handles motions and how the CemuhookUDP motion + // directions correspond to the ones of the Switch + Common::Vec3f accel = Common::MakeVec<float>(data.accel.x, data.accel.y, data.accel.z); + Common::Vec3f gyro = Common::MakeVec<float>(data.gyro.pitch, data.gyro.yaw, data.gyro.roll); + { + std::lock_guard guard(status->update_mutex); + + status->motion_status = {accel, gyro}; + + // TODO: add a setting for "click" touch. Click touch refers to a device that differentiates + // between a simple "tap" and a hard press that causes the touch screen to click. 
+ const bool is_active = data.touch_1.is_active != 0; + + float x = 0; + float y = 0; + + if (is_active && status->touch_calibration) { + const u16 min_x = status->touch_calibration->min_x; + const u16 max_x = status->touch_calibration->max_x; + const u16 min_y = status->touch_calibration->min_y; + const u16 max_y = status->touch_calibration->max_y; + + x = (std::clamp(static_cast<u16>(data.touch_1.x), min_x, max_x) - min_x) / + static_cast<float>(max_x - min_x); + y = (std::clamp(static_cast<u16>(data.touch_1.y), min_y, max_y) - min_y) / + static_cast<float>(max_y - min_y); + } + + status->touch_status = {x, y, is_active}; + } +} + +void Client::StartCommunication(const std::string& host, u16 port, u8 pad_index, u32 client_id) { + SocketCallback callback{[this](Response::Version version) { OnVersion(version); }, + [this](Response::PortInfo info) { OnPortInfo(info); }, + [this](Response::PadData data) { OnPadData(data); }}; + LOG_INFO(Input, "Starting communication with UDP input server on {}:{}", host, port); + socket = std::make_unique<Socket>(host, port, pad_index, client_id, callback); + thread = std::thread{SocketLoop, this->socket.get()}; +} + +void TestCommunication(const std::string& host, u16 port, u8 pad_index, u32 client_id, + std::function<void()> success_callback, + std::function<void()> failure_callback) { + std::thread([=] { + Common::Event success_event; + SocketCallback callback{[](Response::Version version) {}, [](Response::PortInfo info) {}, + [&](Response::PadData data) { success_event.Set(); }}; + Socket socket{host, port, pad_index, client_id, std::move(callback)}; + std::thread worker_thread{SocketLoop, &socket}; + bool result = success_event.WaitFor(std::chrono::seconds(8)); + socket.Stop(); + worker_thread.join(); + if (result) { + success_callback(); + } else { + failure_callback(); + } + }) + .detach(); +} + +CalibrationConfigurationJob::CalibrationConfigurationJob( + const std::string& host, u16 port, u8 pad_index, u32 client_id, + std::function<void(Status)> status_callback, + std::function<void(u16, u16, u16, u16)> data_callback) { + + std::thread([=] { + constexpr u16 CALIBRATION_THRESHOLD = 100; + + u16 min_x{UINT16_MAX}; + u16 min_y{UINT16_MAX}; + u16 max_x{}; + u16 max_y{}; + + Status current_status{Status::Initialized}; + SocketCallback callback{[](Response::Version version) {}, [](Response::PortInfo info) {}, + [&](Response::PadData data) { + if (current_status == Status::Initialized) { + // Receiving data means the communication is ready now + current_status = Status::Ready; + status_callback(current_status); + } + if (!data.touch_1.is_active) { + return; + } + LOG_DEBUG(Input, "Current touch: {} {}", data.touch_1.x, + data.touch_1.y); + min_x = std::min(min_x, static_cast<u16>(data.touch_1.x)); + min_y = std::min(min_y, static_cast<u16>(data.touch_1.y)); + if (current_status == Status::Ready) { + // First touch - min data (min_x/min_y) + current_status = Status::Stage1Completed; + status_callback(current_status); + } + if (data.touch_1.x - min_x > CALIBRATION_THRESHOLD && + data.touch_1.y - min_y > CALIBRATION_THRESHOLD) { + // Set the current position as max value and finishes + // configuration + max_x = data.touch_1.x; + max_y = data.touch_1.y; + current_status = Status::Completed; + data_callback(min_x, min_y, max_x, max_y); + status_callback(current_status); + + complete_event.Set(); + } + }}; + Socket socket{host, port, pad_index, client_id, std::move(callback)}; + std::thread worker_thread{SocketLoop, &socket}; + complete_event.Wait(); + 
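// Worked example of the calibration callback above (the raw values are illustrative):
// the first active touch, say at (120, 80), seeds min_x/min_y and moves the job to
// Stage1Completed; once a later touch exceeds both minima by more than
// CALIBRATION_THRESHOLD (100), for example (1700, 900), that point becomes max_x/max_y,
// data_callback(120, 80, 1700, 900) fires and the job reports Completed.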
socket.Stop(); + worker_thread.join(); + }) + .detach(); +} + +CalibrationConfigurationJob::~CalibrationConfigurationJob() { + Stop(); +} + +void CalibrationConfigurationJob::Stop() { + complete_event.Set(); +} + +} // namespace InputCommon::CemuhookUDP diff --git a/src/input_common/udp/client.h b/src/input_common/udp/client.h new file mode 100644 index 000000000..b8c654755 --- /dev/null +++ b/src/input_common/udp/client.h @@ -0,0 +1,95 @@ +// Copyright 2018 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <functional> +#include <memory> +#include <mutex> +#include <optional> +#include <string> +#include <thread> +#include <tuple> +#include "common/common_types.h" +#include "common/thread.h" +#include "common/vector_math.h" + +namespace InputCommon::CemuhookUDP { + +constexpr u16 DEFAULT_PORT = 26760; +constexpr char DEFAULT_ADDR[] = "127.0.0.1"; + +class Socket; + +namespace Response { +struct PadData; +struct PortInfo; +struct Version; +} // namespace Response + +struct DeviceStatus { + std::mutex update_mutex; + std::tuple<Common::Vec3<float>, Common::Vec3<float>> motion_status; + std::tuple<float, float, bool> touch_status; + + // calibration data for scaling the device's touch area to 3ds + struct CalibrationData { + u16 min_x{}; + u16 min_y{}; + u16 max_x{}; + u16 max_y{}; + }; + std::optional<CalibrationData> touch_calibration; +}; + +class Client { +public: + explicit Client(std::shared_ptr<DeviceStatus> status, const std::string& host = DEFAULT_ADDR, + u16 port = DEFAULT_PORT, u8 pad_index = 0, u32 client_id = 24872); + ~Client(); + void ReloadSocket(const std::string& host = "127.0.0.1", u16 port = 26760, u8 pad_index = 0, + u32 client_id = 24872); + +private: + void OnVersion(Response::Version); + void OnPortInfo(Response::PortInfo); + void OnPadData(Response::PadData); + void StartCommunication(const std::string& host, u16 port, u8 pad_index, u32 client_id); + + std::unique_ptr<Socket> socket; + std::shared_ptr<DeviceStatus> status; + std::thread thread; + u64 packet_sequence = 0; +}; + +/// An async job allowing configuration of the touchpad calibration. +class CalibrationConfigurationJob { +public: + enum class Status { + Initialized, + Ready, + Stage1Completed, + Completed, + }; + /** + * Constructs and starts the job with the specified parameter. + * + * @param status_callback Callback for job status updates + * @param data_callback Called when calibration data is ready + */ + explicit CalibrationConfigurationJob(const std::string& host, u16 port, u8 pad_index, + u32 client_id, std::function<void(Status)> status_callback, + std::function<void(u16, u16, u16, u16)> data_callback); + ~CalibrationConfigurationJob(); + void Stop(); + +private: + Common::Event complete_event; +}; + +void TestCommunication(const std::string& host, u16 port, u8 pad_index, u32 client_id, + std::function<void()> success_callback, + std::function<void()> failure_callback); + +} // namespace InputCommon::CemuhookUDP diff --git a/src/input_common/udp/protocol.cpp b/src/input_common/udp/protocol.cpp new file mode 100644 index 000000000..a982ac49d --- /dev/null +++ b/src/input_common/udp/protocol.cpp @@ -0,0 +1,79 @@ +// Copyright 2018 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
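// A minimal sketch of how a frontend might consume the Client/DeviceStatus API declared
// in client.h above; the one-second wait and the ReadTouchOnce wrapper are illustrative
// assumptions, not something defined by the code above.
#include <chrono>
#include <memory>
#include <thread>
#include "input_common/udp/client.h"

void ReadTouchOnce() {
    using namespace InputCommon::CemuhookUDP;
    auto status = std::make_shared<DeviceStatus>();
    // Uses the defaults above: a cemuhook-compatible server on 127.0.0.1:26760, pad 0.
    Client client{status};
    std::this_thread::sleep_for(std::chrono::seconds(1));
    {
        // The client's worker thread writes into the shared DeviceStatus, so lock update_mutex.
        std::lock_guard guard{status->update_mutex};
        const auto [x, y, pressed] = status->touch_status;
        // x and y are normalized to [0, 1] only when touch_calibration has been filled in.
    }
} // ~Client() stops the io_service and joins the worker thread.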
+ +#include <cstddef> +#include <cstring> +#include "common/logging/log.h" +#include "input_common/udp/protocol.h" + +namespace InputCommon::CemuhookUDP { + +static constexpr std::size_t GetSizeOfResponseType(Type t) { + switch (t) { + case Type::Version: + return sizeof(Response::Version); + case Type::PortInfo: + return sizeof(Response::PortInfo); + case Type::PadData: + return sizeof(Response::PadData); + } + return 0; +} + +namespace Response { + +/** + * Returns Type if the packet is valid, else none + * + * Note: Modifies the buffer to zero out the crc (since thats the easiest way to check without + * copying the buffer) + */ +std::optional<Type> Validate(u8* data, std::size_t size) { + if (size < sizeof(Header)) { + LOG_DEBUG(Input, "Invalid UDP packet received"); + return std::nullopt; + } + Header header{}; + std::memcpy(&header, data, sizeof(Header)); + if (header.magic != SERVER_MAGIC) { + LOG_ERROR(Input, "UDP Packet has an unexpected magic value"); + return std::nullopt; + } + if (header.protocol_version != PROTOCOL_VERSION) { + LOG_ERROR(Input, "UDP Packet protocol mismatch"); + return std::nullopt; + } + if (header.type < Type::Version || header.type > Type::PadData) { + LOG_ERROR(Input, "UDP Packet is an unknown type"); + return std::nullopt; + } + + // Packet size must equal sizeof(Header) + sizeof(Data) + // and also verify that the packet info mentions the correct size. Since the spec includes the + // type of the packet as part of the data, we need to include it in size calculations here + // ie: payload_length == sizeof(T) + sizeof(Type) + const std::size_t data_len = GetSizeOfResponseType(header.type); + if (header.payload_length != data_len + sizeof(Type) || size < data_len + sizeof(Header)) { + LOG_ERROR( + Input, + "UDP Packet payload length doesn't match. Received: {} PayloadLength: {} Expected: {}", + size, header.payload_length, data_len + sizeof(Type)); + return std::nullopt; + } + + const u32 crc32 = header.crc; + boost::crc_32_type result; + // zero out the crc in the buffer and then run the crc against it + std::memset(&data[offsetof(Header, crc)], 0, sizeof(u32_le)); + + result.process_bytes(data, data_len + sizeof(Header)); + if (crc32 != result.checksum()) { + LOG_ERROR(Input, "UDP Packet CRC check failed. Offset: {}", offsetof(Header, crc)); + return std::nullopt; + } + return header.type; +} +} // namespace Response + +} // namespace InputCommon::CemuhookUDP diff --git a/src/input_common/udp/protocol.h b/src/input_common/udp/protocol.h new file mode 100644 index 000000000..3ba4d1fc8 --- /dev/null +++ b/src/input_common/udp/protocol.h @@ -0,0 +1,255 @@ +// Copyright 2018 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
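// A worked note on the CRC handling in Response::Validate() above: the crc field sits at
// byte offset 8 in Header (after the 4-byte magic and the two u16 fields), so the receiver
// zeroes bytes 8..11 of the packet and re-runs boost::crc_32_type over the first
// sizeof(Header) + data_len bytes; Request::Create() below builds its checksum the same
// way, with header.crc still zero when process_bytes() runs.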
+ +#pragma once + +#include <array> +#include <optional> +#include <type_traits> +#include <boost/crc.hpp> +#include "common/bit_field.h" +#include "common/swap.h" + +namespace InputCommon::CemuhookUDP { + +constexpr std::size_t MAX_PACKET_SIZE = 100; +constexpr u16 PROTOCOL_VERSION = 1001; +constexpr u32 CLIENT_MAGIC = 0x43555344; // DSUC (but flipped for LE) +constexpr u32 SERVER_MAGIC = 0x53555344; // DSUS (but flipped for LE) + +enum class Type : u32 { + Version = 0x00100000, + PortInfo = 0x00100001, + PadData = 0x00100002, +}; + +struct Header { + u32_le magic{}; + u16_le protocol_version{}; + u16_le payload_length{}; + u32_le crc{}; + u32_le id{}; + ///> In the protocol, the type of the packet is not part of the header, but its convenient to + ///> include in the header so the callee doesn't have to duplicate the type twice when building + ///> the data + Type type{}; +}; +static_assert(sizeof(Header) == 20, "UDP Message Header struct has wrong size"); +static_assert(std::is_trivially_copyable_v<Header>, "UDP Message Header is not trivially copyable"); + +using MacAddress = std::array<u8, 6>; +constexpr MacAddress EMPTY_MAC_ADDRESS = {0, 0, 0, 0, 0, 0}; + +#pragma pack(push, 1) +template <typename T> +struct Message { + Header header{}; + T data; +}; +#pragma pack(pop) + +template <typename T> +constexpr Type GetMessageType(); + +namespace Request { + +struct Version {}; +/** + * Requests the server to send information about what controllers are plugged into the ports + * In citra's case, we only have one controller, so for simplicity's sake, we can just send a + * request explicitly for the first controller port and leave it at that. In the future it would be + * nice to make this configurable + */ +constexpr u32 MAX_PORTS = 4; +struct PortInfo { + u32_le pad_count{}; ///> Number of ports to request data for + std::array<u8, MAX_PORTS> port; +}; +static_assert(std::is_trivially_copyable_v<PortInfo>, + "UDP Request PortInfo is not trivially copyable"); + +/** + * Request the latest pad information from the server. If the server hasn't received this message + * from the client in a reasonable time frame, the server will stop sending updates. The default + * timeout seems to be 5 seconds. + */ +struct PadData { + enum class Flags : u8 { + AllPorts, + Id, + Mac, + }; + /// Determines which method will be used as a look up for the controller + Flags flags{}; + /// Index of the port of the controller to retrieve data about + u8 port_id{}; + /// Mac address of the controller to retrieve data about + MacAddress mac; +}; +static_assert(sizeof(PadData) == 8, "UDP Request PadData struct has wrong size"); +static_assert(std::is_trivially_copyable_v<PadData>, + "UDP Request PadData is not trivially copyable"); + +/** + * Creates a message with the proper header data that can be sent to the server. 
+ * @param T data Request body to send + * @param client_id ID of the udp client (usually not checked on the server) + */ +template <typename T> +Message<T> Create(const T data, const u32 client_id = 0) { + boost::crc_32_type crc; + Header header{ + CLIENT_MAGIC, PROTOCOL_VERSION, sizeof(T) + sizeof(Type), 0, client_id, GetMessageType<T>(), + }; + Message<T> message{header, data}; + crc.process_bytes(&message, sizeof(Message<T>)); + message.header.crc = crc.checksum(); + return message; +} +} // namespace Request + +namespace Response { + +struct Version { + u16_le version{}; +}; +static_assert(sizeof(Version) == 2, "UDP Response Version struct has wrong size"); +static_assert(std::is_trivially_copyable_v<Version>, + "UDP Response Version is not trivially copyable"); + +struct PortInfo { + u8 id{}; + u8 state{}; + u8 model{}; + u8 connection_type{}; + MacAddress mac; + u8 battery{}; + u8 is_pad_active{}; +}; +static_assert(sizeof(PortInfo) == 12, "UDP Response PortInfo struct has wrong size"); +static_assert(std::is_trivially_copyable_v<PortInfo>, + "UDP Response PortInfo is not trivially copyable"); + +#pragma pack(push, 1) +struct PadData { + PortInfo info{}; + u32_le packet_counter{}; + + u16_le digital_button{}; + // The following union isn't trivially copyable but we don't use this input anyway. + // union DigitalButton { + // u16_le button; + // BitField<0, 1, u16> button_1; // Share + // BitField<1, 1, u16> button_2; // L3 + // BitField<2, 1, u16> button_3; // R3 + // BitField<3, 1, u16> button_4; // Options + // BitField<4, 1, u16> button_5; // Up + // BitField<5, 1, u16> button_6; // Right + // BitField<6, 1, u16> button_7; // Down + // BitField<7, 1, u16> button_8; // Left + // BitField<8, 1, u16> button_9; // L2 + // BitField<9, 1, u16> button_10; // R2 + // BitField<10, 1, u16> button_11; // L1 + // BitField<11, 1, u16> button_12; // R1 + // BitField<12, 1, u16> button_13; // Triangle + // BitField<13, 1, u16> button_14; // Circle + // BitField<14, 1, u16> button_15; // Cross + // BitField<15, 1, u16> button_16; // Square + // } digital_button; + + u8 home; + /// If the device supports a "click" on the touchpad, this will change to 1 when a click happens + u8 touch_hard_press{}; + u8 left_stick_x{}; + u8 left_stick_y{}; + u8 right_stick_x{}; + u8 right_stick_y{}; + + struct AnalogButton { + u8 button_8{}; + u8 button_7{}; + u8 button_6{}; + u8 button_5{}; + u8 button_12{}; + u8 button_11{}; + u8 button_10{}; + u8 button_9{}; + u8 button_16{}; + u8 button_15{}; + u8 button_14{}; + u8 button_13{}; + } analog_button; + + struct TouchPad { + u8 is_active{}; + u8 id{}; + u16_le x{}; + u16_le y{}; + } touch_1, touch_2; + + u64_le motion_timestamp; + + struct Accelerometer { + float x{}; + float y{}; + float z{}; + } accel; + + struct Gyroscope { + float pitch{}; + float yaw{}; + float roll{}; + } gyro; +}; +#pragma pack(pop) + +static_assert(sizeof(PadData) == 80, "UDP Response PadData struct has wrong size "); +static_assert(std::is_trivially_copyable_v<PadData>, + "UDP Response PadData is not trivially copyable"); + +static_assert(sizeof(Message<PadData>) == MAX_PACKET_SIZE, + "UDP MAX_PACKET_SIZE is no longer larger than Message<PadData>"); + +static_assert(sizeof(PadData::AnalogButton) == 12, + "UDP Response AnalogButton struct has wrong size "); +static_assert(sizeof(PadData::TouchPad) == 6, "UDP Response TouchPad struct has wrong size "); +static_assert(sizeof(PadData::Accelerometer) == 12, + "UDP Response Accelerometer struct has wrong size "); 
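// Spelled out, the framing arithmetic behind the asserts around here: a full PadData
// response occupies sizeof(Header) + sizeof(PadData) = 20 + 80 = 100 bytes, exactly
// MAX_PACKET_SIZE, and its header advertises payload_length = sizeof(PadData) +
// sizeof(Type) = 80 + 4 = 84, which is the value Response::Validate() checks in
// protocol.cpp.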
+static_assert(sizeof(PadData::Gyroscope) == 12, "UDP Response Gyroscope struct has wrong size "); + +/** + * Create a Response Message from the data + * @param data array of bytes sent from the server + * @return boost::none if it failed to parse or Type if it succeeded. The client can then safely + * copy the data into the appropriate struct for that Type + */ +std::optional<Type> Validate(u8* data, std::size_t size); + +} // namespace Response + +template <> +constexpr Type GetMessageType<Request::Version>() { + return Type::Version; +} +template <> +constexpr Type GetMessageType<Request::PortInfo>() { + return Type::PortInfo; +} +template <> +constexpr Type GetMessageType<Request::PadData>() { + return Type::PadData; +} +template <> +constexpr Type GetMessageType<Response::Version>() { + return Type::Version; +} +template <> +constexpr Type GetMessageType<Response::PortInfo>() { + return Type::PortInfo; +} +template <> +constexpr Type GetMessageType<Response::PadData>() { + return Type::PadData; +} +} // namespace InputCommon::CemuhookUDP diff --git a/src/input_common/udp/udp.cpp b/src/input_common/udp/udp.cpp new file mode 100644 index 000000000..ca99cc22f --- /dev/null +++ b/src/input_common/udp/udp.cpp @@ -0,0 +1,98 @@ +// Copyright 2018 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <mutex> +#include <tuple> + +#include "common/param_package.h" +#include "core/frontend/input.h" +#include "core/settings.h" +#include "input_common/udp/client.h" +#include "input_common/udp/udp.h" + +namespace InputCommon::CemuhookUDP { + +class UDPTouchDevice final : public Input::TouchDevice { +public: + explicit UDPTouchDevice(std::shared_ptr<DeviceStatus> status_) : status(std::move(status_)) {} + std::tuple<float, float, bool> GetStatus() const override { + std::lock_guard guard(status->update_mutex); + return status->touch_status; + } + +private: + std::shared_ptr<DeviceStatus> status; +}; + +class UDPMotionDevice final : public Input::MotionDevice { +public: + explicit UDPMotionDevice(std::shared_ptr<DeviceStatus> status_) : status(std::move(status_)) {} + std::tuple<Common::Vec3<float>, Common::Vec3<float>> GetStatus() const override { + std::lock_guard guard(status->update_mutex); + return status->motion_status; + } + +private: + std::shared_ptr<DeviceStatus> status; +}; + +class UDPTouchFactory final : public Input::Factory<Input::TouchDevice> { +public: + explicit UDPTouchFactory(std::shared_ptr<DeviceStatus> status_) : status(std::move(status_)) {} + + std::unique_ptr<Input::TouchDevice> Create(const Common::ParamPackage& params) override { + { + std::lock_guard guard(status->update_mutex); + status->touch_calibration.emplace(); + // These default values work well for DS4 but probably not other touch inputs + status->touch_calibration->min_x = params.Get("min_x", 100); + status->touch_calibration->min_y = params.Get("min_y", 50); + status->touch_calibration->max_x = params.Get("max_x", 1800); + status->touch_calibration->max_y = params.Get("max_y", 850); + } + return std::make_unique<UDPTouchDevice>(status); + } + +private: + std::shared_ptr<DeviceStatus> status; +}; + +class UDPMotionFactory final : public Input::Factory<Input::MotionDevice> { +public: + explicit UDPMotionFactory(std::shared_ptr<DeviceStatus> status_) : status(std::move(status_)) {} + + std::unique_ptr<Input::MotionDevice> Create(const Common::ParamPackage& params) override { + return std::make_unique<UDPMotionDevice>(status); + } + +private: + 
std::shared_ptr<DeviceStatus> status; +}; + +State::State() { + auto status = std::make_shared<DeviceStatus>(); + client = + std::make_unique<Client>(status, Settings::values.udp_input_address, + Settings::values.udp_input_port, Settings::values.udp_pad_index); + + Input::RegisterFactory<Input::TouchDevice>("cemuhookudp", + std::make_shared<UDPTouchFactory>(status)); + Input::RegisterFactory<Input::MotionDevice>("cemuhookudp", + std::make_shared<UDPMotionFactory>(status)); +} + +State::~State() { + Input::UnregisterFactory<Input::TouchDevice>("cemuhookudp"); + Input::UnregisterFactory<Input::MotionDevice>("cemuhookudp"); +} + +void State::ReloadUDPClient() { + client->ReloadSocket(Settings::values.udp_input_address, Settings::values.udp_input_port, + Settings::values.udp_pad_index); +} + +std::unique_ptr<State> Init() { + return std::make_unique<State>(); +} +} // namespace InputCommon::CemuhookUDP diff --git a/src/input_common/udp/udp.h b/src/input_common/udp/udp.h new file mode 100644 index 000000000..4f83f0441 --- /dev/null +++ b/src/input_common/udp/udp.h @@ -0,0 +1,25 @@ +// Copyright 2018 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <memory> + +namespace InputCommon::CemuhookUDP { + +class Client; + +class State { +public: + State(); + ~State(); + void ReloadUDPClient(); + +private: + std::unique_ptr<Client> client; +}; + +std::unique_ptr<State> Init(); + +} // namespace InputCommon::CemuhookUDP diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index ccfed4f2e..4b0c6346f 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -29,12 +29,15 @@ add_library(video_core STATIC gpu_synch.h gpu_thread.cpp gpu_thread.h + guest_driver.cpp + guest_driver.h macro_interpreter.cpp macro_interpreter.h memory_manager.cpp memory_manager.h morton.cpp morton.h + query_cache.h rasterizer_accelerated.cpp rasterizer_accelerated.h rasterizer_cache.cpp @@ -72,6 +75,8 @@ add_library(video_core STATIC renderer_opengl/gl_stream_buffer.h renderer_opengl/gl_texture_cache.cpp renderer_opengl/gl_texture_cache.h + renderer_opengl/gl_query_cache.cpp + renderer_opengl/gl_query_cache.h renderer_opengl/maxwell_to_gl.h renderer_opengl/renderer_opengl.cpp renderer_opengl/renderer_opengl.h @@ -154,6 +159,7 @@ if (ENABLE_VULKAN) renderer_vulkan/maxwell_to_vk.cpp renderer_vulkan/maxwell_to_vk.h renderer_vulkan/renderer_vulkan.h + renderer_vulkan/renderer_vulkan.cpp renderer_vulkan/vk_blit_screen.cpp renderer_vulkan/vk_blit_screen.h renderer_vulkan/vk_buffer_cache.cpp @@ -174,6 +180,8 @@ if (ENABLE_VULKAN) renderer_vulkan/vk_memory_manager.h renderer_vulkan/vk_pipeline_cache.cpp renderer_vulkan/vk_pipeline_cache.h + renderer_vulkan/vk_query_cache.cpp + renderer_vulkan/vk_query_cache.h renderer_vulkan/vk_rasterizer.cpp renderer_vulkan/vk_rasterizer.h renderer_vulkan/vk_renderpass_cache.cpp diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 0510ed777..186aca61d 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -101,7 +101,10 @@ public: void TickFrame() { ++epoch; while (!pending_destruction.empty()) { - if (pending_destruction.front()->GetEpoch() + 1 > epoch) { + // Delay at least 4 frames before destruction. + // This is due to triple buffering happening on some drivers. 
+ static constexpr u64 epochs_to_destroy = 5; + if (pending_destruction.front()->GetEpoch() + epochs_to_destroy > epoch) { break; } pending_destruction.pop_front(); diff --git a/src/video_core/engines/const_buffer_engine_interface.h b/src/video_core/engines/const_buffer_engine_interface.h index 44b8b8d22..d56a47710 100644 --- a/src/video_core/engines/const_buffer_engine_interface.h +++ b/src/video_core/engines/const_buffer_engine_interface.h @@ -9,6 +9,7 @@ #include "common/common_types.h" #include "video_core/engines/shader_bytecode.h" #include "video_core/engines/shader_type.h" +#include "video_core/guest_driver.h" #include "video_core/textures/texture.h" namespace Tegra::Engines { @@ -106,6 +107,9 @@ public: virtual SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer, u64 offset) const = 0; virtual u32 GetBoundBuffer() const = 0; + + virtual VideoCore::GuestDriverProfile& AccessGuestDriverProfile() = 0; + virtual const VideoCore::GuestDriverProfile& AccessGuestDriverProfile() const = 0; }; } // namespace Tegra::Engines diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp index 110406f2f..4b824aa4e 100644 --- a/src/video_core/engines/kepler_compute.cpp +++ b/src/video_core/engines/kepler_compute.cpp @@ -94,6 +94,14 @@ SamplerDescriptor KeplerCompute::AccessBindlessSampler(ShaderType stage, u64 con return result; } +VideoCore::GuestDriverProfile& KeplerCompute::AccessGuestDriverProfile() { + return rasterizer.AccessGuestDriverProfile(); +} + +const VideoCore::GuestDriverProfile& KeplerCompute::AccessGuestDriverProfile() const { + return rasterizer.AccessGuestDriverProfile(); +} + void KeplerCompute::ProcessLaunch() { const GPUVAddr launch_desc_loc = regs.launch_desc_loc.Address(); memory_manager.ReadBlockUnsafe(launch_desc_loc, &launch_description, diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h index 4ef3e0613..eeb79c56f 100644 --- a/src/video_core/engines/kepler_compute.h +++ b/src/video_core/engines/kepler_compute.h @@ -218,6 +218,10 @@ public: return regs.tex_cb_index; } + VideoCore::GuestDriverProfile& AccessGuestDriverProfile() override; + + const VideoCore::GuestDriverProfile& AccessGuestDriverProfile() const override; + private: Core::System& system; VideoCore::RasterizerInterface& rasterizer; diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 58dfa8033..b28de1092 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -4,17 +4,21 @@ #include <cinttypes> #include <cstring> +#include <optional> #include "common/assert.h" #include "core/core.h" #include "core/core_timing.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/engines/shader_type.h" +#include "video_core/gpu.h" #include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" #include "video_core/textures/texture.h" namespace Tegra::Engines { +using VideoCore::QueryType; + /// First register id that is actually a Macro call. 
constexpr u32 MacroRegistersStart = 0xE00; @@ -399,6 +403,10 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { ProcessQueryCondition(); break; } + case MAXWELL3D_REG_INDEX(counter_reset): { + ProcessCounterReset(); + break; + } case MAXWELL3D_REG_INDEX(sync_info): { ProcessSyncPoint(); break; @@ -481,7 +489,7 @@ void Maxwell3D::FlushMMEInlineDraw() { const bool is_indexed = mme_draw.current_mode == MMEDrawMode::Indexed; if (ShouldExecute()) { - rasterizer.DrawMultiBatch(is_indexed); + rasterizer.Draw(is_indexed, true); } // TODO(bunnei): Below, we reset vertex count so that we can use these registers to determine if @@ -519,61 +527,51 @@ void Maxwell3D::ProcessFirmwareCall4() { regs.reg_array[0xd00] = 1; } -void Maxwell3D::ProcessQueryGet() { +void Maxwell3D::StampQueryResult(u64 payload, bool long_query) { + struct LongQueryResult { + u64_le value; + u64_le timestamp; + }; + static_assert(sizeof(LongQueryResult) == 16, "LongQueryResult has wrong size"); const GPUVAddr sequence_address{regs.query.QueryAddress()}; - // Since the sequence address is given as a GPU VAddr, we have to convert it to an application - // VAddr before writing. + if (long_query) { + // Write the 128-bit result structure in long mode. Note: We emulate an infinitely fast + // GPU, this command may actually take a while to complete in real hardware due to GPU + // wait queues. + LongQueryResult query_result{payload, system.GPU().GetTicks()}; + memory_manager.WriteBlock(sequence_address, &query_result, sizeof(query_result)); + } else { + memory_manager.Write<u32>(sequence_address, static_cast<u32>(payload)); + } +} +void Maxwell3D::ProcessQueryGet() { // TODO(Subv): Support the other query units. ASSERT_MSG(regs.query.query_get.unit == Regs::QueryUnit::Crop, "Units other than CROP are unimplemented"); - u64 result = 0; - - // TODO(Subv): Support the other query variables - switch (regs.query.query_get.select) { - case Regs::QuerySelect::Zero: - // This seems to actually write the query sequence to the query address. - result = regs.query.query_sequence; + switch (regs.query.query_get.operation) { + case Regs::QueryOperation::Release: + StampQueryResult(regs.query.query_sequence, regs.query.query_get.short_query == 0); break; - default: - result = 1; - UNIMPLEMENTED_MSG("Unimplemented query select type {}", - static_cast<u32>(regs.query.query_get.select.Value())); - } - - // TODO(Subv): Research and implement how query sync conditions work. - - struct LongQueryResult { - u64_le value; - u64_le timestamp; - }; - static_assert(sizeof(LongQueryResult) == 16, "LongQueryResult has wrong size"); - - switch (regs.query.query_get.mode) { - case Regs::QueryMode::Write: - case Regs::QueryMode::Write2: { - u32 sequence = regs.query.query_sequence; - if (regs.query.query_get.short_query) { - // Write the current query sequence to the sequence address. - // TODO(Subv): Find out what happens if you use a long query type but mark it as a short - // query. - memory_manager.Write<u32>(sequence_address, sequence); - } else { - // Write the 128-bit result structure in long mode. Note: We emulate an infinitely fast - // GPU, this command may actually take a while to complete in real hardware due to GPU - // wait queues. 
- LongQueryResult query_result{}; - query_result.value = result; - // TODO(Subv): Generate a real GPU timestamp and write it here instead of CoreTiming - query_result.timestamp = system.CoreTiming().GetTicks(); - memory_manager.WriteBlock(sequence_address, &query_result, sizeof(query_result)); + case Regs::QueryOperation::Acquire: + // TODO(Blinkhawk): Under this operation, the GPU waits for the CPU to write a value that + // matches the current payload. + UNIMPLEMENTED_MSG("Unimplemented query operation ACQUIRE"); + break; + case Regs::QueryOperation::Counter: + if (const std::optional<u64> result = GetQueryResult()) { + // If the query returns an empty optional it means it's cached and deferred. + // In this case we have a non-empty result, so we stamp it immediately. + StampQueryResult(*result, regs.query.query_get.short_query == 0); } break; - } + case Regs::QueryOperation::Trap: + UNIMPLEMENTED_MSG("Unimplemented query operation TRAP"); + break; default: - UNIMPLEMENTED_MSG("Query mode {} not implemented", - static_cast<u32>(regs.query.query_get.mode.Value())); + UNIMPLEMENTED_MSG("Unknown query operation"); + break; } } @@ -590,20 +588,20 @@ void Maxwell3D::ProcessQueryCondition() { } case Regs::ConditionMode::ResNonZero: { Regs::QueryCompare cmp; - memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp)); + memory_manager.ReadBlock(condition_address, &cmp, sizeof(cmp)); execute_on = cmp.initial_sequence != 0U && cmp.initial_mode != 0U; break; } case Regs::ConditionMode::Equal: { Regs::QueryCompare cmp; - memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp)); + memory_manager.ReadBlock(condition_address, &cmp, sizeof(cmp)); execute_on = cmp.initial_sequence == cmp.current_sequence && cmp.initial_mode == cmp.current_mode; break; } case Regs::ConditionMode::NotEqual: { Regs::QueryCompare cmp; - memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp)); + memory_manager.ReadBlock(condition_address, &cmp, sizeof(cmp)); execute_on = cmp.initial_sequence != cmp.current_sequence || cmp.initial_mode != cmp.current_mode; break; @@ -616,6 +614,18 @@ void Maxwell3D::ProcessQueryCondition() { } } +void Maxwell3D::ProcessCounterReset() { + switch (regs.counter_reset) { + case Regs::CounterReset::SampleCnt: + rasterizer.ResetCounter(QueryType::SamplesPassed); + break; + default: + LOG_WARNING(Render_OpenGL, "Unimplemented counter reset={}", + static_cast<int>(regs.counter_reset)); + break; + } +} + void Maxwell3D::ProcessSyncPoint() { const u32 sync_point = regs.sync_info.sync_point.Value(); const u32 increment = regs.sync_info.increment.Value(); @@ -644,7 +654,7 @@ void Maxwell3D::DrawArrays() { const bool is_indexed{regs.index_array.count && !regs.vertex_buffer.count}; if (ShouldExecute()) { - rasterizer.DrawBatch(is_indexed); + rasterizer.Draw(is_indexed, false); } // TODO(bunnei): Below, we reset vertex count so that we can use these registers to determine if @@ -658,6 +668,22 @@ void Maxwell3D::DrawArrays() { } } +std::optional<u64> Maxwell3D::GetQueryResult() { + switch (regs.query.query_get.select) { + case Regs::QuerySelect::Zero: + return 0; + case Regs::QuerySelect::SamplesPassed: + // Deferred. 
+ rasterizer.Query(regs.query.QueryAddress(), VideoCore::QueryType::SamplesPassed, + system.GPU().GetTicks()); + return {}; + default: + UNIMPLEMENTED_MSG("Unimplemented query select type {}", + static_cast<u32>(regs.query.query_get.select.Value())); + return 1; + } +} + void Maxwell3D::ProcessCBBind(std::size_t stage_index) { // Bind the buffer currently in CB_ADDRESS to the specified index in the desired shader stage. auto& shader = state.shader_stages[stage_index]; @@ -784,4 +810,12 @@ SamplerDescriptor Maxwell3D::AccessBindlessSampler(ShaderType stage, u64 const_b return result; } +VideoCore::GuestDriverProfile& Maxwell3D::AccessGuestDriverProfile() { + return rasterizer.AccessGuestDriverProfile(); +} + +const VideoCore::GuestDriverProfile& Maxwell3D::AccessGuestDriverProfile() const { + return rasterizer.AccessGuestDriverProfile(); +} + } // namespace Tegra::Engines diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index ee79260fc..26939be3f 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -6,6 +6,7 @@ #include <array> #include <bitset> +#include <optional> #include <type_traits> #include <unordered_map> #include <vector> @@ -71,12 +72,11 @@ public: static constexpr std::size_t MaxConstBuffers = 18; static constexpr std::size_t MaxConstBufferSize = 0x10000; - enum class QueryMode : u32 { - Write = 0, - Sync = 1, - // TODO(Subv): It is currently unknown what the difference between method 2 and method 0 - // is. - Write2 = 2, + enum class QueryOperation : u32 { + Release = 0, + Acquire = 1, + Counter = 2, + Trap = 3, }; enum class QueryUnit : u32 { @@ -410,6 +410,27 @@ public: Linear = 1, }; + enum class CounterReset : u32 { + SampleCnt = 0x01, + Unk02 = 0x02, + Unk03 = 0x03, + Unk04 = 0x04, + EmittedPrimitives = 0x10, // Not tested + Unk11 = 0x11, + Unk12 = 0x12, + Unk13 = 0x13, + Unk15 = 0x15, + Unk16 = 0x16, + Unk17 = 0x17, + Unk18 = 0x18, + Unk1A = 0x1A, + Unk1B = 0x1B, + Unk1C = 0x1C, + Unk1D = 0x1D, + Unk1E = 0x1E, + GeneratedPrimitives = 0x1F, + }; + struct Cull { enum class FrontFace : u32 { ClockWise = 0x0900, @@ -704,8 +725,8 @@ public: INSERT_UNION_PADDING_WORDS(0x15); s32 stencil_back_func_ref; - u32 stencil_back_func_mask; u32 stencil_back_mask; + u32 stencil_back_func_mask; INSERT_UNION_PADDING_WORDS(0xC); @@ -858,11 +879,19 @@ public: BitField<7, 1, u32> c7; } clip_distance_enabled; - INSERT_UNION_PADDING_WORDS(0x1); + u32 samplecnt_enable; float point_size; - INSERT_UNION_PADDING_WORDS(0x7); + INSERT_UNION_PADDING_WORDS(0x1); + + u32 point_sprite_enable; + + INSERT_UNION_PADDING_WORDS(0x3); + + CounterReset counter_reset; + + INSERT_UNION_PADDING_WORDS(0x1); u32 zeta_enable; @@ -1077,7 +1106,7 @@ public: u32 query_sequence; union { u32 raw; - BitField<0, 2, QueryMode> mode; + BitField<0, 2, QueryOperation> operation; BitField<4, 1, u32> fence; BitField<12, 4, QueryUnit> unit; BitField<16, 1, QuerySyncCondition> sync_cond; @@ -1306,6 +1335,10 @@ public: return regs.tex_cb_index; } + VideoCore::GuestDriverProfile& AccessGuestDriverProfile() override; + + const VideoCore::GuestDriverProfile& AccessGuestDriverProfile() const override; + /// Memory for macro code - it's undetermined how big this is, however 1MB is much larger than /// we've seen used. using MacroMemory = std::array<u32, 0x40000>; @@ -1405,9 +1438,15 @@ private: /// Handles a write to the QUERY_GET register. void ProcessQueryGet(); - // Handles Conditional Rendering + /// Writes the query result accordingly. 
+ void StampQueryResult(u64 payload, bool long_query); + + /// Handles conditional rendering. void ProcessQueryCondition(); + /// Handles counter resets. + void ProcessCounterReset(); + /// Handles writes to syncing register. void ProcessSyncPoint(); @@ -1424,6 +1463,9 @@ private: // Handles a instance drawcall from MME void StepInstance(MMEDrawMode expected_mode, u32 count); + + /// Returns a query's value or an empty object if the value will be deferred through a cache. + std::optional<u64> GetQueryResult(); }; #define ASSERT_REG_POSITION(field_name, position) \ @@ -1454,8 +1496,8 @@ ASSERT_REG_POSITION(polygon_offset_fill_enable, 0x372); ASSERT_REG_POSITION(patch_vertices, 0x373); ASSERT_REG_POSITION(scissor_test, 0x380); ASSERT_REG_POSITION(stencil_back_func_ref, 0x3D5); -ASSERT_REG_POSITION(stencil_back_func_mask, 0x3D6); -ASSERT_REG_POSITION(stencil_back_mask, 0x3D7); +ASSERT_REG_POSITION(stencil_back_mask, 0x3D6); +ASSERT_REG_POSITION(stencil_back_func_mask, 0x3D7); ASSERT_REG_POSITION(color_mask_common, 0x3E4); ASSERT_REG_POSITION(rt_separate_frag_data, 0x3EB); ASSERT_REG_POSITION(depth_bounds, 0x3E7); @@ -1489,7 +1531,10 @@ ASSERT_REG_POSITION(screen_y_control, 0x4EB); ASSERT_REG_POSITION(vb_element_base, 0x50D); ASSERT_REG_POSITION(vb_base_instance, 0x50E); ASSERT_REG_POSITION(clip_distance_enabled, 0x544); +ASSERT_REG_POSITION(samplecnt_enable, 0x545); ASSERT_REG_POSITION(point_size, 0x546); +ASSERT_REG_POSITION(point_sprite_enable, 0x548); +ASSERT_REG_POSITION(counter_reset, 0x54C); ASSERT_REG_POSITION(zeta_enable, 0x54E); ASSERT_REG_POSITION(multisample_control, 0x54F); ASSERT_REG_POSITION(condition, 0x554); diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index 6f98bd827..c9bc83cd7 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -227,6 +227,28 @@ enum class AtomicOp : u64 { Exch = 8, }; +enum class GlobalAtomicOp : u64 { + Add = 0, + Min = 1, + Max = 2, + Inc = 3, + Dec = 4, + And = 5, + Or = 6, + Xor = 7, + Exch = 8, + SafeAdd = 10, +}; + +enum class GlobalAtomicType : u64 { + U32 = 0, + S32 = 1, + U64 = 2, + F32_FTZ_RN = 3, + F16x2_FTZ_RN = 4, + S64 = 5, +}; + enum class UniformType : u64 { UnsignedByte = 0, SignedByte = 1, @@ -602,6 +624,19 @@ enum class ShuffleOperation : u64 { Bfly = 3, // shuffleXorNV }; +enum class ShfType : u64 { + Bits32 = 0, + U64 = 2, + S64 = 3, +}; + +enum class ShfXmode : u64 { + None = 0, + HI = 1, + X = 2, + XHI = 3, +}; + union Instruction { constexpr Instruction& operator=(const Instruction& instr) { value = instr.value; @@ -754,6 +789,13 @@ union Instruction { } shr; union { + BitField<37, 2, ShfType> type; + BitField<48, 2, ShfXmode> xmode; + BitField<50, 1, u64> wrap; + BitField<20, 6, u64> immediate; + } shf; + + union { BitField<39, 5, u64> shift_amount; BitField<48, 1, u64> negate_b; BitField<49, 1, u64> negate_a; @@ -958,6 +1000,12 @@ union Instruction { } stg; union { + BitField<52, 4, GlobalAtomicOp> operation; + BitField<49, 3, GlobalAtomicType> type; + BitField<28, 20, s64> offset; + } atom; + + union { BitField<52, 4, AtomicOp> operation; BitField<28, 2, AtomicType> type; BitField<30, 22, s64> offset; @@ -1096,6 +1144,11 @@ union Instruction { } fset; union { + BitField<47, 1, u64> ftz; + BitField<48, 4, PredCondition> cond; + } fcmp; + + union { BitField<49, 1, u64> bf; BitField<35, 3, PredCondition> cond; BitField<50, 1, u64> ftz; @@ -1624,11 +1677,11 @@ union Instruction { } xmad; union { - BitField<20, 14, u64> offset; + 
BitField<20, 14, u64> shifted_offset; BitField<34, 5, u64> index; u64 GetOffset() const { - return offset * 4; + return shifted_offset * 4; } } cbuf34; @@ -1675,6 +1728,7 @@ public: BFE_C, BFE_R, BFE_IMM, + BFI_RC, BFI_IMM_R, BRA, BRX, @@ -1690,6 +1744,7 @@ public: ST_S, ST, // Store in generic memory STG, // Store in global memory + ATOM, // Atomic operation on global memory ATOMS, // Atomic operation on shared memory AL2P, // Transforms attribute memory into physical memory TEX, @@ -1771,6 +1826,7 @@ public: ICMP_R, ICMP_CR, ICMP_IMM, + FCMP_R, MUFU, // Multi-Function Operator RRO_C, // Range Reduction Operator RRO_R, @@ -1994,6 +2050,7 @@ private: INST("1110111101010---", Id::ST_L, Type::Memory, "ST_L"), INST("101-------------", Id::ST, Type::Memory, "ST"), INST("1110111011011---", Id::STG, Type::Memory, "STG"), + INST("11101101--------", Id::ATOM, Type::Memory, "ATOM"), INST("11101100--------", Id::ATOMS, Type::Memory, "ATOMS"), INST("1110111110100---", Id::AL2P, Type::Memory, "AL2P"), INST("110000----111---", Id::TEX, Type::Texture, "TEX"), @@ -2074,6 +2131,7 @@ private: INST("0101110100100---", Id::HSETP2_R, Type::HalfSetPredicate, "HSETP2_R"), INST("0111111-0-------", Id::HSETP2_IMM, Type::HalfSetPredicate, "HSETP2_IMM"), INST("0101110100011---", Id::HSET2_R, Type::HalfSet, "HSET2_R"), + INST("010110111010----", Id::FCMP_R, Type::Arithmetic, "FCMP_R"), INST("0101000010000---", Id::MUFU, Type::Arithmetic, "MUFU"), INST("0100110010010---", Id::RRO_C, Type::Arithmetic, "RRO_C"), INST("0101110010010---", Id::RRO_R, Type::Arithmetic, "RRO_R"), @@ -2098,6 +2156,7 @@ private: INST("0100110000000---", Id::BFE_C, Type::Bfe, "BFE_C"), INST("0101110000000---", Id::BFE_R, Type::Bfe, "BFE_R"), INST("0011100-00000---", Id::BFE_IMM, Type::Bfe, "BFE_IMM"), + INST("0101001111110---", Id::BFI_RC, Type::Bfi, "BFI_RC"), INST("0011011-11110---", Id::BFI_IMM_R, Type::Bfi, "BFI_IMM_R"), INST("0100110001000---", Id::LOP_C, Type::ArithmeticInteger, "LOP_C"), INST("0101110001000---", Id::LOP_R, Type::ArithmeticInteger, "LOP_R"), diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index b9c5c41a2..7d7137109 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -6,6 +6,7 @@ #include "common/microprofile.h" #include "core/core.h" #include "core/core_timing.h" +#include "core/core_timing_util.h" #include "core/memory.h" #include "video_core/engines/fermi_2d.h" #include "video_core/engines/kepler_compute.h" @@ -122,6 +123,19 @@ bool GPU::CancelSyncptInterrupt(const u32 syncpoint_id, const u32 value) { return true; } +u64 GPU::GetTicks() const { + // This values were reversed engineered by fincs from NVN + // The gpu clock is reported in units of 385/625 nanoseconds + constexpr u64 gpu_ticks_num = 384; + constexpr u64 gpu_ticks_den = 625; + + const u64 cpu_ticks = system.CoreTiming().GetTicks(); + const u64 nanoseconds = Core::Timing::CyclesToNs(cpu_ticks).count(); + const u64 nanoseconds_num = nanoseconds / gpu_ticks_den; + const u64 nanoseconds_rem = nanoseconds % gpu_ticks_den; + return nanoseconds_num * gpu_ticks_num + (nanoseconds_rem * gpu_ticks_num) / gpu_ticks_den; +} + void GPU::FlushCommands() { renderer.Rasterizer().FlushCommands(); } @@ -340,7 +354,7 @@ void GPU::ProcessSemaphoreTriggerMethod() { block.sequence = regs.semaphore_sequence; // TODO(Kmather73): Generate a real GPU timestamp and write it here instead of // CoreTiming - block.timestamp = system.CoreTiming().GetTicks(); + block.timestamp = GetTicks(); memory_manager->WriteBlock(regs.semaphore_address.SemaphoreAddress(), 
&block, sizeof(block)); } else { diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index b648317bb..07727210c 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -192,6 +192,8 @@ public: bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value); + u64 GetTicks() const; + std::unique_lock<std::mutex> LockSync() { return std::unique_lock{sync_mutex}; } diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h index 08dc96bb3..882e2d9c7 100644 --- a/src/video_core/gpu_thread.h +++ b/src/video_core/gpu_thread.h @@ -86,7 +86,7 @@ struct CommandDataContainer { struct SynchState final { std::atomic_bool is_running{true}; - using CommandQueue = Common::SPSCQueue<CommandDataContainer>; + using CommandQueue = Common::MPSCQueue<CommandDataContainer>; CommandQueue queue; u64 last_fence{}; std::atomic<u64> signaled_fence{}; diff --git a/src/video_core/guest_driver.cpp b/src/video_core/guest_driver.cpp new file mode 100644 index 000000000..6adef459e --- /dev/null +++ b/src/video_core/guest_driver.cpp @@ -0,0 +1,36 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <algorithm> +#include <limits> + +#include "video_core/guest_driver.h" + +namespace VideoCore { + +void GuestDriverProfile::DeduceTextureHandlerSize(std::vector<u32>&& bound_offsets) { + if (texture_handler_size_deduced) { + return; + } + const std::size_t size = bound_offsets.size(); + if (size < 2) { + return; + } + std::sort(bound_offsets.begin(), bound_offsets.end(), std::less{}); + u32 min_val = std::numeric_limits<u32>::max(); + for (std::size_t i = 1; i < size; ++i) { + if (bound_offsets[i] == bound_offsets[i - 1]) { + continue; + } + const u32 new_min = bound_offsets[i] - bound_offsets[i - 1]; + min_val = std::min(min_val, new_min); + } + if (min_val > 2) { + return; + } + texture_handler_size_deduced = true; + texture_handler_size = min_texture_handler_size * min_val; +} + +} // namespace VideoCore diff --git a/src/video_core/guest_driver.h b/src/video_core/guest_driver.h new file mode 100644 index 000000000..fc1917347 --- /dev/null +++ b/src/video_core/guest_driver.h @@ -0,0 +1,41 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <vector> + +#include "common/common_types.h" + +namespace VideoCore { + +/** + * The GuestDriverProfile class is used to learn about the GPU drivers behavior and collect + * information necessary for impossible to avoid HLE methods like shader tracks as they are + * Entscheidungsproblems. + */ +class GuestDriverProfile { +public: + void DeduceTextureHandlerSize(std::vector<u32>&& bound_offsets); + + u32 GetTextureHandlerSize() const { + return texture_handler_size; + } + + bool TextureHandlerSizeKnown() const { + return texture_handler_size_deduced; + } + +private: + // Minimum size of texture handler any driver can use. + static constexpr u32 min_texture_handler_size = 4; + // This goes with Vulkan and OpenGL standards but Nvidia GPUs can easily + // use 4 bytes instead. Thus, certain drivers may squish the size. 
+ static constexpr u32 default_texture_handler_size = 8; + + u32 texture_handler_size = default_texture_handler_size; + bool texture_handler_size_deduced = false; +}; + +} // namespace VideoCore diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index 11848fbce..f5d33f27a 100644 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -9,6 +9,7 @@ #include "core/hle/kernel/process.h" #include "core/hle/kernel/vm_manager.h" #include "core/memory.h" +#include "video_core/gpu.h" #include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" @@ -84,7 +85,9 @@ GPUVAddr MemoryManager::UnmapBuffer(GPUVAddr gpu_addr, u64 size) { const auto cpu_addr = GpuToCpuAddress(gpu_addr); ASSERT(cpu_addr); - rasterizer.FlushAndInvalidateRegion(cache_addr, aligned_size); + // Flush and invalidate through the GPU interface, to be asynchronous if possible. + system.GPU().FlushAndInvalidateRegion(cache_addr, aligned_size); + UnmapRange(gpu_addr, aligned_size); ASSERT(system.CurrentProcess() ->VMManager() @@ -242,6 +245,8 @@ void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::s switch (page_table.attributes[page_index]) { case Common::PageType::Memory: { const u8* src_ptr{page_table.pointers[page_index] + page_offset}; + // Flush must happen on the rasterizer interface, such that memory is always synchronous + // when it is read (even when in asynchronous GPU mode). Fixes Dead Cells title menu. rasterizer.FlushRegion(ToCacheAddr(src_ptr), copy_amount); std::memcpy(dest_buffer, src_ptr, copy_amount); break; @@ -292,6 +297,8 @@ void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const switch (page_table.attributes[page_index]) { case Common::PageType::Memory: { u8* dest_ptr{page_table.pointers[page_index] + page_offset}; + // Invalidate must happen on the rasterizer interface, such that memory is always + // synchronous when it is written (even when in asynchronous GPU mode). rasterizer.InvalidateRegion(ToCacheAddr(dest_ptr), copy_amount); std::memcpy(dest_ptr, src_buffer, copy_amount); break; @@ -339,6 +346,8 @@ void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, const std:: switch (page_table.attributes[page_index]) { case Common::PageType::Memory: { + // Flush must happen on the rasterizer interface, such that memory is always synchronous + // when it is copied (even when in asynchronous GPU mode). const u8* src_ptr{page_table.pointers[page_index] + page_offset}; rasterizer.FlushRegion(ToCacheAddr(src_ptr), copy_amount); WriteBlock(dest_addr, src_ptr, copy_amount); diff --git a/src/video_core/query_cache.h b/src/video_core/query_cache.h new file mode 100644 index 000000000..e66054ed0 --- /dev/null +++ b/src/video_core/query_cache.h @@ -0,0 +1,359 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
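// Worked example for GuestDriverProfile::DeduceTextureHandlerSize() in guest_driver.cpp
// above (the offsets are illustrative and assumed to be expressed in units of the 4-byte
// minimum handler size): sorted bound offsets {16, 18, 20} have a smallest gap of 2, so
// the handler size is deduced as 4 * 2 = 8 bytes; offsets {16, 17, 18} give a gap of 1
// and shrink it to 4 bytes; any smallest gap above 2 leaves the default of 8 untouched
// and the deduction is retried on a later call.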
+ +#pragma once + +#include <algorithm> +#include <array> +#include <cstring> +#include <iterator> +#include <memory> +#include <mutex> +#include <optional> +#include <unordered_map> +#include <vector> + +#include "common/assert.h" +#include "core/core.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" +#include "video_core/rasterizer_interface.h" + +namespace VideoCommon { + +template <class QueryCache, class HostCounter> +class CounterStreamBase { +public: + explicit CounterStreamBase(QueryCache& cache, VideoCore::QueryType type) + : cache{cache}, type{type} {} + + /// Updates the state of the stream, enabling or disabling as needed. + void Update(bool enabled) { + if (enabled) { + Enable(); + } else { + Disable(); + } + } + + /// Resets the stream to zero. It doesn't disable the query after resetting. + void Reset() { + if (current) { + current->EndQuery(); + + // Immediately start a new query to avoid disabling its state. + current = cache.Counter(nullptr, type); + } + last = nullptr; + } + + /// Returns the current counter slicing as needed. + std::shared_ptr<HostCounter> Current() { + if (!current) { + return nullptr; + } + current->EndQuery(); + last = std::move(current); + current = cache.Counter(last, type); + return last; + } + + /// Returns true when the counter stream is enabled. + bool IsEnabled() const { + return current != nullptr; + } + +private: + /// Enables the stream. + void Enable() { + if (current) { + return; + } + current = cache.Counter(last, type); + } + + // Disables the stream. + void Disable() { + if (current) { + current->EndQuery(); + } + last = std::exchange(current, nullptr); + } + + QueryCache& cache; + const VideoCore::QueryType type; + + std::shared_ptr<HostCounter> current; + std::shared_ptr<HostCounter> last; +}; + +template <class QueryCache, class CachedQuery, class CounterStream, class HostCounter, + class QueryPool> +class QueryCacheBase { +public: + explicit QueryCacheBase(Core::System& system, VideoCore::RasterizerInterface& rasterizer) + : system{system}, rasterizer{rasterizer}, streams{{CounterStream{ + static_cast<QueryCache&>(*this), + VideoCore::QueryType::SamplesPassed}}} {} + + void InvalidateRegion(CacheAddr addr, std::size_t size) { + std::unique_lock lock{mutex}; + FlushAndRemoveRegion(addr, size); + } + + void FlushRegion(CacheAddr addr, std::size_t size) { + std::unique_lock lock{mutex}; + FlushAndRemoveRegion(addr, size); + } + + /** + * Records a query in GPU mapped memory, potentially marked with a timestamp. + * @param gpu_addr GPU address to flush to when the mapped memory is read. + * @param type Query type, e.g. SamplesPassed. + * @param timestamp Timestamp, when empty the flushed query is assumed to be short. + */ + void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) { + std::unique_lock lock{mutex}; + auto& memory_manager = system.GPU().MemoryManager(); + const auto host_ptr = memory_manager.GetPointer(gpu_addr); + + CachedQuery* query = TryGet(ToCacheAddr(host_ptr)); + if (!query) { + const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr); + ASSERT_OR_EXECUTE(cpu_addr, return;); + + query = Register(type, *cpu_addr, host_ptr, timestamp.has_value()); + } + + query->BindCounter(Stream(type).Current(), timestamp); + } + + /// Updates counters from GPU state. Expected to be called once per draw, clear or dispatch. 
+ void UpdateCounters() { + std::unique_lock lock{mutex}; + const auto& regs = system.GPU().Maxwell3D().regs; + Stream(VideoCore::QueryType::SamplesPassed).Update(regs.samplecnt_enable); + } + + /// Resets a counter to zero. It doesn't disable the query after resetting. + void ResetCounter(VideoCore::QueryType type) { + std::unique_lock lock{mutex}; + Stream(type).Reset(); + } + + /// Disable all active streams. Expected to be called at the end of a command buffer. + void DisableStreams() { + std::unique_lock lock{mutex}; + for (auto& stream : streams) { + stream.Update(false); + } + } + + /// Returns a new host counter. + std::shared_ptr<HostCounter> Counter(std::shared_ptr<HostCounter> dependency, + VideoCore::QueryType type) { + return std::make_shared<HostCounter>(static_cast<QueryCache&>(*this), std::move(dependency), + type); + } + + /// Returns the counter stream of the specified type. + CounterStream& Stream(VideoCore::QueryType type) { + return streams[static_cast<std::size_t>(type)]; + } + + /// Returns the counter stream of the specified type. + const CounterStream& Stream(VideoCore::QueryType type) const { + return streams[static_cast<std::size_t>(type)]; + } + +protected: + std::array<QueryPool, VideoCore::NumQueryTypes> query_pools; + +private: + /// Flushes a memory range to guest memory and removes it from the cache. + void FlushAndRemoveRegion(CacheAddr addr, std::size_t size) { + const u64 addr_begin = static_cast<u64>(addr); + const u64 addr_end = addr_begin + static_cast<u64>(size); + const auto in_range = [addr_begin, addr_end](CachedQuery& query) { + const u64 cache_begin = query.GetCacheAddr(); + const u64 cache_end = cache_begin + query.SizeInBytes(); + return cache_begin < addr_end && addr_begin < cache_end; + }; + + const u64 page_end = addr_end >> PAGE_SHIFT; + for (u64 page = addr_begin >> PAGE_SHIFT; page <= page_end; ++page) { + const auto& it = cached_queries.find(page); + if (it == std::end(cached_queries)) { + continue; + } + auto& contents = it->second; + for (auto& query : contents) { + if (!in_range(query)) { + continue; + } + rasterizer.UpdatePagesCachedCount(query.CpuAddr(), query.SizeInBytes(), -1); + query.Flush(); + } + contents.erase(std::remove_if(std::begin(contents), std::end(contents), in_range), + std::end(contents)); + } + } + + /// Registers the passed parameters as cached and returns a pointer to the stored cached query. + CachedQuery* Register(VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr, bool timestamp) { + rasterizer.UpdatePagesCachedCount(cpu_addr, CachedQuery::SizeInBytes(timestamp), 1); + const u64 page = static_cast<u64>(ToCacheAddr(host_ptr)) >> PAGE_SHIFT; + return &cached_queries[page].emplace_back(static_cast<QueryCache&>(*this), type, cpu_addr, + host_ptr); + } + + /// Tries to a get a cached query. Returns nullptr on failure. + CachedQuery* TryGet(CacheAddr addr) { + const u64 page = static_cast<u64>(addr) >> PAGE_SHIFT; + const auto it = cached_queries.find(page); + if (it == std::end(cached_queries)) { + return nullptr; + } + auto& contents = it->second; + const auto found = + std::find_if(std::begin(contents), std::end(contents), + [addr](auto& query) { return query.GetCacheAddr() == addr; }); + return found != std::end(contents) ? 
+ }
+
+ static constexpr std::uintptr_t PAGE_SIZE = 4096;
+ static constexpr unsigned PAGE_SHIFT = 12;
+
+ Core::System& system;
+ VideoCore::RasterizerInterface& rasterizer;
+
+ std::recursive_mutex mutex;
+
+ std::unordered_map<u64, std::vector<CachedQuery>> cached_queries;
+
+ std::array<CounterStream, VideoCore::NumQueryTypes> streams;
+};
+
+template <class QueryCache, class HostCounter>
+class HostCounterBase {
+public:
+ explicit HostCounterBase(std::shared_ptr<HostCounter> dependency_)
+ : dependency{std::move(dependency_)}, depth{dependency ? (dependency->Depth() + 1) : 0} {
+ // Avoid nesting too many dependencies, which could overflow the stack when they are deleted.
+ constexpr u64 depth_threshold = 96;
+ if (depth > depth_threshold) {
+ depth = 0;
+ base_result = dependency->Query();
+ dependency = nullptr;
+ }
+ }
+ virtual ~HostCounterBase() = default;
+
+ /// Returns the current value of the query.
+ u64 Query() {
+ if (result) {
+ return *result;
+ }
+
+ u64 value = BlockingQuery() + base_result;
+ if (dependency) {
+ value += dependency->Query();
+ dependency = nullptr;
+ }
+
+ result = value;
+ return *result;
+ }
+
+ /// Returns true when flushing this query will potentially wait.
+ bool WaitPending() const noexcept {
+ return result.has_value();
+ }
+
+ u64 Depth() const noexcept {
+ return depth;
+ }
+
+protected:
+ /// Returns the value of the query from the backend API, blocking as needed.
+ virtual u64 BlockingQuery() const = 0;
+
+private:
+ std::shared_ptr<HostCounter> dependency; ///< Counter to add to this value.
+ std::optional<u64> result; ///< Filled with the already returned value.
+ u64 depth; ///< Number of nested dependencies.
+ u64 base_result = 0; ///< Accumulated value of collapsed dependencies.
+};
+
+template <class HostCounter>
+class CachedQueryBase {
+public:
+ explicit CachedQueryBase(VAddr cpu_addr, u8* host_ptr)
+ : cpu_addr{cpu_addr}, host_ptr{host_ptr} {}
+ virtual ~CachedQueryBase() = default;
+
+ CachedQueryBase(CachedQueryBase&&) noexcept = default;
+ CachedQueryBase(const CachedQueryBase&) = delete;
+
+ CachedQueryBase& operator=(CachedQueryBase&&) noexcept = default;
+ CachedQueryBase& operator=(const CachedQueryBase&) = delete;
+
+ /// Flushes the query to guest memory.
+ virtual void Flush() {
+ // When counter is nullptr it means the query has just been reset. We are supposed to write
+ // a zero in these cases.
+ const u64 value = counter ? counter->Query() : 0;
+ std::memcpy(host_ptr, &value, sizeof(u64));
+
+ if (timestamp) {
+ std::memcpy(host_ptr + TIMESTAMP_OFFSET, &*timestamp, sizeof(u64));
+ }
+ }
+
+ /// Binds a counter to this query.
+ void BindCounter(std::shared_ptr<HostCounter> counter_, std::optional<u64> timestamp_) {
+ if (counter) {
+ // If there's an old counter set, it means the query is being rewritten by the game.
+ // To avoid losing the data forever, flush here.
+ Flush();
+ }
+ counter = std::move(counter_);
+ timestamp = timestamp_;
+ }
+
+ VAddr CpuAddr() const noexcept {
+ return cpu_addr;
+ }
+
+ CacheAddr GetCacheAddr() const noexcept {
+ return ToCacheAddr(host_ptr);
+ }
+
+ u64 SizeInBytes() const noexcept {
+ return SizeInBytes(timestamp.has_value());
+ }
+
+ static constexpr u64 SizeInBytes(bool with_timestamp) noexcept {
+ return with_timestamp ? LARGE_QUERY_SIZE : SMALL_QUERY_SIZE;
+ }
+
+protected:
+ /// Returns true when querying the counter may potentially block.
+ bool WaitPending() const noexcept { + return counter && counter->WaitPending(); + } + +private: + static constexpr std::size_t SMALL_QUERY_SIZE = 8; // Query size without timestamp. + static constexpr std::size_t LARGE_QUERY_SIZE = 16; // Query size with timestamp. + static constexpr std::intptr_t TIMESTAMP_OFFSET = 8; // Timestamp offset in a large query. + + VAddr cpu_addr; ///< Guest CPU address. + u8* host_ptr; ///< Writable host pointer. + std::shared_ptr<HostCounter> counter; ///< Host counter to query, owns the dependency tree. + std::optional<u64> timestamp; ///< Timestamp to flush to guest memory. +}; + +} // namespace VideoCommon diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index 5b0eca9e2..f18eaf4bc 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -6,9 +6,11 @@ #include <atomic> #include <functional> +#include <optional> #include "common/common_types.h" #include "video_core/engines/fermi_2d.h" #include "video_core/gpu.h" +#include "video_core/guest_driver.h" namespace Tegra { class MemoryManager; @@ -16,6 +18,11 @@ class MemoryManager; namespace VideoCore { +enum class QueryType { + SamplesPassed, +}; +constexpr std::size_t NumQueryTypes = 1; + enum class LoadCallbackStage { Prepare, Decompile, @@ -28,11 +35,8 @@ class RasterizerInterface { public: virtual ~RasterizerInterface() {} - /// Draw the current batch of vertex arrays - virtual bool DrawBatch(bool is_indexed) = 0; - - /// Draw the current batch of multiple instances of vertex arrays - virtual bool DrawMultiBatch(bool is_indexed) = 0; + /// Dispatches a draw invocation + virtual void Draw(bool is_indexed, bool is_instanced) = 0; /// Clear the current framebuffer virtual void Clear() = 0; @@ -40,6 +44,12 @@ public: /// Dispatches a compute shader invocation virtual void DispatchCompute(GPUVAddr code_addr) = 0; + /// Resets the counter of a query + virtual void ResetCounter(QueryType type) = 0; + + /// Records a GPU query and caches it + virtual void Query(GPUVAddr gpu_addr, QueryType type, std::optional<u64> timestamp) = 0; + /// Notify rasterizer that all caches should be flushed to Switch memory virtual void FlushAll() = 0; @@ -78,5 +88,18 @@ public: /// Initialize disk cached resources for the game being emulated virtual void LoadDiskResources(const std::atomic_bool& stop_loading = false, const DiskResourceLoadCallback& callback = {}) {} + + /// Grant access to the Guest Driver Profile for recording/obtaining info on the guest driver. + GuestDriverProfile& AccessGuestDriverProfile() { + return guest_driver_profile; + } + + /// Grant access to the Guest Driver Profile for recording/obtaining info on the guest driver. + const GuestDriverProfile& AccessGuestDriverProfile() const { + return guest_driver_profile; + } + +private: + GuestDriverProfile guest_driver_profile{}; }; } // namespace VideoCore diff --git a/src/video_core/renderer_opengl/gl_query_cache.cpp b/src/video_core/renderer_opengl/gl_query_cache.cpp new file mode 100644 index 000000000..f12e9f55f --- /dev/null +++ b/src/video_core/renderer_opengl/gl_query_cache.cpp @@ -0,0 +1,120 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+
+#include <algorithm>
+#include <cstring>
+#include <memory>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include <glad/glad.h>
+
+#include "common/assert.h"
+#include "core/core.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/memory_manager.h"
+#include "video_core/renderer_opengl/gl_query_cache.h"
+#include "video_core/renderer_opengl/gl_rasterizer.h"
+
+namespace OpenGL {
+
+namespace {
+
+constexpr std::array<GLenum, VideoCore::NumQueryTypes> QueryTargets = {GL_SAMPLES_PASSED};
+
+constexpr GLenum GetTarget(VideoCore::QueryType type) {
+ return QueryTargets[static_cast<std::size_t>(type)];
+}
+
+} // Anonymous namespace
+
+QueryCache::QueryCache(Core::System& system, RasterizerOpenGL& gl_rasterizer)
+ : VideoCommon::QueryCacheBase<
+ QueryCache, CachedQuery, CounterStream, HostCounter,
+ std::vector<OGLQuery>>{system,
+ static_cast<VideoCore::RasterizerInterface&>(gl_rasterizer)},
+ gl_rasterizer{gl_rasterizer} {}
+
+QueryCache::~QueryCache() = default;
+
+OGLQuery QueryCache::AllocateQuery(VideoCore::QueryType type) {
+ auto& reserve = query_pools[static_cast<std::size_t>(type)];
+ OGLQuery query;
+ if (reserve.empty()) {
+ query.Create(GetTarget(type));
+ return query;
+ }
+
+ query = std::move(reserve.back());
+ reserve.pop_back();
+ return query;
+}
+
+void QueryCache::Reserve(VideoCore::QueryType type, OGLQuery&& query) {
+ query_pools[static_cast<std::size_t>(type)].push_back(std::move(query));
+}
+
+bool QueryCache::AnyCommandQueued() const noexcept {
+ return gl_rasterizer.AnyCommandQueued();
+}
+
+HostCounter::HostCounter(QueryCache& cache, std::shared_ptr<HostCounter> dependency,
+ VideoCore::QueryType type)
+ : VideoCommon::HostCounterBase<QueryCache, HostCounter>{std::move(dependency)}, cache{cache},
+ type{type}, query{cache.AllocateQuery(type)} {
+ glBeginQuery(GetTarget(type), query.handle);
+}
+
+HostCounter::~HostCounter() {
+ cache.Reserve(type, std::move(query));
+}
+
+void HostCounter::EndQuery() {
+ if (!cache.AnyCommandQueued()) {
+ // There is a chance this query will be waited on without any command queued before it
+ // (glDraw, glClear, glDispatch), and waiting in that situation can hang. glFlush is
+ // considered a command, so insert a flush into the OpenGL command stream to make the wait safe.
+ glFlush();
+ }
+ glEndQuery(GetTarget(type));
+}
+
+u64 HostCounter::BlockingQuery() const {
+ GLint64 value;
+ glGetQueryObjecti64v(query.handle, GL_QUERY_RESULT, &value);
+ return static_cast<u64>(value);
+}
+
+CachedQuery::CachedQuery(QueryCache& cache, VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr)
+ : VideoCommon::CachedQueryBase<HostCounter>{cpu_addr, host_ptr}, cache{&cache}, type{type} {}
+
+CachedQuery::CachedQuery(CachedQuery&& rhs) noexcept
+ : VideoCommon::CachedQueryBase<HostCounter>(std::move(rhs)), cache{rhs.cache}, type{rhs.type} {}
+
+CachedQuery& CachedQuery::operator=(CachedQuery&& rhs) noexcept {
+ VideoCommon::CachedQueryBase<HostCounter>::operator=(std::move(rhs));
+ cache = rhs.cache;
+ type = rhs.type;
+ return *this;
+}
+
+void CachedQuery::Flush() {
+ // Waiting for a query while another query of the same target is enabled locks Nvidia's driver.
+ // To avoid this, disable and re-enable the stream, keeping its dependency chain intact.
+ // This is only needed when there are pending waits to be done.
+ auto& stream = cache->Stream(type); + const bool slice_counter = WaitPending() && stream.IsEnabled(); + if (slice_counter) { + stream.Update(false); + } + + VideoCommon::CachedQueryBase<HostCounter>::Flush(); + + if (slice_counter) { + stream.Update(true); + } +} + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_query_cache.h b/src/video_core/renderer_opengl/gl_query_cache.h new file mode 100644 index 000000000..d8e7052a1 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_query_cache.h @@ -0,0 +1,78 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <array> +#include <memory> +#include <vector> + +#include "common/common_types.h" +#include "video_core/query_cache.h" +#include "video_core/rasterizer_interface.h" +#include "video_core/renderer_opengl/gl_resource_manager.h" + +namespace Core { +class System; +} + +namespace OpenGL { + +class CachedQuery; +class HostCounter; +class QueryCache; +class RasterizerOpenGL; + +using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>; + +class QueryCache final : public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, + HostCounter, std::vector<OGLQuery>> { +public: + explicit QueryCache(Core::System& system, RasterizerOpenGL& rasterizer); + ~QueryCache(); + + OGLQuery AllocateQuery(VideoCore::QueryType type); + + void Reserve(VideoCore::QueryType type, OGLQuery&& query); + + bool AnyCommandQueued() const noexcept; + +private: + RasterizerOpenGL& gl_rasterizer; +}; + +class HostCounter final : public VideoCommon::HostCounterBase<QueryCache, HostCounter> { +public: + explicit HostCounter(QueryCache& cache, std::shared_ptr<HostCounter> dependency, + VideoCore::QueryType type); + ~HostCounter(); + + void EndQuery(); + +private: + u64 BlockingQuery() const override; + + QueryCache& cache; + const VideoCore::QueryType type; + OGLQuery query; +}; + +class CachedQuery final : public VideoCommon::CachedQueryBase<HostCounter> { +public: + explicit CachedQuery(QueryCache& cache, VideoCore::QueryType type, VAddr cpu_addr, + u8* host_ptr); + CachedQuery(CachedQuery&& rhs) noexcept; + CachedQuery(const CachedQuery&) = delete; + + CachedQuery& operator=(CachedQuery&& rhs) noexcept; + CachedQuery& operator=(const CachedQuery&) = delete; + + void Flush() override; + +private: + QueryCache* cache; + VideoCore::QueryType type; +}; + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index c428f06e4..e1965fb21 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -25,6 +25,7 @@ #include "video_core/engines/maxwell_3d.h" #include "video_core/engines/shader_type.h" #include "video_core/memory_manager.h" +#include "video_core/renderer_opengl/gl_query_cache.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_shader_cache.h" #include "video_core/renderer_opengl/gl_shader_gen.h" @@ -55,16 +56,20 @@ namespace { template <typename Engine, typename Entry> Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry, - Tegra::Engines::ShaderType shader_type) { + Tegra::Engines::ShaderType shader_type, + std::size_t index = 0) { if (entry.IsBindless()) { const Tegra::Texture::TextureHandle tex_handle = engine.AccessConstBuffer32(shader_type, entry.GetBuffer(), entry.GetOffset()); return 
engine.GetTextureInfo(tex_handle); } + const auto& gpu_profile = engine.AccessGuestDriverProfile(); + const u32 offset = + entry.GetOffset() + static_cast<u32>(index * gpu_profile.GetTextureHandlerSize()); if constexpr (std::is_same_v<Engine, Tegra::Engines::Maxwell3D>) { - return engine.GetStageTexture(shader_type, entry.GetOffset()); + return engine.GetStageTexture(shader_type, offset); } else { - return engine.GetTexture(entry.GetOffset()); + return engine.GetTexture(offset); } } @@ -88,8 +93,8 @@ std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer, RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, ScreenInfo& info) : RasterizerAccelerated{system.Memory()}, texture_cache{system, *this, device}, - shader_cache{*this, system, emu_window, device}, system{system}, screen_info{info}, - buffer_cache{*this, system, device, STREAM_BUFFER_SIZE} { + shader_cache{*this, system, emu_window, device}, query_cache{system, *this}, system{system}, + screen_info{info}, buffer_cache{*this, system, device, STREAM_BUFFER_SIZE} { shader_program_manager = std::make_unique<GLShader::ProgramManager>(); state.draw.shader_program = 0; state.Apply(); @@ -244,9 +249,6 @@ void RasterizerOpenGL::SetupVertexInstances(GLuint vao) { } GLintptr RasterizerOpenGL::SetupIndexBuffer() { - if (accelerate_draw != AccelDraw::Indexed) { - return 0; - } MICROPROFILE_SCOPE(OpenGL_Index); const auto& regs = system.GPU().Maxwell3D().regs; const std::size_t size = CalculateIndexBufferSize(); @@ -540,10 +542,16 @@ void RasterizerOpenGL::Clear() { } else if (use_stencil) { glClearBufferiv(GL_STENCIL, 0, ®s.clear_stencil); } + + ++num_queued_commands; } -void RasterizerOpenGL::DrawPrelude() { +void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { + MICROPROFILE_SCOPE(OpenGL_Drawing); auto& gpu = system.GPU().Maxwell3D(); + const auto& regs = gpu.regs; + + query_cache.UpdateCounters(); SyncRasterizeEnable(state); SyncColorMask(); @@ -563,9 +571,6 @@ void RasterizerOpenGL::DrawPrelude() { buffer_cache.Acquire(); - // Draw the vertex batch - const bool is_indexed = accelerate_draw == AccelDraw::Indexed; - std::size_t buffer_size = CalculateVertexArraysSize(); // Add space for index buffer @@ -592,7 +597,11 @@ void RasterizerOpenGL::DrawPrelude() { // Upload vertex and index data. SetupVertexBuffer(vao); SetupVertexInstances(vao); - index_buffer_offset = SetupIndexBuffer(); + + GLintptr index_buffer_offset; + if (is_indexed) { + index_buffer_offset = SetupIndexBuffer(); + } // Prepare packed bindings. bind_ubo_pushbuffer.Setup(); @@ -608,7 +617,7 @@ void RasterizerOpenGL::DrawPrelude() { // Setup shaders and their used resources. texture_cache.GuardSamplers(true); - const auto primitive_mode = MaxwellToGL::PrimitiveTopology(gpu.regs.draw.topology); + const GLenum primitive_mode = MaxwellToGL::PrimitiveTopology(gpu.regs.draw.topology); SetupShaders(primitive_mode); texture_cache.GuardSamplers(false); @@ -626,6 +635,7 @@ void RasterizerOpenGL::DrawPrelude() { // As all cached buffers are invalidated, we need to recheck their state. 
gpu.dirty.ResetVertexArrays(); } + gpu.dirty.memory_general = false; shader_program_manager->ApplyTo(state); state.Apply(); @@ -633,107 +643,46 @@ void RasterizerOpenGL::DrawPrelude() { if (texture_cache.TextureBarrier()) { glTextureBarrier(); } -} -struct DrawParams { - bool is_indexed{}; - bool is_instanced{}; - GLenum primitive_mode{}; - GLint count{}; - GLint base_vertex{}; - - // Indexed settings - GLenum index_format{}; - GLintptr index_buffer_offset{}; - - // Instanced setting - GLint num_instances{}; - GLint base_instance{}; - - void DispatchDraw() { - if (is_indexed) { - const auto index_buffer_ptr = reinterpret_cast<const void*>(index_buffer_offset); - if (is_instanced) { - glDrawElementsInstancedBaseVertexBaseInstance(primitive_mode, count, index_format, - index_buffer_ptr, num_instances, - base_vertex, base_instance); - } else { - glDrawElementsBaseVertex(primitive_mode, count, index_format, index_buffer_ptr, - base_vertex); - } + ++num_queued_commands; + + const GLuint base_instance = static_cast<GLuint>(gpu.regs.vb_base_instance); + const GLsizei num_instances = + static_cast<GLsizei>(is_instanced ? gpu.mme_draw.instance_count : 1); + if (is_indexed) { + const GLint base_vertex = static_cast<GLint>(gpu.regs.vb_element_base); + const GLsizei num_vertices = static_cast<GLsizei>(gpu.regs.index_array.count); + const GLvoid* offset = reinterpret_cast<const GLvoid*>(index_buffer_offset); + const GLenum format = MaxwellToGL::IndexFormat(gpu.regs.index_array.format); + if (num_instances == 1 && base_instance == 0 && base_vertex == 0) { + glDrawElements(primitive_mode, num_vertices, format, offset); + } else if (num_instances == 1 && base_instance == 0) { + glDrawElementsBaseVertex(primitive_mode, num_vertices, format, offset, base_vertex); + } else if (base_vertex == 0 && base_instance == 0) { + glDrawElementsInstanced(primitive_mode, num_vertices, format, offset, num_instances); + } else if (base_vertex == 0) { + glDrawElementsInstancedBaseInstance(primitive_mode, num_vertices, format, offset, + num_instances, base_instance); + } else if (base_instance == 0) { + glDrawElementsInstancedBaseVertex(primitive_mode, num_vertices, format, offset, + num_instances, base_vertex); } else { - if (is_instanced) { - glDrawArraysInstancedBaseInstance(primitive_mode, base_vertex, count, num_instances, - base_instance); - } else { - glDrawArrays(primitive_mode, base_vertex, count); - } + glDrawElementsInstancedBaseVertexBaseInstance(primitive_mode, num_vertices, format, + offset, num_instances, base_vertex, + base_instance); } - } -}; - -bool RasterizerOpenGL::DrawBatch(bool is_indexed) { - accelerate_draw = is_indexed ? 
AccelDraw::Indexed : AccelDraw::Arrays; - - MICROPROFILE_SCOPE(OpenGL_Drawing); - - DrawPrelude(); - - auto& maxwell3d = system.GPU().Maxwell3D(); - const auto& regs = maxwell3d.regs; - const auto current_instance = maxwell3d.state.current_instance; - DrawParams draw_call{}; - draw_call.is_indexed = is_indexed; - draw_call.num_instances = static_cast<GLint>(1); - draw_call.base_instance = static_cast<GLint>(current_instance); - draw_call.is_instanced = current_instance > 0; - draw_call.primitive_mode = MaxwellToGL::PrimitiveTopology(regs.draw.topology); - if (draw_call.is_indexed) { - draw_call.count = static_cast<GLint>(regs.index_array.count); - draw_call.base_vertex = static_cast<GLint>(regs.vb_element_base); - draw_call.index_format = MaxwellToGL::IndexFormat(regs.index_array.format); - draw_call.index_buffer_offset = index_buffer_offset; } else { - draw_call.count = static_cast<GLint>(regs.vertex_buffer.count); - draw_call.base_vertex = static_cast<GLint>(regs.vertex_buffer.first); - } - draw_call.DispatchDraw(); - - maxwell3d.dirty.memory_general = false; - accelerate_draw = AccelDraw::Disabled; - return true; -} - -bool RasterizerOpenGL::DrawMultiBatch(bool is_indexed) { - accelerate_draw = is_indexed ? AccelDraw::Indexed : AccelDraw::Arrays; - - MICROPROFILE_SCOPE(OpenGL_Drawing); - - DrawPrelude(); - - auto& maxwell3d = system.GPU().Maxwell3D(); - const auto& regs = maxwell3d.regs; - const auto& draw_setup = maxwell3d.mme_draw; - DrawParams draw_call{}; - draw_call.is_indexed = is_indexed; - draw_call.num_instances = static_cast<GLint>(draw_setup.instance_count); - draw_call.base_instance = static_cast<GLint>(regs.vb_base_instance); - draw_call.is_instanced = draw_setup.instance_count > 1; - draw_call.primitive_mode = MaxwellToGL::PrimitiveTopology(regs.draw.topology); - if (draw_call.is_indexed) { - draw_call.count = static_cast<GLint>(regs.index_array.count); - draw_call.base_vertex = static_cast<GLint>(regs.vb_element_base); - draw_call.index_format = MaxwellToGL::IndexFormat(regs.index_array.format); - draw_call.index_buffer_offset = index_buffer_offset; - } else { - draw_call.count = static_cast<GLint>(regs.vertex_buffer.count); - draw_call.base_vertex = static_cast<GLint>(regs.vertex_buffer.first); + const GLint base_vertex = static_cast<GLint>(gpu.regs.vertex_buffer.first); + const GLsizei num_vertices = static_cast<GLsizei>(gpu.regs.vertex_buffer.count); + if (num_instances == 1 && base_instance == 0) { + glDrawArrays(primitive_mode, base_vertex, num_vertices); + } else if (base_instance == 0) { + glDrawArraysInstanced(primitive_mode, base_vertex, num_vertices, num_instances); + } else { + glDrawArraysInstancedBaseInstance(primitive_mode, base_vertex, num_vertices, + num_instances, base_instance); + } } - draw_call.DispatchDraw(); - - maxwell3d.dirty.memory_general = false; - accelerate_draw = AccelDraw::Disabled; - return true; } void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { @@ -776,6 +725,16 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { state.ApplyProgramPipeline(); glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z); + ++num_queued_commands; +} + +void RasterizerOpenGL::ResetCounter(VideoCore::QueryType type) { + query_cache.ResetCounter(type); +} + +void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, + std::optional<u64> timestamp) { + query_cache.Query(gpu_addr, type, timestamp); } void RasterizerOpenGL::FlushAll() {} @@ -787,6 +746,7 @@ void 
RasterizerOpenGL::FlushRegion(CacheAddr addr, u64 size) { } texture_cache.FlushRegion(addr, size); buffer_cache.FlushRegion(addr, size); + query_cache.FlushRegion(addr, size); } void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) { @@ -797,6 +757,7 @@ void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) { texture_cache.InvalidateRegion(addr, size); shader_cache.InvalidateRegion(addr, size); buffer_cache.InvalidateRegion(addr, size); + query_cache.InvalidateRegion(addr, size); } void RasterizerOpenGL::FlushAndInvalidateRegion(CacheAddr addr, u64 size) { @@ -807,10 +768,18 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(CacheAddr addr, u64 size) { } void RasterizerOpenGL::FlushCommands() { + // Only flush when we have commands queued to OpenGL. + if (num_queued_commands == 0) { + return; + } + num_queued_commands = 0; glFlush(); } void RasterizerOpenGL::TickFrame() { + // Ticking a frame means that buffers will be swapped, calling glFlush implicitly. + num_queued_commands = 0; + buffer_cache.TickFrame(); } @@ -942,8 +911,15 @@ void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, const Shader& u32 binding = device.GetBaseBindings(stage_index).sampler; for (const auto& entry : shader->GetShaderEntries().samplers) { const auto shader_type = static_cast<Tegra::Engines::ShaderType>(stage_index); - const auto texture = GetTextureInfo(maxwell3d, entry, shader_type); - SetupTexture(binding++, texture, entry); + if (!entry.IsIndexed()) { + const auto texture = GetTextureInfo(maxwell3d, entry, shader_type); + SetupTexture(binding++, texture, entry); + } else { + for (std::size_t i = 0; i < entry.Size(); ++i) { + const auto texture = GetTextureInfo(maxwell3d, entry, shader_type, i); + SetupTexture(binding++, texture, entry); + } + } } } @@ -952,8 +928,17 @@ void RasterizerOpenGL::SetupComputeTextures(const Shader& kernel) { const auto& compute = system.GPU().KeplerCompute(); u32 binding = 0; for (const auto& entry : kernel->GetShaderEntries().samplers) { - const auto texture = GetTextureInfo(compute, entry, Tegra::Engines::ShaderType::Compute); - SetupTexture(binding++, texture, entry); + if (!entry.IsIndexed()) { + const auto texture = + GetTextureInfo(compute, entry, Tegra::Engines::ShaderType::Compute); + SetupTexture(binding++, texture, entry); + } else { + for (std::size_t i = 0; i < entry.Size(); ++i) { + const auto texture = + GetTextureInfo(compute, entry, Tegra::Engines::ShaderType::Compute, i); + SetupTexture(binding++, texture, entry); + } + } } } @@ -1273,6 +1258,7 @@ void RasterizerOpenGL::SyncPointState() { // Limit the point size to 1 since nouveau sometimes sets a point size of 0 (and that's invalid // in OpenGL). 
state.point.program_control = regs.vp_point_size.enable != 0; + state.point.sprite = regs.point_sprite_enable != 0; state.point.size = std::max(1.0f, regs.point_size); } diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 6a27cf497..68abe9a21 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -24,6 +24,7 @@ #include "video_core/renderer_opengl/gl_buffer_cache.h" #include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_framebuffer_cache.h" +#include "video_core/renderer_opengl/gl_query_cache.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_sampler_cache.h" #include "video_core/renderer_opengl/gl_shader_cache.h" @@ -57,10 +58,11 @@ public: ScreenInfo& info); ~RasterizerOpenGL() override; - bool DrawBatch(bool is_indexed) override; - bool DrawMultiBatch(bool is_indexed) override; + void Draw(bool is_indexed, bool is_instanced) override; void Clear() override; void DispatchCompute(GPUVAddr code_addr) override; + void ResetCounter(VideoCore::QueryType type) override; + void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override; void FlushAll() override; void FlushRegion(CacheAddr addr, u64 size) override; void InvalidateRegion(CacheAddr addr, u64 size) override; @@ -75,6 +77,11 @@ public: void LoadDiskResources(const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback) override; + /// Returns true when there are commands queued to the OpenGL server. + bool AnyCommandQueued() const { + return num_queued_commands > 0; + } + private: /// Configures the color and depth framebuffer states. void ConfigureFramebuffers(); @@ -102,9 +109,6 @@ private: void SetupGlobalMemory(u32 binding, const GLShader::GlobalMemoryEntry& entry, GPUVAddr gpu_addr, std::size_t size); - /// Syncs all the state, shaders, render targets and textures setting before a draw call. - void DrawPrelude(); - /// Configures the current textures to use for the draw command. 
void SetupDrawTextures(std::size_t stage_index, const Shader& shader);
@@ -180,10 +184,23 @@ private:
/// Syncs the alpha test state to match the guest state
void SyncAlphaTest();
- /// Check for extension that are not strictly required
- /// but are needed for correct emulation
+ /// Check for extensions that are not strictly required but are needed for correct emulation
void CheckExtensions();
+ std::size_t CalculateVertexArraysSize() const;
+
+ std::size_t CalculateIndexBufferSize() const;
+
+ /// Updates and returns a vertex array object representing current vertex format
+ GLuint SetupVertexFormat();
+
+ void SetupVertexBuffer(GLuint vao);
+ void SetupVertexInstances(GLuint vao);
+
+ GLintptr SetupIndexBuffer();
+
+ void SetupShaders(GLenum primitive_mode);
+
const Device device;
OpenGLState state;
@@ -191,6 +208,7 @@ private:
ShaderCacheOpenGL shader_cache;
SamplerCacheOpenGL sampler_cache;
FramebufferCacheOpenGL framebuffer_cache;
+ QueryCache query_cache;
Core::System& system;
ScreenInfo& screen_info;
@@ -208,24 +226,8 @@ private:
BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER};
BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER};
- std::size_t CalculateVertexArraysSize() const;
-
- std::size_t CalculateIndexBufferSize() const;
-
- /// Updates and returns a vertex array object representing current vertex format
- GLuint SetupVertexFormat();
-
- void SetupVertexBuffer(GLuint vao);
- void SetupVertexInstances(GLuint vao);
-
- GLintptr SetupIndexBuffer();
-
- GLintptr index_buffer_offset;
-
- void SetupShaders(GLenum primitive_mode);
-
- enum class AccelDraw { Disabled, Arrays, Indexed };
- AccelDraw accelerate_draw = AccelDraw::Disabled;
+ /// Number of commands queued to the OpenGL driver. Reset on flush.
+ std::size_t num_queued_commands = 0; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp index 5c96c1d46..f0ddfb276 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.cpp +++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp @@ -207,4 +207,21 @@ void OGLFramebuffer::Release() { handle = 0; } +void OGLQuery::Create(GLenum target) { + if (handle != 0) + return; + + MICROPROFILE_SCOPE(OpenGL_ResourceCreation); + glCreateQueries(target, 1, &handle); +} + +void OGLQuery::Release() { + if (handle == 0) + return; + + MICROPROFILE_SCOPE(OpenGL_ResourceDeletion); + glDeleteQueries(1, &handle); + handle = 0; +} + } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h index 3a85a1d4c..514d1d165 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.h +++ b/src/video_core/renderer_opengl/gl_resource_manager.h @@ -266,4 +266,29 @@ public: GLuint handle = 0; }; +class OGLQuery : private NonCopyable { +public: + OGLQuery() = default; + + OGLQuery(OGLQuery&& o) noexcept : handle(std::exchange(o.handle, 0)) {} + + ~OGLQuery() { + Release(); + } + + OGLQuery& operator=(OGLQuery&& o) noexcept { + Release(); + handle = std::exchange(o.handle, 0); + return *this; + } + + /// Creates a new internal OpenGL resource and stores the handle + void Create(GLenum target); + + /// Deletes the internal OpenGL resource + void Release(); + + GLuint handle = 0; +}; + } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 3c5bdd377..489eb143c 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -214,6 +214,7 @@ std::unique_ptr<ConstBufferLocker> MakeLocker(Core::System& system, ShaderType s } void FillLocker(ConstBufferLocker& locker, const ShaderDiskCacheUsage& usage) { + locker.SetBoundBuffer(usage.bound_buffer); for (const auto& key : usage.keys) { const auto [buffer, offset] = key.first; locker.InsertKey(buffer, offset, key.second); @@ -418,7 +419,8 @@ bool CachedShader::EnsureValidLockerVariant() { ShaderDiskCacheUsage CachedShader::GetUsage(const ProgramVariant& variant, const ConstBufferLocker& locker) const { - return ShaderDiskCacheUsage{unique_identifier, variant, locker.GetKeys(), + return ShaderDiskCacheUsage{unique_identifier, variant, + locker.GetBoundBuffer(), locker.GetKeys(), locker.GetBoundSamplers(), locker.GetBindlessSamplers()}; } diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index 2996aaf08..4735000b5 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -391,6 +391,7 @@ public: DeclareVertex(); DeclareGeometry(); DeclareRegisters(); + DeclareCustomVariables(); DeclarePredicates(); DeclareLocalMemory(); DeclareInternalFlags(); @@ -503,6 +504,16 @@ private: } } + void DeclareCustomVariables() { + const u32 num_custom_variables = ir.GetNumCustomVariables(); + for (u32 i = 0; i < num_custom_variables; ++i) { + code.AddLine("float {} = 0.0f;", GetCustomVariable(i)); + } + if (num_custom_variables > 0) { + code.AddNewLine(); + } + } + void DeclarePredicates() { const auto& predicates = ir.GetPredicates(); for (const auto pred : predicates) { @@ -655,7 +666,8 @@ private: u32 
binding = device.GetBaseBindings(stage).sampler; for (const auto& sampler : ir.GetSamplers()) { const std::string name = GetSampler(sampler); - const std::string description = fmt::format("layout (binding = {}) uniform", binding++); + const std::string description = fmt::format("layout (binding = {}) uniform", binding); + binding += sampler.IsIndexed() ? sampler.Size() : 1; std::string sampler_type = [&]() { if (sampler.IsBuffer()) { @@ -682,7 +694,11 @@ private: sampler_type += "Shadow"; } - code.AddLine("{} {} {};", description, sampler_type, name); + if (!sampler.IsIndexed()) { + code.AddLine("{} {} {};", description, sampler_type, name); + } else { + code.AddLine("{} {} {}[{}];", description, sampler_type, name, sampler.Size()); + } } if (!ir.GetSamplers().empty()) { code.AddNewLine(); @@ -775,6 +791,11 @@ private: return {GetRegister(index), Type::Float}; } + if (const auto cv = std::get_if<CustomVarNode>(&*node)) { + const u32 index = cv->GetIndex(); + return {GetCustomVariable(index), Type::Float}; + } + if (const auto immediate = std::get_if<ImmediateNode>(&*node)) { const u32 value = immediate->GetValue(); if (value < 10) { @@ -1019,7 +1040,6 @@ private: } return {{"gl_ViewportIndex", Type::Int}}; case 3: - UNIMPLEMENTED_MSG("Requires some state changes for gl_PointSize to work in shader"); return {{"gl_PointSize", Type::Float}}; } return {}; @@ -1099,7 +1119,11 @@ private: } else if (!meta->ptp.empty()) { expr += "Offsets"; } - expr += '(' + GetSampler(meta->sampler) + ", "; + if (!meta->sampler.IsIndexed()) { + expr += '(' + GetSampler(meta->sampler) + ", "; + } else { + expr += '(' + GetSampler(meta->sampler) + '[' + Visit(meta->index).AsUint() + "], "; + } expr += coord_constructors.at(count + (has_array ? 1 : 0) + (has_shadow && !separate_dc ? 
1 : 0) - 1); expr += '('; @@ -1311,6 +1335,8 @@ private: const std::string final_offset = fmt::format("({} - {}) >> 2", real, base); target = {fmt::format("{}[{}]", GetGlobalMemory(gmem->GetDescriptor()), final_offset), Type::Uint}; + } else if (const auto cv = std::get_if<CustomVarNode>(&*dest)) { + target = {GetCustomVariable(cv->GetIndex()), Type::Float}; } else { UNREACHABLE_MSG("Assign called without a proper target"); } @@ -1858,10 +1884,7 @@ private: template <const std::string_view& opname, Type type> Expression Atomic(Operation operation) { - ASSERT(stage == ShaderType::Compute); - auto& smem = std::get<SmemNode>(*operation[0]); - - return {fmt::format("atomic{}(smem[{} >> 2], {})", opname, Visit(smem.GetAddress()).AsInt(), + return {fmt::format("atomic{}({}, {})", opname, Visit(operation[0]).GetCode(), Visit(operation[1]).As(type)), type}; } @@ -2241,6 +2264,10 @@ private: return GetDeclarationWithSuffix(index, "gpr"); } + std::string GetCustomVariable(u32 index) const { + return GetDeclarationWithSuffix(index, "custom_var"); + } + std::string GetPredicate(Tegra::Shader::Pred pred) const { return GetDeclarationWithSuffix(static_cast<u32>(pred), "pred"); } diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp index cf874a09a..1fc204f6f 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp @@ -53,7 +53,7 @@ struct BindlessSamplerKey { Tegra::Engines::SamplerDescriptor sampler{}; }; -constexpr u32 NativeVersion = 11; +constexpr u32 NativeVersion = 12; // Making sure sizes doesn't change by accident static_assert(sizeof(ProgramVariant) == 20); @@ -186,7 +186,8 @@ ShaderDiskCacheOpenGL::LoadTransferable() { u32 num_bound_samplers{}; u32 num_bindless_samplers{}; if (file.ReadArray(&usage.unique_identifier, 1) != 1 || - file.ReadArray(&usage.variant, 1) != 1 || file.ReadArray(&num_keys, 1) != 1 || + file.ReadArray(&usage.variant, 1) != 1 || + file.ReadArray(&usage.bound_buffer, 1) != 1 || file.ReadArray(&num_keys, 1) != 1 || file.ReadArray(&num_bound_samplers, 1) != 1 || file.ReadArray(&num_bindless_samplers, 1) != 1) { LOG_ERROR(Render_OpenGL, error_loading); @@ -281,7 +282,9 @@ ShaderDiskCacheOpenGL::LoadPrecompiledFile(FileUtil::IOFile& file) { u32 num_bindless_samplers{}; ShaderDiskCacheUsage usage; if (!LoadObjectFromPrecompiled(usage.unique_identifier) || - !LoadObjectFromPrecompiled(usage.variant) || !LoadObjectFromPrecompiled(num_keys) || + !LoadObjectFromPrecompiled(usage.variant) || + !LoadObjectFromPrecompiled(usage.bound_buffer) || + !LoadObjectFromPrecompiled(num_keys) || !LoadObjectFromPrecompiled(num_bound_samplers) || !LoadObjectFromPrecompiled(num_bindless_samplers)) { return {}; @@ -393,6 +396,7 @@ void ShaderDiskCacheOpenGL::SaveUsage(const ShaderDiskCacheUsage& usage) { if (file.WriteObject(TransferableEntryKind::Usage) != 1 || file.WriteObject(usage.unique_identifier) != 1 || file.WriteObject(usage.variant) != 1 || + file.WriteObject(usage.bound_buffer) != 1 || file.WriteObject(static_cast<u32>(usage.keys.size())) != 1 || file.WriteObject(static_cast<u32>(usage.bound_samplers.size())) != 1 || file.WriteObject(static_cast<u32>(usage.bindless_samplers.size())) != 1) { @@ -447,7 +451,7 @@ void ShaderDiskCacheOpenGL::SaveDump(const ShaderDiskCacheUsage& usage, GLuint p }; if (!SaveObjectToPrecompiled(usage.unique_identifier) || - !SaveObjectToPrecompiled(usage.variant) || + !SaveObjectToPrecompiled(usage.variant) || 
!SaveObjectToPrecompiled(usage.bound_buffer) || !SaveObjectToPrecompiled(static_cast<u32>(usage.keys.size())) || !SaveObjectToPrecompiled(static_cast<u32>(usage.bound_samplers.size())) || !SaveObjectToPrecompiled(static_cast<u32>(usage.bindless_samplers.size()))) { diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h index 69a2fbdda..ef2371f6d 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h @@ -79,6 +79,7 @@ static_assert(std::is_trivially_copyable_v<ProgramVariant>); struct ShaderDiskCacheUsage { u64 unique_identifier{}; ProgramVariant variant; + u32 bound_buffer{}; VideoCommon::Shader::KeyMap keys; VideoCommon::Shader::BoundSamplerMap bound_samplers; VideoCommon::Shader::BindlessSamplerMap bindless_samplers; diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp index cc185e9e1..ab1f7983c 100644 --- a/src/video_core/renderer_opengl/gl_state.cpp +++ b/src/video_core/renderer_opengl/gl_state.cpp @@ -128,6 +128,7 @@ void OpenGLState::ApplyClipDistances() { void OpenGLState::ApplyPointSize() { Enable(GL_PROGRAM_POINT_SIZE, cur_state.point.program_control, point.program_control); + Enable(GL_POINT_SPRITE, cur_state.point.sprite, point.sprite); if (UpdateValue(cur_state.point.size, point.size)) { glPointSize(point.size); } diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h index 678e5cd89..4953eeda2 100644 --- a/src/video_core/renderer_opengl/gl_state.h +++ b/src/video_core/renderer_opengl/gl_state.h @@ -132,6 +132,7 @@ public: struct { bool program_control = false; // GL_PROGRAM_POINT_SIZE + bool sprite = false; // GL_POINT_SPRITE GLfloat size = 1.0f; // GL_POINT_SIZE } point; diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index e95eb069e..d4b81cd87 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -176,6 +176,19 @@ GLint GetSwizzleSource(SwizzleSource source) { return GL_NONE; } +GLenum GetComponent(PixelFormat format, bool is_first) { + switch (format) { + case PixelFormat::Z24S8: + case PixelFormat::Z32FS8: + return is_first ? GL_DEPTH_COMPONENT : GL_STENCIL_INDEX; + case PixelFormat::S8Z24: + return is_first ? 
GL_STENCIL_INDEX : GL_DEPTH_COMPONENT; + default: + UNREACHABLE(); + return GL_DEPTH_COMPONENT; + } +} + void ApplyTextureDefaults(const SurfaceParams& params, GLuint texture) { if (params.IsBuffer()) { return; @@ -184,7 +197,7 @@ void ApplyTextureDefaults(const SurfaceParams& params, GLuint texture) { glTextureParameteri(texture, GL_TEXTURE_MAG_FILTER, GL_LINEAR); glTextureParameteri(texture, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); glTextureParameteri(texture, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); - glTextureParameteri(texture, GL_TEXTURE_MAX_LEVEL, params.num_levels - 1); + glTextureParameteri(texture, GL_TEXTURE_MAX_LEVEL, static_cast<GLint>(params.num_levels - 1)); if (params.num_levels == 1) { glTextureParameterf(texture, GL_TEXTURE_LOD_BIAS, 1000.0f); } @@ -416,11 +429,21 @@ void CachedSurfaceView::ApplySwizzle(SwizzleSource x_source, SwizzleSource y_sou if (new_swizzle == swizzle) return; swizzle = new_swizzle; - const std::array<GLint, 4> gl_swizzle = {GetSwizzleSource(x_source), GetSwizzleSource(y_source), - GetSwizzleSource(z_source), - GetSwizzleSource(w_source)}; + const std::array gl_swizzle = {GetSwizzleSource(x_source), GetSwizzleSource(y_source), + GetSwizzleSource(z_source), GetSwizzleSource(w_source)}; const GLuint handle = GetTexture(); - glTextureParameteriv(handle, GL_TEXTURE_SWIZZLE_RGBA, gl_swizzle.data()); + const PixelFormat format = surface.GetSurfaceParams().pixel_format; + switch (format) { + case PixelFormat::Z24S8: + case PixelFormat::Z32FS8: + case PixelFormat::S8Z24: + glTextureParameteri(handle, GL_DEPTH_STENCIL_TEXTURE_MODE, + GetComponent(format, x_source == SwizzleSource::R)); + break; + default: + glTextureParameteriv(handle, GL_TEXTURE_SWIZZLE_RGBA, gl_swizzle.data()); + break; + } } OGLTextureView CachedSurfaceView::CreateTextureView() const { @@ -529,8 +552,11 @@ void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view, const Common::Rectangle<u32>& dst_rect = copy_config.dst_rect; const bool is_linear = copy_config.filter == Tegra::Engines::Fermi2D::Filter::Linear; - glBlitFramebuffer(src_rect.left, src_rect.top, src_rect.right, src_rect.bottom, dst_rect.left, - dst_rect.top, dst_rect.right, dst_rect.bottom, buffers, + glBlitFramebuffer(static_cast<GLint>(src_rect.left), static_cast<GLint>(src_rect.top), + static_cast<GLint>(src_rect.right), static_cast<GLint>(src_rect.bottom), + static_cast<GLint>(dst_rect.left), static_cast<GLint>(dst_rect.top), + static_cast<GLint>(dst_rect.right), static_cast<GLint>(dst_rect.bottom), + buffers, is_linear && (buffers == GL_COLOR_BUFFER_BIT) ? 
GL_LINEAR : GL_NEAREST); } diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h index ea4f35663..7ed505628 100644 --- a/src/video_core/renderer_opengl/maxwell_to_gl.h +++ b/src/video_core/renderer_opengl/maxwell_to_gl.h @@ -47,8 +47,7 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) { case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return GL_UNSIGNED_INT_2_10_10_10_REV; default: - LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - UNREACHABLE(); + LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); return {}; } case Maxwell::VertexAttribute::Type::SignedInt: @@ -72,8 +71,7 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) { case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return GL_INT_2_10_10_10_REV; default: - LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - UNREACHABLE(); + LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); return {}; } case Maxwell::VertexAttribute::Type::Float: @@ -89,13 +87,19 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) { case Maxwell::VertexAttribute::Size::Size_32_32_32_32: return GL_FLOAT; default: - LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - UNREACHABLE(); + LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); + return {}; + } + case Maxwell::VertexAttribute::Type::UnsignedScaled: + switch (attrib.size) { + case Maxwell::VertexAttribute::Size::Size_8_8: + return GL_UNSIGNED_BYTE; + default: + LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); return {}; } default: - LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex type={}", attrib.TypeString()); - UNREACHABLE(); + LOG_ERROR(Render_OpenGL, "Unimplemented vertex type={}", attrib.TypeString()); return {}; } } diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index 331808113..5403c3ab7 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -164,7 +164,7 @@ struct FormatTuple { {vk::Format::eUndefined, {}}, // ASTC_2D_5X4 {vk::Format::eUndefined, {}}, // BGRA8_SRGB {vk::Format::eBc1RgbaSrgbBlock, {}}, // DXT1_SRGB - {vk::Format::eUndefined, {}}, // DXT23_SRGB + {vk::Format::eBc2SrgbBlock, {}}, // DXT23_SRGB {vk::Format::eBc3SrgbBlock, {}}, // DXT45_SRGB {vk::Format::eBc7SrgbBlock, {}}, // BC7U_SRGB {vk::Format::eR4G4B4A4UnormPack16, Attachable}, // R4G4B4A4U @@ -363,6 +363,8 @@ vk::Format VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttr return vk::Format::eR8G8B8A8Uint; case Maxwell::VertexAttribute::Size::Size_32: return vk::Format::eR32Uint; + case Maxwell::VertexAttribute::Size::Size_32_32_32_32: + return vk::Format::eR32G32B32A32Uint; default: break; } diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp new file mode 100644 index 000000000..d5032b432 --- /dev/null +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -0,0 +1,265 @@ +// Copyright 2018 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include <memory> +#include <optional> +#include <vector> + +#include <fmt/format.h> + +#include "common/assert.h" +#include "common/logging/log.h" +#include "common/telemetry.h" +#include "core/core.h" +#include "core/core_timing.h" +#include "core/frontend/emu_window.h" +#include "core/memory.h" +#include "core/perf_stats.h" +#include "core/settings.h" +#include "core/telemetry_session.h" +#include "video_core/gpu.h" +#include "video_core/renderer_vulkan/declarations.h" +#include "video_core/renderer_vulkan/renderer_vulkan.h" +#include "video_core/renderer_vulkan/vk_blit_screen.h" +#include "video_core/renderer_vulkan/vk_device.h" +#include "video_core/renderer_vulkan/vk_memory_manager.h" +#include "video_core/renderer_vulkan/vk_rasterizer.h" +#include "video_core/renderer_vulkan/vk_resource_manager.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" +#include "video_core/renderer_vulkan/vk_swapchain.h" + +namespace Vulkan { + +namespace { + +VkBool32 DebugCallback(VkDebugUtilsMessageSeverityFlagBitsEXT severity_, + VkDebugUtilsMessageTypeFlagsEXT type, + const VkDebugUtilsMessengerCallbackDataEXT* data, + [[maybe_unused]] void* user_data) { + const vk::DebugUtilsMessageSeverityFlagBitsEXT severity{severity_}; + const char* message{data->pMessage}; + + if (severity & vk::DebugUtilsMessageSeverityFlagBitsEXT::eError) { + LOG_CRITICAL(Render_Vulkan, "{}", message); + } else if (severity & vk::DebugUtilsMessageSeverityFlagBitsEXT::eWarning) { + LOG_WARNING(Render_Vulkan, "{}", message); + } else if (severity & vk::DebugUtilsMessageSeverityFlagBitsEXT::eInfo) { + LOG_INFO(Render_Vulkan, "{}", message); + } else if (severity & vk::DebugUtilsMessageSeverityFlagBitsEXT::eVerbose) { + LOG_DEBUG(Render_Vulkan, "{}", message); + } + return VK_FALSE; +} + +std::string GetReadableVersion(u32 version) { + return fmt::format("{}.{}.{}", VK_VERSION_MAJOR(version), VK_VERSION_MINOR(version), + VK_VERSION_PATCH(version)); +} + +std::string GetDriverVersion(const VKDevice& device) { + // Extracted from + // https://github.com/SaschaWillems/vulkan.gpuinfo.org/blob/5dddea46ea1120b0df14eef8f15ff8e318e35462/functions.php#L308-L314 + const u32 version = device.GetDriverVersion(); + + if (device.GetDriverID() == vk::DriverIdKHR::eNvidiaProprietary) { + const u32 major = (version >> 22) & 0x3ff; + const u32 minor = (version >> 14) & 0x0ff; + const u32 secondary = (version >> 6) & 0x0ff; + const u32 tertiary = version & 0x003f; + return fmt::format("{}.{}.{}.{}", major, minor, secondary, tertiary); + } + if (device.GetDriverID() == vk::DriverIdKHR::eIntelProprietaryWindows) { + const u32 major = version >> 14; + const u32 minor = version & 0x3fff; + return fmt::format("{}.{}", major, minor); + } + + return GetReadableVersion(version); +} + +std::string BuildCommaSeparatedExtensions(std::vector<std::string> available_extensions) { + std::sort(std::begin(available_extensions), std::end(available_extensions)); + + static constexpr std::size_t AverageExtensionSize = 64; + std::string separated_extensions; + separated_extensions.reserve(available_extensions.size() * AverageExtensionSize); + + const auto end = std::end(available_extensions); + for (auto extension = std::begin(available_extensions); extension != end; ++extension) { + if (const bool is_last = extension + 1 == end; is_last) { + separated_extensions += *extension; + } else { + separated_extensions += fmt::format("{},", *extension); + } + } + return separated_extensions; +} + +} // Anonymous namespace + 
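Editorial aside on GetDriverVersion above: the NVIDIA branch assumes that vendor's packed version layout of 10 bits major, 8 bits minor, 8 bits secondary and 6 bits tertiary, while drivers without a special case fall back to GetReadableVersion and the standard VK_VERSION_MAJOR/MINOR/PATCH packing. A minimal standalone sketch of the NVIDIA decode follows; the raw constant and the printed result are made-up values for illustration only, not taken from any real driver.

#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical packed value: (442 << 22) | (59 << 14), i.e. a "442.59.0.0"-style version.
    const std::uint32_t version = 1854849024u;
    const unsigned major = (version >> 22) & 0x3ffu;    // top 10 bits
    const unsigned minor = (version >> 14) & 0x0ffu;    // next 8 bits
    const unsigned secondary = (version >> 6) & 0x0ffu; // next 8 bits
    const unsigned tertiary = version & 0x003fu;        // low 6 bits
    std::printf("%u.%u.%u.%u\n", major, minor, secondary, tertiary); // prints "442.59.0.0"
    return 0;
}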
+RendererVulkan::RendererVulkan(Core::Frontend::EmuWindow& window, Core::System& system) + : RendererBase(window), system{system} {} + +RendererVulkan::~RendererVulkan() { + ShutDown(); +} + +void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { + const auto& layout = render_window.GetFramebufferLayout(); + if (framebuffer && layout.width > 0 && layout.height > 0 && render_window.IsShown()) { + const VAddr framebuffer_addr = framebuffer->address + framebuffer->offset; + const bool use_accelerated = + rasterizer->AccelerateDisplay(*framebuffer, framebuffer_addr, framebuffer->stride); + const bool is_srgb = use_accelerated && screen_info.is_srgb; + if (swapchain->HasFramebufferChanged(layout) || swapchain->GetSrgbState() != is_srgb) { + swapchain->Create(layout.width, layout.height, is_srgb); + blit_screen->Recreate(); + } + + scheduler->WaitWorker(); + + swapchain->AcquireNextImage(); + const auto [fence, render_semaphore] = blit_screen->Draw(*framebuffer, use_accelerated); + + scheduler->Flush(false, render_semaphore); + + if (swapchain->Present(render_semaphore, fence)) { + blit_screen->Recreate(); + } + + render_window.SwapBuffers(); + rasterizer->TickFrame(); + } + + render_window.PollEvents(); +} + +bool RendererVulkan::Init() { + PFN_vkGetInstanceProcAddr vkGetInstanceProcAddr{}; + render_window.RetrieveVulkanHandlers(&vkGetInstanceProcAddr, &instance, &surface); + const vk::DispatchLoaderDynamic dldi(instance, vkGetInstanceProcAddr); + + std::optional<vk::DebugUtilsMessengerEXT> callback; + if (Settings::values.renderer_debug && dldi.vkCreateDebugUtilsMessengerEXT) { + callback = CreateDebugCallback(dldi); + if (!callback) { + return false; + } + } + + if (!PickDevices(dldi)) { + if (callback) { + instance.destroy(*callback, nullptr, dldi); + } + return false; + } + debug_callback = UniqueDebugUtilsMessengerEXT( + *callback, vk::ObjectDestroy<vk::Instance, vk::DispatchLoaderDynamic>( + instance, nullptr, device->GetDispatchLoader())); + + Report(); + + memory_manager = std::make_unique<VKMemoryManager>(*device); + + resource_manager = std::make_unique<VKResourceManager>(*device); + + const auto& framebuffer = render_window.GetFramebufferLayout(); + swapchain = std::make_unique<VKSwapchain>(surface, *device); + swapchain->Create(framebuffer.width, framebuffer.height, false); + + scheduler = std::make_unique<VKScheduler>(*device, *resource_manager); + + rasterizer = std::make_unique<RasterizerVulkan>(system, render_window, screen_info, *device, + *resource_manager, *memory_manager, *scheduler); + + blit_screen = std::make_unique<VKBlitScreen>(system, render_window, *rasterizer, *device, + *resource_manager, *memory_manager, *swapchain, + *scheduler, screen_info); + + return true; +} + +void RendererVulkan::ShutDown() { + if (!device) { + return; + } + const auto dev = device->GetLogical(); + const auto& dld = device->GetDispatchLoader(); + if (dev && dld.vkDeviceWaitIdle) { + dev.waitIdle(dld); + } + + rasterizer.reset(); + blit_screen.reset(); + scheduler.reset(); + swapchain.reset(); + memory_manager.reset(); + resource_manager.reset(); + device.reset(); +} + +std::optional<vk::DebugUtilsMessengerEXT> RendererVulkan::CreateDebugCallback( + const vk::DispatchLoaderDynamic& dldi) { + const vk::DebugUtilsMessengerCreateInfoEXT callback_ci( + {}, + vk::DebugUtilsMessageSeverityFlagBitsEXT::eError | + vk::DebugUtilsMessageSeverityFlagBitsEXT::eWarning | + vk::DebugUtilsMessageSeverityFlagBitsEXT::eInfo | + vk::DebugUtilsMessageSeverityFlagBitsEXT::eVerbose, + 
vk::DebugUtilsMessageTypeFlagBitsEXT::eGeneral | + vk::DebugUtilsMessageTypeFlagBitsEXT::eValidation | + vk::DebugUtilsMessageTypeFlagBitsEXT::ePerformance, + &DebugCallback, nullptr); + vk::DebugUtilsMessengerEXT callback; + if (instance.createDebugUtilsMessengerEXT(&callback_ci, nullptr, &callback, dldi) != + vk::Result::eSuccess) { + LOG_ERROR(Render_Vulkan, "Failed to create debug callback"); + return {}; + } + return callback; +} + +bool RendererVulkan::PickDevices(const vk::DispatchLoaderDynamic& dldi) { + const auto devices = instance.enumeratePhysicalDevices(dldi); + + // TODO(Rodrigo): Choose device from config file + const s32 device_index = Settings::values.vulkan_device; + if (device_index < 0 || device_index >= static_cast<s32>(devices.size())) { + LOG_ERROR(Render_Vulkan, "Invalid device index {}!", device_index); + return false; + } + const vk::PhysicalDevice physical_device = devices[device_index]; + + if (!VKDevice::IsSuitable(dldi, physical_device, surface)) { + return false; + } + + device = std::make_unique<VKDevice>(dldi, physical_device, surface); + return device->Create(dldi, instance); +} + +void RendererVulkan::Report() const { + const std::string vendor_name{device->GetVendorName()}; + const std::string model_name{device->GetModelName()}; + const std::string driver_version = GetDriverVersion(*device); + const std::string driver_name = fmt::format("{} {}", vendor_name, driver_version); + + const std::string api_version = GetReadableVersion(device->GetApiVersion()); + + const std::string extensions = BuildCommaSeparatedExtensions(device->GetAvailableExtensions()); + + LOG_INFO(Render_Vulkan, "Driver: {}", driver_name); + LOG_INFO(Render_Vulkan, "Device: {}", model_name); + LOG_INFO(Render_Vulkan, "Vulkan: {}", api_version); + + auto& telemetry_session = system.TelemetrySession(); + constexpr auto field = Telemetry::FieldType::UserSystem; + telemetry_session.AddField(field, "GPU_Vendor", vendor_name); + telemetry_session.AddField(field, "GPU_Model", model_name); + telemetry_session.AddField(field, "GPU_Vulkan_Driver", driver_name); + telemetry_session.AddField(field, "GPU_Vulkan_Version", api_version); + telemetry_session.AddField(field, "GPU_Vulkan_Extensions", extensions); +} + +} // namespace Vulkan
\ No newline at end of file diff --git a/src/video_core/renderer_vulkan/vk_device.cpp b/src/video_core/renderer_vulkan/vk_device.cpp index 939eebe83..d1da4f9d3 100644 --- a/src/video_core/renderer_vulkan/vk_device.cpp +++ b/src/video_core/renderer_vulkan/vk_device.cpp @@ -104,8 +104,11 @@ bool VKDevice::Create(const vk::DispatchLoaderDynamic& dldi, vk::Instance instan features.depthBiasClamp = true; features.geometryShader = true; features.tessellationShader = true; + features.occlusionQueryPrecise = true; features.fragmentStoresAndAtomics = true; features.shaderImageGatherExtended = true; + features.shaderStorageImageReadWithoutFormat = + is_shader_storage_img_read_without_format_supported; features.shaderStorageImageWriteWithoutFormat = true; features.textureCompressionASTC_LDR = is_optimal_astc_supported; @@ -117,6 +120,10 @@ bool VKDevice::Create(const vk::DispatchLoaderDynamic& dldi, vk::Instance instan bit8_storage.uniformAndStorageBuffer8BitAccess = true; SetNext(next, bit8_storage); + vk::PhysicalDeviceHostQueryResetFeaturesEXT host_query_reset; + host_query_reset.hostQueryReset = true; + SetNext(next, host_query_reset); + vk::PhysicalDeviceFloat16Int8FeaturesKHR float16_int8; if (is_float16_supported) { float16_int8.shaderFloat16 = true; @@ -273,6 +280,7 @@ bool VKDevice::IsSuitable(const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDev VK_EXT_VERTEX_ATTRIBUTE_DIVISOR_EXTENSION_NAME, VK_EXT_SHADER_SUBGROUP_BALLOT_EXTENSION_NAME, VK_EXT_SHADER_SUBGROUP_VOTE_EXTENSION_NAME, + VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME, }; std::bitset<required_extensions.size()> available_extensions{}; @@ -340,6 +348,7 @@ bool VKDevice::IsSuitable(const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDev std::make_pair(features.depthBiasClamp, "depthBiasClamp"), std::make_pair(features.geometryShader, "geometryShader"), std::make_pair(features.tessellationShader, "tessellationShader"), + std::make_pair(features.occlusionQueryPrecise, "occlusionQueryPrecise"), std::make_pair(features.fragmentStoresAndAtomics, "fragmentStoresAndAtomics"), std::make_pair(features.shaderImageGatherExtended, "shaderImageGatherExtended"), std::make_pair(features.shaderStorageImageWriteWithoutFormat, @@ -376,7 +385,7 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami } }; - extensions.reserve(13); + extensions.reserve(14); extensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME); extensions.push_back(VK_KHR_16BIT_STORAGE_EXTENSION_NAME); extensions.push_back(VK_KHR_8BIT_STORAGE_EXTENSION_NAME); @@ -384,6 +393,7 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami extensions.push_back(VK_EXT_VERTEX_ATTRIBUTE_DIVISOR_EXTENSION_NAME); extensions.push_back(VK_EXT_SHADER_SUBGROUP_BALLOT_EXTENSION_NAME); extensions.push_back(VK_EXT_SHADER_SUBGROUP_VOTE_EXTENSION_NAME); + extensions.push_back(VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME); [[maybe_unused]] const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED"); @@ -400,8 +410,10 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami VK_EXT_SHADER_VIEWPORT_INDEX_LAYER_EXTENSION_NAME, true); Test(extension, ext_subgroup_size_control, VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME, false); - Test(extension, nv_device_diagnostic_checkpoints, - VK_NV_DEVICE_DIAGNOSTIC_CHECKPOINTS_EXTENSION_NAME, true); + if (Settings::values.renderer_debug) { + Test(extension, nv_device_diagnostic_checkpoints, + VK_NV_DEVICE_DIAGNOSTIC_CHECKPOINTS_EXTENSION_NAME, true); + } } if 
(khr_shader_float16_int8) { @@ -455,6 +467,8 @@ void VKDevice::SetupFamilies(const vk::DispatchLoaderDynamic& dldi, vk::SurfaceK void VKDevice::SetupFeatures(const vk::DispatchLoaderDynamic& dldi) { const auto supported_features{physical.getFeatures(dldi)}; + is_shader_storage_img_read_without_format_supported = + supported_features.shaderStorageImageReadWithoutFormat; is_optimal_astc_supported = IsOptimalAstcSupported(supported_features, dldi); } @@ -528,6 +542,7 @@ std::unordered_map<vk::Format, vk::FormatProperties> VKDevice::GetFormatProperti vk::Format::eBc6HUfloatBlock, vk::Format::eBc6HSfloatBlock, vk::Format::eBc1RgbaSrgbBlock, + vk::Format::eBc2SrgbBlock, vk::Format::eBc3SrgbBlock, vk::Format::eBc7SrgbBlock, vk::Format::eAstc4x4SrgbBlock, diff --git a/src/video_core/renderer_vulkan/vk_device.h b/src/video_core/renderer_vulkan/vk_device.h index 72603f9f6..2c27ad730 100644 --- a/src/video_core/renderer_vulkan/vk_device.h +++ b/src/video_core/renderer_vulkan/vk_device.h @@ -122,6 +122,11 @@ public: return properties.limits.maxPushConstantsSize; } + /// Returns true if Shader storage Image Read Without Format supported. + bool IsShaderStorageImageReadWithoutFormatSupported() const { + return is_shader_storage_img_read_without_format_supported; + } + /// Returns true if ASTC is natively supported. bool IsOptimalAstcSupported() const { return is_optimal_astc_supported; @@ -227,6 +232,8 @@ private: bool ext_depth_range_unrestricted{}; ///< Support for VK_EXT_depth_range_unrestricted. bool ext_shader_viewport_index_layer{}; ///< Support for VK_EXT_shader_viewport_index_layer. bool nv_device_diagnostic_checkpoints{}; ///< Support for VK_NV_device_diagnostic_checkpoints. + bool is_shader_storage_img_read_without_format_supported{}; ///< Support for shader storage + ///< image read without format // Telemetry parameters std::string vendor_name; ///< Device's driver name. diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 48e23d4cd..7ddf7d3ee 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -325,9 +325,6 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) { specialization.tessellation.primitive = fixed_state.tessellation.primitive; specialization.tessellation.spacing = fixed_state.tessellation.spacing; specialization.tessellation.clockwise = fixed_state.tessellation.clockwise; - for (const auto& rt : key.renderpass_params.color_attachments) { - specialization.enabled_rendertargets.set(rt.index); - } SPIRVProgram program; std::vector<vk::DescriptorSetLayoutBinding> bindings; diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp new file mode 100644 index 000000000..ffbf60dda --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp @@ -0,0 +1,122 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include <algorithm> +#include <cstddef> +#include <cstdint> +#include <utility> +#include <vector> + +#include "video_core/renderer_vulkan/declarations.h" +#include "video_core/renderer_vulkan/vk_device.h" +#include "video_core/renderer_vulkan/vk_query_cache.h" +#include "video_core/renderer_vulkan/vk_resource_manager.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" + +namespace Vulkan { + +namespace { + +constexpr std::array QUERY_TARGETS = {vk::QueryType::eOcclusion}; + +constexpr vk::QueryType GetTarget(VideoCore::QueryType type) { + return QUERY_TARGETS[static_cast<std::size_t>(type)]; +} + +} // Anonymous namespace + +QueryPool::QueryPool() : VKFencedPool{GROW_STEP} {} + +QueryPool::~QueryPool() = default; + +void QueryPool::Initialize(const VKDevice& device_, VideoCore::QueryType type_) { + device = &device_; + type = type_; +} + +std::pair<vk::QueryPool, std::uint32_t> QueryPool::Commit(VKFence& fence) { + std::size_t index; + do { + index = CommitResource(fence); + } while (usage[index]); + usage[index] = true; + + return {*pools[index / GROW_STEP], static_cast<std::uint32_t>(index % GROW_STEP)}; +} + +void QueryPool::Allocate(std::size_t begin, std::size_t end) { + usage.resize(end); + + const auto dev = device->GetLogical(); + const u32 size = static_cast<u32>(end - begin); + const vk::QueryPoolCreateInfo query_pool_ci({}, GetTarget(type), size, {}); + pools.push_back(dev.createQueryPoolUnique(query_pool_ci, nullptr, device->GetDispatchLoader())); +} + +void QueryPool::Reserve(std::pair<vk::QueryPool, std::uint32_t> query) { + const auto it = + std::find_if(std::begin(pools), std::end(pools), + [query_pool = query.first](auto& pool) { return query_pool == *pool; }); + ASSERT(it != std::end(pools)); + + const std::ptrdiff_t pool_index = std::distance(std::begin(pools), it); + usage[pool_index * GROW_STEP + static_cast<std::ptrdiff_t>(query.second)] = false; +} + +VKQueryCache::VKQueryCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer, + const VKDevice& device, VKScheduler& scheduler) + : VideoCommon::QueryCacheBase<VKQueryCache, CachedQuery, CounterStream, HostCounter, + QueryPool>{system, rasterizer}, + device{device}, scheduler{scheduler} { + for (std::size_t i = 0; i < static_cast<std::size_t>(VideoCore::NumQueryTypes); ++i) { + query_pools[i].Initialize(device, static_cast<VideoCore::QueryType>(i)); + } +} + +VKQueryCache::~VKQueryCache() = default; + +std::pair<vk::QueryPool, std::uint32_t> VKQueryCache::AllocateQuery(VideoCore::QueryType type) { + return query_pools[static_cast<std::size_t>(type)].Commit(scheduler.GetFence()); +} + +void VKQueryCache::Reserve(VideoCore::QueryType type, + std::pair<vk::QueryPool, std::uint32_t> query) { + query_pools[static_cast<std::size_t>(type)].Reserve(query); +} + +HostCounter::HostCounter(VKQueryCache& cache, std::shared_ptr<HostCounter> dependency, + VideoCore::QueryType type) + : VideoCommon::HostCounterBase<VKQueryCache, HostCounter>{std::move(dependency)}, cache{cache}, + type{type}, query{cache.AllocateQuery(type)}, ticks{cache.Scheduler().Ticks()} { + const auto dev = cache.Device().GetLogical(); + cache.Scheduler().Record([dev, query = query](vk::CommandBuffer cmdbuf, auto& dld) { + dev.resetQueryPoolEXT(query.first, query.second, 1, dld); + cmdbuf.beginQuery(query.first, query.second, vk::QueryControlFlagBits::ePrecise, dld); + }); +} + +HostCounter::~HostCounter() { + cache.Reserve(type, query); +} + +void HostCounter::EndQuery() { + cache.Scheduler().Record([query = query](auto cmdbuf, auto& 
dld) { + cmdbuf.endQuery(query.first, query.second, dld); + }); +} + +u64 HostCounter::BlockingQuery() const { + if (ticks >= cache.Scheduler().Ticks()) { + cache.Scheduler().Flush(); + } + + const auto dev = cache.Device().GetLogical(); + const auto& dld = cache.Device().GetDispatchLoader(); + u64 value; + dev.getQueryPoolResults(query.first, query.second, 1, sizeof(value), &value, sizeof(value), + vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait, dld); + return value; +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_query_cache.h b/src/video_core/renderer_vulkan/vk_query_cache.h new file mode 100644 index 000000000..c3092ee96 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_query_cache.h @@ -0,0 +1,104 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <cstddef> +#include <cstdint> +#include <memory> +#include <utility> +#include <vector> + +#include "common/common_types.h" +#include "video_core/query_cache.h" +#include "video_core/renderer_vulkan/declarations.h" +#include "video_core/renderer_vulkan/vk_resource_manager.h" + +namespace VideoCore { +class RasterizerInterface; +} + +namespace Vulkan { + +class CachedQuery; +class HostCounter; +class VKDevice; +class VKQueryCache; +class VKScheduler; + +using CounterStream = VideoCommon::CounterStreamBase<VKQueryCache, HostCounter>; + +class QueryPool final : public VKFencedPool { +public: + explicit QueryPool(); + ~QueryPool() override; + + void Initialize(const VKDevice& device, VideoCore::QueryType type); + + std::pair<vk::QueryPool, std::uint32_t> Commit(VKFence& fence); + + void Reserve(std::pair<vk::QueryPool, std::uint32_t> query); + +protected: + void Allocate(std::size_t begin, std::size_t end) override; + +private: + static constexpr std::size_t GROW_STEP = 512; + + const VKDevice* device = nullptr; + VideoCore::QueryType type = {}; + + std::vector<UniqueQueryPool> pools; + std::vector<bool> usage; +}; + +class VKQueryCache final + : public VideoCommon::QueryCacheBase<VKQueryCache, CachedQuery, CounterStream, HostCounter, + QueryPool> { +public: + explicit VKQueryCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer, + const VKDevice& device, VKScheduler& scheduler); + ~VKQueryCache(); + + std::pair<vk::QueryPool, std::uint32_t> AllocateQuery(VideoCore::QueryType type); + + void Reserve(VideoCore::QueryType type, std::pair<vk::QueryPool, std::uint32_t> query); + + const VKDevice& Device() const noexcept { + return device; + } + + VKScheduler& Scheduler() const noexcept { + return scheduler; + } + +private: + const VKDevice& device; + VKScheduler& scheduler; +}; + +class HostCounter final : public VideoCommon::HostCounterBase<VKQueryCache, HostCounter> { +public: + explicit HostCounter(VKQueryCache& cache, std::shared_ptr<HostCounter> dependency, + VideoCore::QueryType type); + ~HostCounter(); + + void EndQuery(); + +private: + u64 BlockingQuery() const override; + + VKQueryCache& cache; + const VideoCore::QueryType type; + const std::pair<vk::QueryPool, std::uint32_t> query; + const u64 ticks; +}; + +class CachedQuery : public VideoCommon::CachedQueryBase<HostCounter> { +public: + explicit CachedQuery(VKQueryCache&, VideoCore::QueryType, VAddr cpu_addr, u8* host_ptr) + : VideoCommon::CachedQueryBase<HostCounter>{cpu_addr, host_ptr} {} +}; + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp 
b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index d2c6b1189..31c078f6a 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -289,25 +289,19 @@ RasterizerVulkan::RasterizerVulkan(Core::System& system, Core::Frontend::EmuWind staging_pool), pipeline_cache(system, *this, device, scheduler, descriptor_pool, update_descriptor_queue), buffer_cache(*this, system, device, memory_manager, scheduler, staging_pool), - sampler_cache(device) {} - -RasterizerVulkan::~RasterizerVulkan() = default; - -bool RasterizerVulkan::DrawBatch(bool is_indexed) { - Draw(is_indexed, false); - return true; + sampler_cache(device), query_cache(system, *this, device, scheduler) { + scheduler.SetQueryCache(query_cache); } -bool RasterizerVulkan::DrawMultiBatch(bool is_indexed) { - Draw(is_indexed, true); - return true; -} +RasterizerVulkan::~RasterizerVulkan() = default; void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) { MICROPROFILE_SCOPE(Vulkan_Drawing); FlushWork(); + query_cache.UpdateCounters(); + const auto& gpu = system.GPU().Maxwell3D(); GraphicsPipelineCacheKey key{GetFixedPipelineState(gpu.regs)}; @@ -362,6 +356,8 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) { void RasterizerVulkan::Clear() { MICROPROFILE_SCOPE(Vulkan_Clearing); + query_cache.UpdateCounters(); + const auto& gpu = system.GPU().Maxwell3D(); if (!system.GPU().Maxwell3D().ShouldExecute()) { return; @@ -429,6 +425,8 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) { sampled_views.clear(); image_views.clear(); + query_cache.UpdateCounters(); + const auto& launch_desc = system.GPU().KeplerCompute().launch_description; const ComputePipelineCacheKey key{ code_addr, @@ -471,17 +469,28 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) { }); } +void RasterizerVulkan::ResetCounter(VideoCore::QueryType type) { + query_cache.ResetCounter(type); +} + +void RasterizerVulkan::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, + std::optional<u64> timestamp) { + query_cache.Query(gpu_addr, type, timestamp); +} + void RasterizerVulkan::FlushAll() {} void RasterizerVulkan::FlushRegion(CacheAddr addr, u64 size) { texture_cache.FlushRegion(addr, size); buffer_cache.FlushRegion(addr, size); + query_cache.FlushRegion(addr, size); } void RasterizerVulkan::InvalidateRegion(CacheAddr addr, u64 size) { texture_cache.InvalidateRegion(addr, size); pipeline_cache.InvalidateRegion(addr, size); buffer_cache.InvalidateRegion(addr, size); + query_cache.InvalidateRegion(addr, size); } void RasterizerVulkan::FlushAndInvalidateRegion(CacheAddr addr, u64 size) { @@ -571,7 +580,7 @@ RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() { color_attachments[rt] = texture_cache.GetColorBufferSurface(rt, true); } if (color_attachments[rt] && WalkAttachmentOverlaps(*color_attachments[rt])) { - texceptions.set(rt); + texceptions[rt] = true; } } @@ -579,7 +588,7 @@ RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() { zeta_attachment = texture_cache.GetDepthBufferSurface(true); } if (zeta_attachment && WalkAttachmentOverlaps(*zeta_attachment)) { - texceptions.set(ZETA_TEXCEPTION_INDEX); + texceptions[ZETA_TEXCEPTION_INDEX] = true; } texture_cache.GuardRenderTargets(false); @@ -1122,11 +1131,12 @@ RenderPassParams RasterizerVulkan::GetRenderPassParams(Texceptions texceptions) for (std::size_t rt = 0; rt < static_cast<std::size_t>(regs.rt_control.count); ++rt) { const auto& rendertarget = regs.rt[rt]; - if (rendertarget.Address() 
== 0 || rendertarget.format == Tegra::RenderTargetFormat::NONE) + if (rendertarget.Address() == 0 || rendertarget.format == Tegra::RenderTargetFormat::NONE) { continue; + } renderpass_params.color_attachments.push_back(RenderPassParams::ColorAttachment{ static_cast<u32>(rt), PixelFormatFromRenderTargetFormat(rendertarget.format), - texceptions.test(rt)}); + texceptions[rt]}); } renderpass_params.has_zeta = regs.zeta_enable; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 7be71e734..138903d60 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -24,6 +24,7 @@ #include "video_core/renderer_vulkan/vk_descriptor_pool.h" #include "video_core/renderer_vulkan/vk_memory_manager.h" #include "video_core/renderer_vulkan/vk_pipeline_cache.h" +#include "video_core/renderer_vulkan/vk_query_cache.h" #include "video_core/renderer_vulkan/vk_renderpass_cache.h" #include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_sampler_cache.h" @@ -96,7 +97,7 @@ struct ImageView { vk::ImageLayout* layout = nullptr; }; -class RasterizerVulkan : public VideoCore::RasterizerAccelerated { +class RasterizerVulkan final : public VideoCore::RasterizerAccelerated { public: explicit RasterizerVulkan(Core::System& system, Core::Frontend::EmuWindow& render_window, VKScreenInfo& screen_info, const VKDevice& device, @@ -104,10 +105,11 @@ public: VKScheduler& scheduler); ~RasterizerVulkan() override; - bool DrawBatch(bool is_indexed) override; - bool DrawMultiBatch(bool is_indexed) override; + void Draw(bool is_indexed, bool is_instanced) override; void Clear() override; void DispatchCompute(GPUVAddr code_addr) override; + void ResetCounter(VideoCore::QueryType type) override; + void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override; void FlushAll() override; void FlushRegion(CacheAddr addr, u64 size) override; void InvalidateRegion(CacheAddr addr, u64 size) override; @@ -140,8 +142,6 @@ private: static constexpr std::size_t ZETA_TEXCEPTION_INDEX = 8; - void Draw(bool is_indexed, bool is_instanced); - void FlushWork(); Texceptions UpdateAttachments(); @@ -247,6 +247,7 @@ private: VKPipelineCache pipeline_cache; VKBufferCache buffer_cache; VKSamplerCache sampler_cache; + VKQueryCache query_cache; std::array<View, Maxwell::NumRenderTargets> color_attachments; View zeta_attachment; diff --git a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp index 0a8ec8398..204b7c39c 100644 --- a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp @@ -23,7 +23,14 @@ static std::optional<vk::BorderColor> TryConvertBorderColor(std::array<float, 4> } else if (color == std::array<float, 4>{1, 1, 1, 1}) { return vk::BorderColor::eFloatOpaqueWhite; } else { - return {}; + if (color[0] + color[1] + color[2] > 1.35f) { + // If color elements are brighter than roughly 0.5 average, use white border + return vk::BorderColor::eFloatOpaqueWhite; + } + if (color[3] > 0.5f) { + return vk::BorderColor::eFloatOpaqueBlack; + } + return vk::BorderColor::eFloatTransparentBlack; } } @@ -37,8 +44,6 @@ UniqueSampler VKSamplerCache::CreateSampler(const Tegra::Texture::TSCEntry& tsc) const auto border_color{tsc.GetBorderColor()}; const auto vk_border_color{TryConvertBorderColor(border_color)}; - UNIMPLEMENTED_IF_MSG(!vk_border_color, "Unimplemented 
border color {} {} {} {}", - border_color[0], border_color[1], border_color[2], border_color[3]); constexpr bool unnormalized_coords{false}; diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index d66133ad1..92bd6c344 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -6,6 +6,7 @@ #include "common/microprofile.h" #include "video_core/renderer_vulkan/declarations.h" #include "video_core/renderer_vulkan/vk_device.h" +#include "video_core/renderer_vulkan/vk_query_cache.h" #include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_scheduler.h" @@ -139,6 +140,8 @@ void VKScheduler::SubmitExecution(vk::Semaphore semaphore) { } void VKScheduler::AllocateNewContext() { + ++ticks; + std::unique_lock lock{mutex}; current_fence = next_fence; next_fence = &resource_manager.CommitFence(); @@ -146,6 +149,10 @@ void VKScheduler::AllocateNewContext() { current_cmdbuf = resource_manager.CommitCommandBuffer(*current_fence); current_cmdbuf.begin({vk::CommandBufferUsageFlagBits::eOneTimeSubmit}, device.GetDispatchLoader()); + // Enable counters once again. These are disabled when a command buffer is finished. + if (query_cache) { + query_cache->UpdateCounters(); + } } void VKScheduler::InvalidateState() { @@ -159,6 +166,7 @@ void VKScheduler::InvalidateState() { } void VKScheduler::EndPendingOperations() { + query_cache->DisableStreams(); EndRenderPass(); } diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index bcdffbba0..62fd7858b 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -4,6 +4,7 @@ #pragma once +#include <atomic> #include <condition_variable> #include <memory> #include <optional> @@ -18,6 +19,7 @@ namespace Vulkan { class VKDevice; class VKFence; +class VKQueryCache; class VKResourceManager; class VKFenceView { @@ -67,6 +69,11 @@ public: /// Binds a pipeline to the current execution context. void BindGraphicsPipeline(vk::Pipeline pipeline); + /// Assigns the query cache. + void SetQueryCache(VKQueryCache& query_cache_) { + query_cache = &query_cache_; + } + /// Returns true when viewports have been set in the current command buffer. bool TouchViewports() { return std::exchange(state.viewports, true); @@ -112,6 +119,11 @@ public: return current_fence; } + /// Returns the current command buffer tick. 
+ u64 Ticks() const { + return ticks; + } + private: class Command { public: @@ -205,6 +217,8 @@ private: const VKDevice& device; VKResourceManager& resource_manager; + VKQueryCache* query_cache = nullptr; + vk::CommandBuffer current_cmdbuf; VKFence* current_fence = nullptr; VKFence* next_fence = nullptr; @@ -227,6 +241,7 @@ private: Common::SPSCQueue<std::unique_ptr<CommandChunk>> chunk_reserve; std::mutex mutex; std::condition_variable cv; + std::atomic<u64> ticks = 0; bool quit = false; }; diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index dd6d2ef03..6d0bf6aa1 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ -275,12 +275,14 @@ public: AddCapability(spv::Capability::ImageGatherExtended); AddCapability(spv::Capability::SampledBuffer); AddCapability(spv::Capability::StorageImageWriteWithoutFormat); + AddCapability(spv::Capability::DrawParameters); AddCapability(spv::Capability::SubgroupBallotKHR); AddCapability(spv::Capability::SubgroupVoteKHR); AddExtension("SPV_KHR_shader_ballot"); AddExtension("SPV_KHR_subgroup_vote"); AddExtension("SPV_KHR_storage_buffer_storage_class"); AddExtension("SPV_KHR_variable_pointers"); + AddExtension("SPV_KHR_shader_draw_parameters"); if (ir.UsesViewportIndex()) { AddCapability(spv::Capability::MultiViewport); @@ -290,6 +292,10 @@ public: } } + if (device.IsShaderStorageImageReadWithoutFormatSupported()) { + AddCapability(spv::Capability::StorageImageReadWithoutFormat); + } + if (device.IsFloat16Supported()) { AddCapability(spv::Capability::Float16); } @@ -353,6 +359,7 @@ private: DeclareFragment(); DeclareCompute(); DeclareRegisters(); + DeclareCustomVariables(); DeclarePredicates(); DeclareLocalMemory(); DeclareSharedMemory(); @@ -491,9 +498,11 @@ private: interfaces.push_back(AddGlobalVariable(Name(out_vertex, "out_vertex"))); // Declare input attributes - vertex_index = DeclareInputBuiltIn(spv::BuiltIn::VertexIndex, t_in_uint, "vertex_index"); + vertex_index = DeclareInputBuiltIn(spv::BuiltIn::VertexIndex, t_in_int, "vertex_index"); instance_index = - DeclareInputBuiltIn(spv::BuiltIn::InstanceIndex, t_in_uint, "instance_index"); + DeclareInputBuiltIn(spv::BuiltIn::InstanceIndex, t_in_int, "instance_index"); + base_vertex = DeclareInputBuiltIn(spv::BuiltIn::BaseVertex, t_in_int, "base_vertex"); + base_instance = DeclareInputBuiltIn(spv::BuiltIn::BaseInstance, t_in_int, "base_instance"); } void DeclareTessControl() { @@ -542,11 +551,10 @@ private: return; } - for (u32 rt = 0; rt < static_cast<u32>(frag_colors.size()); ++rt) { - if (!specialization.enabled_rendertargets[rt]) { + for (u32 rt = 0; rt < static_cast<u32>(std::size(frag_colors)); ++rt) { + if (!IsRenderTargetEnabled(rt)) { continue; } - const Id id = AddGlobalVariable(OpVariable(t_out_float4, spv::StorageClass::Output)); Name(id, fmt::format("frag_color{}", rt)); Decorate(id, spv::Decoration::Location, rt); @@ -587,6 +595,15 @@ private: } } + void DeclareCustomVariables() { + const u32 num_custom_variables = ir.GetNumCustomVariables(); + for (u32 i = 0; i < num_custom_variables; ++i) { + const Id id = OpVariable(t_prv_float, spv::StorageClass::Private, v_float_zero); + Name(id, fmt::format("custom_var_{}", i)); + custom_variables.emplace(i, AddGlobalVariable(id)); + } + } + void DeclarePredicates() { for (const auto pred : ir.GetPredicates()) { const Id id = OpVariable(t_prv_bool, spv::StorageClass::Private, v_false); @@ -852,6 
+869,15 @@ private: return binding; } + bool IsRenderTargetEnabled(u32 rt) const { + for (u32 component = 0; component < 4; ++component) { + if (header.ps.IsColorComponentOutputEnabled(rt, component)) { + return true; + } + } + return false; + } + bool IsInputAttributeArray() const { return stage == ShaderType::TesselationControl || stage == ShaderType::TesselationEval || stage == ShaderType::Geometry; @@ -974,6 +1000,11 @@ private: return {OpLoad(t_float, registers.at(index)), Type::Float}; } + if (const auto cv = std::get_if<CustomVarNode>(&*node)) { + const u32 index = cv->GetIndex(); + return {OpLoad(t_float, custom_variables.at(index)), Type::Float}; + } + if (const auto immediate = std::get_if<ImmediateNode>(&*node)) { return {Constant(t_uint, immediate->GetValue()), Type::Uint}; } @@ -1045,9 +1076,12 @@ private: return {OpLoad(t_float, AccessElement(t_in_float, tess_coord, element)), Type::Float}; case 2: - return {OpLoad(t_uint, instance_index), Type::Uint}; + return { + OpISub(t_int, OpLoad(t_int, instance_index), OpLoad(t_int, base_instance)), + Type::Int}; case 3: - return {OpLoad(t_uint, vertex_index), Type::Uint}; + return {OpISub(t_int, OpLoad(t_int, vertex_index), OpLoad(t_int, base_vertex)), + Type::Int}; } UNIMPLEMENTED_MSG("Unmanaged TessCoordInstanceIDVertexID element={}", element); return {Constant(t_uint, 0U), Type::Uint}; @@ -1115,15 +1149,7 @@ private: } if (const auto gmem = std::get_if<GmemNode>(&*node)) { - const Id gmem_buffer = global_buffers.at(gmem->GetDescriptor()); - const Id real = AsUint(Visit(gmem->GetRealAddress())); - const Id base = AsUint(Visit(gmem->GetBaseAddress())); - - Id offset = OpISub(t_uint, real, base); - offset = OpUDiv(t_uint, offset, Constant(t_uint, 4U)); - return {OpLoad(t_float, - OpAccessChain(t_gmem_float, gmem_buffer, Constant(t_uint, 0U), offset)), - Type::Float}; + return {OpLoad(t_uint, GetGlobalMemoryPointer(*gmem)), Type::Uint}; } if (const auto lmem = std::get_if<LmemNode>(&*node)) { @@ -1134,10 +1160,7 @@ private: } if (const auto smem = std::get_if<SmemNode>(&*node)) { - Id address = AsUint(Visit(smem->GetAddress())); - address = OpShiftRightLogical(t_uint, address, Constant(t_uint, 2U)); - const Id pointer = OpAccessChain(t_smem_uint, shared_memory, address); - return {OpLoad(t_uint, pointer), Type::Uint}; + return {OpLoad(t_uint, GetSharedMemoryPointer(*smem)), Type::Uint}; } if (const auto internal_flag = std::get_if<InternalFlagNode>(&*node)) { @@ -1331,20 +1354,13 @@ private: target = {OpAccessChain(t_prv_float, local_memory, address), Type::Float}; } else if (const auto smem = std::get_if<SmemNode>(&*dest)) { - ASSERT(stage == ShaderType::Compute); - Id address = AsUint(Visit(smem->GetAddress())); - address = OpShiftRightLogical(t_uint, address, Constant(t_uint, 2U)); - target = {OpAccessChain(t_smem_uint, shared_memory, address), Type::Uint}; + target = {GetSharedMemoryPointer(*smem), Type::Uint}; } else if (const auto gmem = std::get_if<GmemNode>(&*dest)) { - const Id real = AsUint(Visit(gmem->GetRealAddress())); - const Id base = AsUint(Visit(gmem->GetBaseAddress())); - const Id diff = OpISub(t_uint, real, base); - const Id offset = OpShiftRightLogical(t_uint, diff, Constant(t_uint, 2)); + target = {GetGlobalMemoryPointer(*gmem), Type::Uint}; - const Id gmem_buffer = global_buffers.at(gmem->GetDescriptor()); - target = {OpAccessChain(t_gmem_float, gmem_buffer, Constant(t_uint, 0), offset), - Type::Float}; + } else if (const auto cv = std::get_if<CustomVarNode>(&*dest)) { + target = 
{custom_variables.at(cv->GetIndex()), Type::Float}; } else { UNIMPLEMENTED(); @@ -1743,8 +1759,16 @@ private: } Expression ImageLoad(Operation operation) { - UNIMPLEMENTED(); - return {}; + if (!device.IsShaderStorageImageReadWithoutFormatSupported()) { + return {v_float_zero, Type::Float}; + } + + const auto& meta{std::get<MetaImage>(operation.GetMeta())}; + + const Id coords = GetCoordinates(operation, Type::Int); + const Id texel = OpImageRead(t_uint4, GetImage(operation), coords); + + return {OpCompositeExtract(t_uint, texel, meta.element), Type::Uint}; } Expression ImageStore(Operation operation) { @@ -1796,11 +1820,16 @@ private: return {}; } - Expression UAtomicAdd(Operation operation) { - const auto& smem = std::get<SmemNode>(*operation[0]); - Id address = AsUint(Visit(smem.GetAddress())); - address = OpShiftRightLogical(t_uint, address, Constant(t_uint, 2U)); - const Id pointer = OpAccessChain(t_smem_uint, shared_memory, address); + Expression AtomicAdd(Operation operation) { + Id pointer; + if (const auto smem = std::get_if<SmemNode>(&*operation[0])) { + pointer = GetSharedMemoryPointer(*smem); + } else if (const auto gmem = std::get_if<GmemNode>(&*operation[0])) { + pointer = GetGlobalMemoryPointer(*gmem); + } else { + UNREACHABLE(); + return {Constant(t_uint, 0), Type::Uint}; + } const Id scope = Constant(t_uint, static_cast<u32>(spv::Scope::Device)); const Id semantics = Constant(t_uint, 0U); @@ -1889,19 +1918,14 @@ private: // rendertargets/components are skipped in the register assignment. u32 current_reg = 0; for (u32 rt = 0; rt < Maxwell::NumRenderTargets; ++rt) { - if (!specialization.enabled_rendertargets[rt]) { - // Skip rendertargets that are not enabled - continue; - } // TODO(Subv): Figure out how dual-source blending is configured in the Switch. for (u32 component = 0; component < 4; ++component) { - const Id pointer = AccessElement(t_out_float, frag_colors.at(rt), component); - if (header.ps.IsColorComponentOutputEnabled(rt, component)) { - OpStore(pointer, SafeGetRegister(current_reg)); - ++current_reg; - } else { - OpStore(pointer, component == 3 ? 
v_float_one : v_float_zero); + if (!header.ps.IsColorComponentOutputEnabled(rt, component)) { + continue; } + const Id pointer = AccessElement(t_out_float, frag_colors[rt], component); + OpStore(pointer, SafeGetRegister(current_reg)); + ++current_reg; } } if (header.ps.omap.depth) { @@ -2240,6 +2264,22 @@ private: return {}; } + Id GetGlobalMemoryPointer(const GmemNode& gmem) { + const Id real = AsUint(Visit(gmem.GetRealAddress())); + const Id base = AsUint(Visit(gmem.GetBaseAddress())); + const Id diff = OpISub(t_uint, real, base); + const Id offset = OpShiftRightLogical(t_uint, diff, Constant(t_uint, 2)); + const Id buffer = global_buffers.at(gmem.GetDescriptor()); + return OpAccessChain(t_gmem_uint, buffer, Constant(t_uint, 0), offset); + } + + Id GetSharedMemoryPointer(const SmemNode& smem) { + ASSERT(stage == ShaderType::Compute); + Id address = AsUint(Visit(smem.GetAddress())); + address = OpShiftRightLogical(t_uint, address, Constant(t_uint, 2U)); + return OpAccessChain(t_smem_uint, shared_memory, address); + } + static constexpr std::array operation_decompilers = { &SPIRVDecompiler::Assign, @@ -2386,7 +2426,7 @@ private: &SPIRVDecompiler::AtomicImageXor, &SPIRVDecompiler::AtomicImageExchange, - &SPIRVDecompiler::UAtomicAdd, + &SPIRVDecompiler::AtomicAdd, &SPIRVDecompiler::Branch, &SPIRVDecompiler::BranchIndirect, @@ -2482,9 +2522,9 @@ private: Id t_smem_uint{}; - const Id t_gmem_float = TypePointer(spv::StorageClass::StorageBuffer, t_float); + const Id t_gmem_uint = TypePointer(spv::StorageClass::StorageBuffer, t_uint); const Id t_gmem_array = - Name(Decorate(TypeRuntimeArray(t_float), spv::Decoration::ArrayStride, 4U), "GmemArray"); + Name(Decorate(TypeRuntimeArray(t_uint), spv::Decoration::ArrayStride, 4U), "GmemArray"); const Id t_gmem_struct = MemberDecorate( Decorate(TypeStruct(t_gmem_array), spv::Decoration::Block), 0, spv::Decoration::Offset, 0); const Id t_gmem_ssbo = TypePointer(spv::StorageClass::StorageBuffer, t_gmem_struct); @@ -2505,6 +2545,7 @@ private: Id out_vertex{}; Id in_vertex{}; std::map<u32, Id> registers; + std::map<u32, Id> custom_variables; std::map<Tegra::Shader::Pred, Id> predicates; std::map<u32, Id> flow_variables; Id local_memory{}; @@ -2520,6 +2561,8 @@ private: Id instance_index{}; Id vertex_index{}; + Id base_instance{}; + Id base_vertex{}; std::array<Id, Maxwell::NumRenderTargets> frag_colors{}; Id frag_depth{}; Id frag_coord{}; diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.h b/src/video_core/renderer_vulkan/vk_shader_decompiler.h index 10794be1c..f5dc14d9e 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.h +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.h @@ -102,9 +102,6 @@ struct Specialization final { Maxwell::TessellationSpacing spacing{}; bool clockwise{}; } tessellation; - - // Fragment specific - std::bitset<8> enabled_rendertargets; }; // Old gcc versions don't consider this trivially copyable. 
// static_assert(std::is_trivially_copyable_v<Specialization>); diff --git a/src/video_core/shader/ast.h b/src/video_core/shader/ast.h index a2f0044ba..cca13bcde 100644 --- a/src/video_core/shader/ast.h +++ b/src/video_core/shader/ast.h @@ -65,8 +65,8 @@ public: void DetachSegment(ASTNode start, ASTNode end); void Remove(ASTNode node); - ASTNode first{}; - ASTNode last{}; + ASTNode first; + ASTNode last; }; class ASTProgram { @@ -299,9 +299,9 @@ private: friend class ASTZipper; ASTData data; - ASTNode parent{}; - ASTNode next{}; - ASTNode previous{}; + ASTNode parent; + ASTNode next; + ASTNode previous; ASTZipper* manager{}; }; diff --git a/src/video_core/shader/const_buffer_locker.cpp b/src/video_core/shader/const_buffer_locker.cpp index a4a0319eb..0638be8cb 100644 --- a/src/video_core/shader/const_buffer_locker.cpp +++ b/src/video_core/shader/const_buffer_locker.cpp @@ -66,6 +66,18 @@ std::optional<Tegra::Engines::SamplerDescriptor> ConstBufferLocker::ObtainBindle return value; } +std::optional<u32> ConstBufferLocker::ObtainBoundBuffer() { + if (bound_buffer_saved) { + return bound_buffer; + } + if (!engine) { + return std::nullopt; + } + bound_buffer_saved = true; + bound_buffer = engine->GetBoundBuffer(); + return bound_buffer; +} + void ConstBufferLocker::InsertKey(u32 buffer, u32 offset, u32 value) { keys.insert_or_assign({buffer, offset}, value); } @@ -78,6 +90,11 @@ void ConstBufferLocker::InsertBindlessSampler(u32 buffer, u32 offset, SamplerDes bindless_samplers.insert_or_assign({buffer, offset}, sampler); } +void ConstBufferLocker::SetBoundBuffer(u32 buffer) { + bound_buffer_saved = true; + bound_buffer = buffer; +} + bool ConstBufferLocker::IsConsistent() const { if (!engine) { return false; diff --git a/src/video_core/shader/const_buffer_locker.h b/src/video_core/shader/const_buffer_locker.h index d32e2d657..d3ea11087 100644 --- a/src/video_core/shader/const_buffer_locker.h +++ b/src/video_core/shader/const_buffer_locker.h @@ -10,6 +10,7 @@ #include "common/hash.h" #include "video_core/engines/const_buffer_engine_interface.h" #include "video_core/engines/shader_type.h" +#include "video_core/guest_driver.h" namespace VideoCommon::Shader { @@ -40,6 +41,8 @@ public: std::optional<Tegra::Engines::SamplerDescriptor> ObtainBindlessSampler(u32 buffer, u32 offset); + std::optional<u32> ObtainBoundBuffer(); + /// Inserts a key. void InsertKey(u32 buffer, u32 offset, u32 value); @@ -49,6 +52,9 @@ public: /// Inserts a bindless sampler key. void InsertBindlessSampler(u32 buffer, u32 offset, Tegra::Engines::SamplerDescriptor sampler); + /// Set the bound buffer for this locker. + void SetBoundBuffer(u32 buffer); + /// Checks keys and samplers against engine's current const buffers. Returns true if they are /// the same value, false otherwise; bool IsConsistent() const; @@ -71,12 +77,27 @@ public: return bindless_samplers; } + /// Gets bound buffer used on this shader + u32 GetBoundBuffer() const { + return bound_buffer; + } + + /// Obtains access to the guest driver's profile. 
+ VideoCore::GuestDriverProfile* AccessGuestDriverProfile() const { + if (engine) { + return &engine->AccessGuestDriverProfile(); + } + return nullptr; + } + private: const Tegra::Engines::ShaderType stage; Tegra::Engines::ConstBufferEngineInterface* engine = nullptr; KeyMap keys; BoundSamplerMap bound_samplers; BindlessSamplerMap bindless_samplers; + bool bound_buffer_saved{}; + u32 bound_buffer{}; }; } // namespace VideoCommon::Shader diff --git a/src/video_core/shader/decode.cpp b/src/video_core/shader/decode.cpp index 22c3e5120..6b697ed5d 100644 --- a/src/video_core/shader/decode.cpp +++ b/src/video_core/shader/decode.cpp @@ -3,6 +3,7 @@ // Refer to the license.txt file included. #include <cstring> +#include <limits> #include <set> #include <fmt/format.h> @@ -33,6 +34,52 @@ constexpr bool IsSchedInstruction(u32 offset, u32 main_offset) { return (absolute_offset % SchedPeriod) == 0; } +void DeduceTextureHandlerSize(VideoCore::GuestDriverProfile* gpu_driver, + const std::list<Sampler>& used_samplers) { + if (gpu_driver == nullptr) { + LOG_CRITICAL(HW_GPU, "GPU driver profile has not been created yet"); + return; + } + if (gpu_driver->TextureHandlerSizeKnown() || used_samplers.size() <= 1) { + return; + } + u32 count{}; + std::vector<u32> bound_offsets; + for (const auto& sampler : used_samplers) { + if (sampler.IsBindless()) { + continue; + } + ++count; + bound_offsets.emplace_back(sampler.GetOffset()); + } + if (count > 1) { + gpu_driver->DeduceTextureHandlerSize(std::move(bound_offsets)); + } +} + +std::optional<u32> TryDeduceSamplerSize(const Sampler& sampler_to_deduce, + VideoCore::GuestDriverProfile* gpu_driver, + const std::list<Sampler>& used_samplers) { + if (gpu_driver == nullptr) { + LOG_CRITICAL(HW_GPU, "GPU Driver profile has not been created yet"); + return std::nullopt; + } + const u32 base_offset = sampler_to_deduce.GetOffset(); + u32 max_offset{std::numeric_limits<u32>::max()}; + for (const auto& sampler : used_samplers) { + if (sampler.IsBindless()) { + continue; + } + if (sampler.GetOffset() > base_offset) { + max_offset = std::min(sampler.GetOffset(), max_offset); + } + } + if (max_offset == std::numeric_limits<u32>::max()) { + return std::nullopt; + } + return ((max_offset - base_offset) * 4) / gpu_driver->GetTextureHandlerSize(); +} + } // Anonymous namespace class ASTDecoder { @@ -315,4 +362,25 @@ u32 ShaderIR::DecodeInstr(NodeBlock& bb, u32 pc) { return pc + 1; } +void ShaderIR::PostDecode() { + // Deduce texture handler size if needed + auto gpu_driver = locker.AccessGuestDriverProfile(); + DeduceTextureHandlerSize(gpu_driver, used_samplers); + // Deduce Indexed Samplers + if (!uses_indexed_samplers) { + return; + } + for (auto& sampler : used_samplers) { + if (!sampler.IsIndexed()) { + continue; + } + if (const auto size = TryDeduceSamplerSize(sampler, gpu_driver, used_samplers)) { + sampler.SetSize(*size); + } else { + LOG_CRITICAL(HW_GPU, "Failed to deduce size of indexed sampler"); + sampler.SetSize(1); + } + } +} + } // namespace VideoCommon::Shader diff --git a/src/video_core/shader/decode/arithmetic.cpp b/src/video_core/shader/decode/arithmetic.cpp index fcedd2af6..90240c765 100644 --- a/src/video_core/shader/decode/arithmetic.cpp +++ b/src/video_core/shader/decode/arithmetic.cpp @@ -21,7 +21,7 @@ u32 ShaderIR::DecodeArithmetic(NodeBlock& bb, u32 pc) { Node op_a = GetRegister(instr.gpr8); - Node op_b = [&]() -> Node { + Node op_b = [&] { if (instr.is_b_imm) { return GetImmediate19(instr); } else if (instr.is_b_gpr) { @@ -141,6 +141,15 @@ u32 
ShaderIR::DecodeArithmetic(NodeBlock& bb, u32 pc) { SetRegister(bb, instr.gpr0, value); break; } + case OpCode::Id::FCMP_R: { + UNIMPLEMENTED_IF(instr.fcmp.ftz == 0); + Node op_c = GetRegister(instr.gpr39); + Node comp = GetPredicateComparisonFloat(instr.fcmp.cond, std::move(op_c), Immediate(0.0f)); + SetRegister( + bb, instr.gpr0, + Operation(OperationCode::Select, std::move(comp), std::move(op_a), std::move(op_b))); + break; + } case OpCode::Id::RRO_C: case OpCode::Id::RRO_R: case OpCode::Id::RRO_IMM: { diff --git a/src/video_core/shader/decode/arithmetic_integer.cpp b/src/video_core/shader/decode/arithmetic_integer.cpp index 371fae127..21366869d 100644 --- a/src/video_core/shader/decode/arithmetic_integer.cpp +++ b/src/video_core/shader/decode/arithmetic_integer.cpp @@ -166,13 +166,13 @@ u32 ShaderIR::DecodeArithmeticInteger(NodeBlock& bb, u32 pc) { const auto [op_rhs, test] = [&]() -> std::pair<Node, Node> { switch (opcode->get().GetId()) { case OpCode::Id::ICMP_CR: - return {GetConstBuffer(instr.cbuf34.index, instr.cbuf34.offset), + return {GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()), GetRegister(instr.gpr39)}; case OpCode::Id::ICMP_R: return {GetRegister(instr.gpr20), GetRegister(instr.gpr39)}; case OpCode::Id::ICMP_RC: return {GetRegister(instr.gpr39), - GetConstBuffer(instr.cbuf34.index, instr.cbuf34.offset)}; + GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset())}; case OpCode::Id::ICMP_IMM: return {Immediate(instr.alu.GetSignedImm20_20()), GetRegister(instr.gpr39)}; default: @@ -297,7 +297,7 @@ void ShaderIR::WriteLop3Instruction(NodeBlock& bb, Register dest, Node op_a, Nod const Node one = Immediate(1); const Node two = Immediate(2); - Node value{}; + Node value; for (u32 i = 0; i < lop_iterations; ++i) { const Node shift_amount = Immediate(i); diff --git a/src/video_core/shader/decode/bfi.cpp b/src/video_core/shader/decode/bfi.cpp index 8be1119df..70d1c055b 100644 --- a/src/video_core/shader/decode/bfi.cpp +++ b/src/video_core/shader/decode/bfi.cpp @@ -17,10 +17,13 @@ u32 ShaderIR::DecodeBfi(NodeBlock& bb, u32 pc) { const Instruction instr = {program_code[pc]}; const auto opcode = OpCode::Decode(instr); - const auto [base, packed_shift] = [&]() -> std::tuple<Node, Node> { + const auto [packed_shift, base] = [&]() -> std::pair<Node, Node> { switch (opcode->get().GetId()) { + case OpCode::Id::BFI_RC: + return {GetRegister(instr.gpr39), + GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset())}; case OpCode::Id::BFI_IMM_R: - return {GetRegister(instr.gpr39), Immediate(instr.alu.GetSignedImm20_20())}; + return {Immediate(instr.alu.GetSignedImm20_20()), GetRegister(instr.gpr39)}; default: UNREACHABLE(); return {Immediate(0), Immediate(0)}; diff --git a/src/video_core/shader/decode/conversion.cpp b/src/video_core/shader/decode/conversion.cpp index 0eeb75559..6ead42070 100644 --- a/src/video_core/shader/decode/conversion.cpp +++ b/src/video_core/shader/decode/conversion.cpp @@ -83,14 +83,14 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) { const bool input_signed = instr.conversion.is_input_signed; - if (instr.conversion.src_size == Register::Size::Byte) { - const u32 offset = static_cast<u32>(instr.conversion.int_src.selector) * 8; - if (offset > 0) { - value = SignedOperation(OperationCode::ILogicalShiftRight, input_signed, - std::move(value), Immediate(offset)); + if (const u32 offset = static_cast<u32>(instr.conversion.int_src.selector); offset > 0) { + ASSERT(instr.conversion.src_size == Register::Size::Byte || + instr.conversion.src_size == 
Register::Size::Short); + if (instr.conversion.src_size == Register::Size::Short) { + ASSERT(offset == 0 || offset == 2); } - } else { - UNIMPLEMENTED_IF(instr.conversion.int_src.selector != 0); + value = SignedOperation(OperationCode::ILogicalShiftRight, input_signed, + std::move(value), Immediate(offset * 8)); } value = ConvertIntegerSize(value, instr.conversion.src_size, input_signed); diff --git a/src/video_core/shader/decode/memory.cpp b/src/video_core/shader/decode/memory.cpp index 7591a715f..b5fbc4d58 100644 --- a/src/video_core/shader/decode/memory.cpp +++ b/src/video_core/shader/decode/memory.cpp @@ -19,9 +19,12 @@ namespace VideoCommon::Shader { using Tegra::Shader::AtomicOp; using Tegra::Shader::AtomicType; using Tegra::Shader::Attribute; +using Tegra::Shader::GlobalAtomicOp; +using Tegra::Shader::GlobalAtomicType; using Tegra::Shader::Instruction; using Tegra::Shader::OpCode; using Tegra::Shader::Register; +using Tegra::Shader::StoreType; namespace { @@ -61,6 +64,27 @@ u32 GetMemorySize(Tegra::Shader::UniformType uniform_type) { } } +Node ExtractUnaligned(Node value, Node address, u32 mask, u32 size) { + Node offset = Operation(OperationCode::UBitwiseAnd, address, Immediate(mask)); + offset = Operation(OperationCode::ULogicalShiftLeft, std::move(offset), Immediate(3)); + return Operation(OperationCode::UBitfieldExtract, std::move(value), std::move(offset), + Immediate(size)); +} + +Node InsertUnaligned(Node dest, Node value, Node address, u32 mask, u32 size) { + Node offset = Operation(OperationCode::UBitwiseAnd, std::move(address), Immediate(mask)); + offset = Operation(OperationCode::ULogicalShiftLeft, std::move(offset), Immediate(3)); + return Operation(OperationCode::UBitfieldInsert, std::move(dest), std::move(value), + std::move(offset), Immediate(size)); +} + +Node Sign16Extend(Node value) { + Node sign = Operation(OperationCode::UBitwiseAnd, value, Immediate(1U << 15)); + Node is_sign = Operation(OperationCode::LogicalUEqual, std::move(sign), Immediate(1U << 15)); + Node extend = Operation(OperationCode::Select, is_sign, Immediate(0xFFFF0000), Immediate(0)); + return Operation(OperationCode::UBitwiseOr, std::move(value), std::move(extend)); +} + } // Anonymous namespace u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { @@ -136,26 +160,31 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { LOG_DEBUG(HW_GPU, "LD_L cache management mode: {}", static_cast<u64>(instr.ld_l.unknown)); [[fallthrough]]; case OpCode::Id::LD_S: { - const auto GetMemory = [&](s32 offset) { + const auto GetAddress = [&](s32 offset) { ASSERT(offset % 4 == 0); const Node immediate_offset = Immediate(static_cast<s32>(instr.smem_imm) + offset); - const Node address = Operation(OperationCode::IAdd, NO_PRECISE, GetRegister(instr.gpr8), - immediate_offset); - return opcode->get().GetId() == OpCode::Id::LD_S ? GetSharedMemory(address) - : GetLocalMemory(address); + return Operation(OperationCode::IAdd, GetRegister(instr.gpr8), immediate_offset); + }; + const auto GetMemory = [&](s32 offset) { + return opcode->get().GetId() == OpCode::Id::LD_S ? 
GetSharedMemory(GetAddress(offset)) + : GetLocalMemory(GetAddress(offset)); }; switch (instr.ldst_sl.type.Value()) { - case Tegra::Shader::StoreType::Bits32: - case Tegra::Shader::StoreType::Bits64: - case Tegra::Shader::StoreType::Bits128: { - const u32 count = [&]() { + case StoreType::Signed16: + SetRegister(bb, instr.gpr0, + Sign16Extend(ExtractUnaligned(GetMemory(0), GetAddress(0), 0b10, 16))); + break; + case StoreType::Bits32: + case StoreType::Bits64: + case StoreType::Bits128: { + const u32 count = [&] { switch (instr.ldst_sl.type.Value()) { - case Tegra::Shader::StoreType::Bits32: + case StoreType::Bits32: return 1; - case Tegra::Shader::StoreType::Bits64: + case StoreType::Bits64: return 2; - case Tegra::Shader::StoreType::Bits128: + case StoreType::Bits128: return 4; default: UNREACHABLE(); @@ -212,12 +241,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { // To handle unaligned loads get the bytes used to dereference global memory and extract // those bytes from the loaded u32. if (IsUnaligned(type)) { - Node mask = Immediate(GetUnalignedMask(type)); - Node offset = Operation(OperationCode::UBitwiseAnd, real_address, std::move(mask)); - offset = Operation(OperationCode::ULogicalShiftLeft, offset, Immediate(3)); - - gmem = Operation(OperationCode::UBitfieldExtract, std::move(gmem), - std::move(offset), Immediate(size)); + gmem = ExtractUnaligned(gmem, real_address, GetUnalignedMask(type), size); } SetTemporary(bb, i, gmem); @@ -269,21 +293,28 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { return Operation(OperationCode::IAdd, NO_PRECISE, GetRegister(instr.gpr8), immediate); }; - const auto set_memory = opcode->get().GetId() == OpCode::Id::ST_L - ? &ShaderIR::SetLocalMemory - : &ShaderIR::SetSharedMemory; + const bool is_local = opcode->get().GetId() == OpCode::Id::ST_L; + const auto set_memory = is_local ? &ShaderIR::SetLocalMemory : &ShaderIR::SetSharedMemory; + const auto get_memory = is_local ? 
&ShaderIR::GetLocalMemory : &ShaderIR::GetSharedMemory; switch (instr.ldst_sl.type.Value()) { - case Tegra::Shader::StoreType::Bits128: + case StoreType::Bits128: (this->*set_memory)(bb, GetAddress(12), GetRegister(instr.gpr0.Value() + 3)); (this->*set_memory)(bb, GetAddress(8), GetRegister(instr.gpr0.Value() + 2)); [[fallthrough]]; - case Tegra::Shader::StoreType::Bits64: + case StoreType::Bits64: (this->*set_memory)(bb, GetAddress(4), GetRegister(instr.gpr0.Value() + 1)); [[fallthrough]]; - case Tegra::Shader::StoreType::Bits32: + case StoreType::Bits32: (this->*set_memory)(bb, GetAddress(0), GetRegister(instr.gpr0)); break; + case StoreType::Signed16: { + Node address = GetAddress(0); + Node memory = (this->*get_memory)(address); + (this->*set_memory)( + bb, address, InsertUnaligned(memory, GetRegister(instr.gpr0), address, 0b10, 16)); + break; + } default: UNIMPLEMENTED_MSG("{} unhandled type: {}", opcode->get().GetName(), static_cast<u32>(instr.ldst_sl.type.Value())); @@ -323,18 +354,32 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { Node value = GetRegister(instr.gpr0.Value() + i); if (IsUnaligned(type)) { - Node mask = Immediate(GetUnalignedMask(type)); - Node offset = Operation(OperationCode::UBitwiseAnd, real_address, std::move(mask)); - offset = Operation(OperationCode::ULogicalShiftLeft, offset, Immediate(3)); - - value = Operation(OperationCode::UBitfieldInsert, gmem, std::move(value), offset, - Immediate(size)); + const u32 mask = GetUnalignedMask(type); + value = InsertUnaligned(gmem, std::move(value), real_address, mask, size); } bb.push_back(Operation(OperationCode::Assign, gmem, value)); } break; } + case OpCode::Id::ATOM: { + UNIMPLEMENTED_IF_MSG(instr.atom.operation != GlobalAtomicOp::Add, "operation={}", + static_cast<int>(instr.atom.operation.Value())); + UNIMPLEMENTED_IF_MSG(instr.atom.type != GlobalAtomicType::S32, "type={}", + static_cast<int>(instr.atom.type.Value())); + + const auto [real_address, base_address, descriptor] = + TrackGlobalMemory(bb, instr, true, true); + if (!real_address || !base_address) { + // Tracking failed, skip atomic. 
+ break; + } + + Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor); + Node value = Operation(OperationCode::AtomicAdd, std::move(gmem), GetRegister(instr.gpr20)); + SetRegister(bb, instr.gpr0, std::move(value)); + break; + } case OpCode::Id::ATOMS: { UNIMPLEMENTED_IF_MSG(instr.atoms.operation != AtomicOp::Add, "operation={}", static_cast<int>(instr.atoms.operation.Value())); @@ -348,7 +393,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { Node memory = GetSharedMemory(std::move(address)); Node data = GetRegister(instr.gpr20); - Node value = Operation(OperationCode::UAtomicAdd, std::move(memory), std::move(data)); + Node value = Operation(OperationCode::AtomicAdd, std::move(memory), std::move(data)); SetRegister(bb, instr.gpr0, std::move(value)); break; } diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp index 7321698b2..4944e9d69 100644 --- a/src/video_core/shader/decode/other.cpp +++ b/src/video_core/shader/decode/other.cpp @@ -69,13 +69,16 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { case OpCode::Id::MOV_SYS: { const Node value = [this, instr] { switch (instr.sys20) { + case SystemVariable::LaneId: + LOG_WARNING(HW_GPU, "MOV_SYS instruction with LaneId is incomplete"); + return Immediate(0U); case SystemVariable::InvocationId: return Operation(OperationCode::InvocationId); case SystemVariable::Ydirection: return Operation(OperationCode::YNegate); case SystemVariable::InvocationInfo: LOG_WARNING(HW_GPU, "MOV_SYS instruction with InvocationInfo is incomplete"); - return Immediate(0u); + return Immediate(0U); case SystemVariable::Tid: { Node value = Immediate(0); value = BitfieldInsert(value, Operation(OperationCode::LocalInvocationIdX), 0, 9); @@ -188,7 +191,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { UNIMPLEMENTED_IF_MSG(cc != Tegra::Shader::ConditionCode::T, "SYNC condition code used: {}", static_cast<u32>(cc)); - if (disable_flow_stack) { + if (decompiled) { break; } @@ -200,7 +203,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { const Tegra::Shader::ConditionCode cc = instr.flow_condition_code; UNIMPLEMENTED_IF_MSG(cc != Tegra::Shader::ConditionCode::T, "BRK condition code used: {}", static_cast<u32>(cc)); - if (disable_flow_stack) { + if (decompiled) { break; } diff --git a/src/video_core/shader/decode/shift.cpp b/src/video_core/shader/decode/shift.cpp index d419e9c45..3b391d3e6 100644 --- a/src/video_core/shader/decode/shift.cpp +++ b/src/video_core/shader/decode/shift.cpp @@ -10,8 +10,80 @@ namespace VideoCommon::Shader { +using std::move; using Tegra::Shader::Instruction; using Tegra::Shader::OpCode; +using Tegra::Shader::ShfType; +using Tegra::Shader::ShfXmode; + +namespace { + +Node IsFull(Node shift) { + return Operation(OperationCode::LogicalIEqual, move(shift), Immediate(32)); +} + +Node Shift(OperationCode opcode, Node value, Node shift) { + Node is_full = Operation(OperationCode::LogicalIEqual, shift, Immediate(32)); + Node shifted = Operation(opcode, move(value), shift); + return Operation(OperationCode::Select, IsFull(move(shift)), Immediate(0), move(shifted)); +} + +Node ClampShift(Node shift, s32 size = 32) { + shift = Operation(OperationCode::IMax, move(shift), Immediate(0)); + return Operation(OperationCode::IMin, move(shift), Immediate(size)); +} + +Node WrapShift(Node shift, s32 size = 32) { + return Operation(OperationCode::UBitwiseAnd, move(shift), Immediate(size - 1)); +} + +Node ShiftRight(Node low, Node high, Node shift, Node low_shift, ShfType type) { + // These 
values are used when the shift value is less than 32 + Node less_low = Shift(OperationCode::ILogicalShiftRight, low, shift); + Node less_high = Shift(OperationCode::ILogicalShiftLeft, high, low_shift); + Node less = Operation(OperationCode::IBitwiseOr, move(less_high), move(less_low)); + + if (type == ShfType::Bits32) { + // On 32 bit shifts we are either full (shifting 32) or shifting less than 32 bits + return Operation(OperationCode::Select, IsFull(move(shift)), move(high), move(less)); + } + + // And these when it's larger than or 32 + const bool is_signed = type == ShfType::S64; + const auto opcode = SignedToUnsignedCode(OperationCode::IArithmeticShiftRight, is_signed); + Node reduced = Operation(OperationCode::IAdd, shift, Immediate(-32)); + Node greater = Shift(opcode, high, move(reduced)); + + Node is_less = Operation(OperationCode::LogicalILessThan, shift, Immediate(32)); + Node is_zero = Operation(OperationCode::LogicalIEqual, move(shift), Immediate(0)); + + Node value = Operation(OperationCode::Select, move(is_less), move(less), move(greater)); + return Operation(OperationCode::Select, move(is_zero), move(high), move(value)); +} + +Node ShiftLeft(Node low, Node high, Node shift, Node low_shift, ShfType type) { + // These values are used when the shift value is less than 32 + Node less_low = Operation(OperationCode::ILogicalShiftRight, low, low_shift); + Node less_high = Operation(OperationCode::ILogicalShiftLeft, high, shift); + Node less = Operation(OperationCode::IBitwiseOr, move(less_low), move(less_high)); + + if (type == ShfType::Bits32) { + // On 32 bit shifts we are either full (shifting 32) or shifting less than 32 bits + return Operation(OperationCode::Select, IsFull(move(shift)), move(low), move(less)); + } + + // And these when it's larger than or 32 + Node reduced = Operation(OperationCode::IAdd, shift, Immediate(-32)); + Node greater = Shift(OperationCode::ILogicalShiftLeft, move(low), move(reduced)); + + Node is_less = Operation(OperationCode::LogicalILessThan, shift, Immediate(32)); + Node is_zero = Operation(OperationCode::LogicalIEqual, move(shift), Immediate(0)); + + Node value = Operation(OperationCode::Select, move(is_less), move(less), move(greater)); + return Operation(OperationCode::Select, move(is_zero), move(high), move(value)); +} + +} // Anonymous namespace u32 ShaderIR::DecodeShift(NodeBlock& bb, u32 pc) { const Instruction instr = {program_code[pc]}; @@ -28,29 +100,48 @@ u32 ShaderIR::DecodeShift(NodeBlock& bb, u32 pc) { } }(); - switch (opcode->get().GetId()) { + switch (const auto opid = opcode->get().GetId(); opid) { case OpCode::Id::SHR_C: case OpCode::Id::SHR_R: case OpCode::Id::SHR_IMM: { - if (instr.shr.wrap) { - op_b = Operation(OperationCode::UBitwiseAnd, std::move(op_b), Immediate(0x1f)); - } else { - op_b = Operation(OperationCode::IMax, std::move(op_b), Immediate(0)); - op_b = Operation(OperationCode::IMin, std::move(op_b), Immediate(31)); - } + op_b = instr.shr.wrap ? 
WrapShift(move(op_b)) : ClampShift(move(op_b)); Node value = SignedOperation(OperationCode::IArithmeticShiftRight, instr.shift.is_signed, - std::move(op_a), std::move(op_b)); + move(op_a), move(op_b)); SetInternalFlagsFromInteger(bb, value, instr.generates_cc); - SetRegister(bb, instr.gpr0, std::move(value)); + SetRegister(bb, instr.gpr0, move(value)); break; } case OpCode::Id::SHL_C: case OpCode::Id::SHL_R: case OpCode::Id::SHL_IMM: { - const Node value = Operation(OperationCode::ILogicalShiftLeft, op_a, op_b); + Node value = Operation(OperationCode::ILogicalShiftLeft, op_a, op_b); SetInternalFlagsFromInteger(bb, value, instr.generates_cc); - SetRegister(bb, instr.gpr0, value); + SetRegister(bb, instr.gpr0, move(value)); + break; + } + case OpCode::Id::SHF_RIGHT_R: + case OpCode::Id::SHF_RIGHT_IMM: + case OpCode::Id::SHF_LEFT_R: + case OpCode::Id::SHF_LEFT_IMM: { + UNIMPLEMENTED_IF(instr.generates_cc); + UNIMPLEMENTED_IF_MSG(instr.shf.xmode != ShfXmode::None, "xmode={}", + static_cast<int>(instr.shf.xmode.Value())); + + if (instr.is_b_imm) { + op_b = Immediate(static_cast<u32>(instr.shf.immediate)); + } + const s32 size = instr.shf.type == ShfType::Bits32 ? 32 : 64; + Node shift = instr.shf.wrap ? WrapShift(move(op_b), size) : ClampShift(move(op_b), size); + + Node negated_shift = Operation(OperationCode::INegate, shift); + Node low_shift = Operation(OperationCode::IAdd, move(negated_shift), Immediate(32)); + + const bool is_right = opid == OpCode::Id::SHF_RIGHT_R || opid == OpCode::Id::SHF_RIGHT_IMM; + Node value = (is_right ? ShiftRight : ShiftLeft)( + move(op_a), GetRegister(instr.gpr39), move(shift), move(low_shift), instr.shf.type); + + SetRegister(bb, instr.gpr0, move(value)); break; } default: diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp index cd984f763..bee7d8cad 100644 --- a/src/video_core/shader/decode/texture.cpp +++ b/src/video_core/shader/decode/texture.cpp @@ -144,7 +144,8 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { Node4 values; for (u32 element = 0; element < values.size(); ++element) { auto coords_copy = coords; - MetaTexture meta{sampler, {}, depth_compare, aoffi, {}, {}, {}, {}, component, element}; + MetaTexture meta{sampler, {}, depth_compare, aoffi, {}, {}, + {}, {}, component, element, {}}; values[element] = Operation(OperationCode::TextureGather, meta, std::move(coords_copy)); } @@ -161,16 +162,16 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { case OpCode::Id::TXD: { UNIMPLEMENTED_IF_MSG(instr.txd.UsesMiscMode(TextureMiscMode::AOFFI), "AOFFI is not implemented"); - UNIMPLEMENTED_IF_MSG(instr.txd.is_array != 0, "TXD Array is not implemented"); + const bool is_array = instr.txd.is_array != 0; u64 base_reg = instr.gpr8.Value(); const auto derivate_reg = instr.gpr20.Value(); const auto texture_type = instr.txd.texture_type.Value(); const auto coord_count = GetCoordCount(texture_type); - - const Sampler* sampler = is_bindless - ? GetBindlessSampler(base_reg, {{texture_type, false, false}}) - : GetSampler(instr.sampler, {{texture_type, false, false}}); + Node index_var{}; + const Sampler* sampler = + is_bindless ? 
GetBindlessSampler(base_reg, index_var, {{texture_type, is_array, false}}) + : GetSampler(instr.sampler, {{texture_type, is_array, false}}); Node4 values; if (sampler == nullptr) { for (u32 element = 0; element < values.size(); ++element) { @@ -179,6 +180,7 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { WriteTexInstructionFloat(bb, instr, values); break; } + if (is_bindless) { base_reg++; } @@ -192,8 +194,15 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { derivates.push_back(GetRegister(derivate_reg + derivate + 1)); } + Node array_node = {}; + if (is_array) { + const Node info_reg = GetRegister(base_reg + coord_count); + array_node = BitfieldExtract(info_reg, 0, 16); + } + for (u32 element = 0; element < values.size(); ++element) { - MetaTexture meta{*sampler, {}, {}, {}, {}, derivates, {}, {}, {}, element}; + MetaTexture meta{*sampler, array_node, {}, {}, {}, derivates, + {}, {}, {}, element, index_var}; values[element] = Operation(OperationCode::TextureGradient, std::move(meta), coords); } @@ -208,8 +217,9 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { // TODO: The new commits on the texture refactor, change the way samplers work. // Sadly, not all texture instructions specify the type of texture their sampler // uses. This must be fixed at a later instance. + Node index_var{}; const Sampler* sampler = - is_bindless ? GetBindlessSampler(instr.gpr8) : GetSampler(instr.sampler); + is_bindless ? GetBindlessSampler(instr.gpr8, index_var) : GetSampler(instr.sampler); if (sampler == nullptr) { u32 indexer = 0; @@ -233,7 +243,7 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { if (!instr.txq.IsComponentEnabled(element)) { continue; } - MetaTexture meta{*sampler, {}, {}, {}, {}, {}, {}, {}, {}, element}; + MetaTexture meta{*sampler, {}, {}, {}, {}, {}, {}, {}, {}, element, index_var}; const Node value = Operation(OperationCode::TextureQueryDimensions, meta, GetRegister(instr.gpr8.Value() + (is_bindless ? 1 : 0))); @@ -259,8 +269,9 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { auto texture_type = instr.tmml.texture_type.Value(); const bool is_array = instr.tmml.array != 0; + Node index_var{}; const Sampler* sampler = - is_bindless ? GetBindlessSampler(instr.gpr20) : GetSampler(instr.sampler); + is_bindless ? 
GetBindlessSampler(instr.gpr20, index_var) : GetSampler(instr.sampler); if (sampler == nullptr) { u32 indexer = 0; @@ -302,7 +313,7 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { continue; } auto params = coords; - MetaTexture meta{*sampler, {}, {}, {}, {}, {}, {}, {}, {}, element}; + MetaTexture meta{*sampler, {}, {}, {}, {}, {}, {}, {}, {}, element, index_var}; const Node value = Operation(OperationCode::TextureQueryLod, meta, std::move(params)); SetTemporary(bb, indexer++, value); } @@ -376,37 +387,65 @@ const Sampler* ShaderIR::GetSampler(const Tegra::Shader::Sampler& sampler, // Otherwise create a new mapping for this sampler const auto next_index = static_cast<u32>(used_samplers.size()); return &used_samplers.emplace_back(next_index, offset, info.type, info.is_array, info.is_shadow, - info.is_buffer); + info.is_buffer, false); } -const Sampler* ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg, +const Sampler* ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg, Node& index_var, std::optional<SamplerInfo> sampler_info) { const Node sampler_register = GetRegister(reg); - const auto [base_sampler, buffer, offset] = - TrackCbuf(sampler_register, global_code, static_cast<s64>(global_code.size())); - ASSERT(base_sampler != nullptr); - if (base_sampler == nullptr) { + const auto [base_node, tracked_sampler_info] = + TrackBindlessSampler(sampler_register, global_code, static_cast<s64>(global_code.size())); + ASSERT(base_node != nullptr); + if (base_node == nullptr) { return nullptr; } - const auto info = GetSamplerInfo(sampler_info, offset, buffer); + if (const auto bindless_sampler_info = + std::get_if<BindlessSamplerNode>(&*tracked_sampler_info)) { + const u32 buffer = bindless_sampler_info->GetIndex(); + const u32 offset = bindless_sampler_info->GetOffset(); + const auto info = GetSamplerInfo(sampler_info, offset, buffer); + + // If this sampler has already been used, return the existing mapping. + const auto it = + std::find_if(used_samplers.begin(), used_samplers.end(), + [buffer = buffer, offset = offset](const Sampler& entry) { + return entry.GetBuffer() == buffer && entry.GetOffset() == offset; + }); + if (it != used_samplers.end()) { + ASSERT(it->IsBindless() && it->GetType() == info.type && + it->IsArray() == info.is_array && it->IsShadow() == info.is_shadow); + return &*it; + } - // If this sampler has already been used, return the existing mapping. - const auto it = - std::find_if(used_samplers.begin(), used_samplers.end(), - [buffer = buffer, offset = offset](const Sampler& entry) { - return entry.GetBuffer() == buffer && entry.GetOffset() == offset; - }); - if (it != used_samplers.end()) { - ASSERT(it->IsBindless() && it->GetType() == info.type && it->IsArray() == info.is_array && - it->IsShadow() == info.is_shadow); - return &*it; - } + // Otherwise create a new mapping for this sampler + const auto next_index = static_cast<u32>(used_samplers.size()); + return &used_samplers.emplace_back(next_index, offset, buffer, info.type, info.is_array, + info.is_shadow, info.is_buffer, false); + } else if (const auto array_sampler_info = + std::get_if<ArraySamplerNode>(&*tracked_sampler_info)) { + const u32 base_offset = array_sampler_info->GetBaseOffset() / 4; + index_var = GetCustomVariable(array_sampler_info->GetIndexVar()); + const auto info = GetSamplerInfo(sampler_info, base_offset); + + // If this sampler has already been used, return the existing mapping. 
+ const auto it = std::find_if( + used_samplers.begin(), used_samplers.end(), + [base_offset](const Sampler& entry) { return entry.GetOffset() == base_offset; }); + if (it != used_samplers.end()) { + ASSERT(!it->IsBindless() && it->GetType() == info.type && + it->IsArray() == info.is_array && it->IsShadow() == info.is_shadow && + it->IsBuffer() == info.is_buffer && it->IsIndexed()); + return &*it; + } - // Otherwise create a new mapping for this sampler - const auto next_index = static_cast<u32>(used_samplers.size()); - return &used_samplers.emplace_back(next_index, offset, buffer, info.type, info.is_array, - info.is_shadow, info.is_buffer); + uses_indexed_samplers = true; + // Otherwise create a new mapping for this sampler + const auto next_index = static_cast<u32>(used_samplers.size()); + return &used_samplers.emplace_back(next_index, base_offset, info.type, info.is_array, + info.is_shadow, info.is_buffer, true); + } + return nullptr; } void ShaderIR::WriteTexInstructionFloat(NodeBlock& bb, Instruction instr, const Node4& components) { @@ -483,66 +522,53 @@ Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type, Node array, Node depth_compare, u32 bias_offset, std::vector<Node> aoffi, std::optional<Tegra::Shader::Register> bindless_reg) { - const auto is_array = static_cast<bool>(array); - const auto is_shadow = static_cast<bool>(depth_compare); + const bool is_array = array != nullptr; + const bool is_shadow = depth_compare != nullptr; const bool is_bindless = bindless_reg.has_value(); - UNIMPLEMENTED_IF_MSG((texture_type == TextureType::Texture3D && (is_array || is_shadow)) || - (texture_type == TextureType::TextureCube && is_array && is_shadow), - "This method is not supported."); + UNIMPLEMENTED_IF(texture_type == TextureType::TextureCube && is_array && is_shadow); + ASSERT_MSG(texture_type != TextureType::Texture3D || !is_array || !is_shadow, + "Illegal texture type"); const SamplerInfo info{texture_type, is_array, is_shadow, false}; - const Sampler* sampler = - is_bindless ? GetBindlessSampler(*bindless_reg, info) : GetSampler(instr.sampler, info); - Node4 values; - if (sampler == nullptr) { - for (u32 element = 0; element < values.size(); ++element) { - values[element] = Immediate(0); - } - return values; + Node index_var; + const Sampler* sampler = is_bindless ? GetBindlessSampler(*bindless_reg, index_var, info) + : GetSampler(instr.sampler, info); + if (!sampler) { + return {Immediate(0), Immediate(0), Immediate(0), Immediate(0)}; } const bool lod_needed = process_mode == TextureProcessMode::LZ || process_mode == TextureProcessMode::LL || process_mode == TextureProcessMode::LLA; - - // LOD selection (either via bias or explicit textureLod) not supported in GL for - // sampler2DArrayShadow and samplerCubeArrayShadow. - const bool gl_lod_supported = - !((texture_type == Tegra::Shader::TextureType::Texture2D && is_array && is_shadow) || - (texture_type == Tegra::Shader::TextureType::TextureCube && is_array && is_shadow)); - - const OperationCode read_method = - (lod_needed && gl_lod_supported) ? OperationCode::TextureLod : OperationCode::Texture; - - UNIMPLEMENTED_IF(process_mode != TextureProcessMode::None && !gl_lod_supported); + const OperationCode opcode = lod_needed ? 
OperationCode::TextureLod : OperationCode::Texture; Node bias; Node lod; - if (process_mode != TextureProcessMode::None && gl_lod_supported) { - switch (process_mode) { - case TextureProcessMode::LZ: - lod = Immediate(0.0f); - break; - case TextureProcessMode::LB: - // If present, lod or bias are always stored in the register - // indexed by the gpr20 field with an offset depending on the - // usage of the other registers - bias = GetRegister(instr.gpr20.Value() + bias_offset); - break; - case TextureProcessMode::LL: - lod = GetRegister(instr.gpr20.Value() + bias_offset); - break; - default: - UNIMPLEMENTED_MSG("Unimplemented process mode={}", static_cast<u32>(process_mode)); - break; - } + switch (process_mode) { + case TextureProcessMode::None: + break; + case TextureProcessMode::LZ: + lod = Immediate(0.0f); + break; + case TextureProcessMode::LB: + // If present, lod or bias are always stored in the register indexed by the gpr20 field with + // an offset depending on the usage of the other registers. + bias = GetRegister(instr.gpr20.Value() + bias_offset); + break; + case TextureProcessMode::LL: + lod = GetRegister(instr.gpr20.Value() + bias_offset); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented process mode={}", static_cast<u32>(process_mode)); + break; } + Node4 values; for (u32 element = 0; element < values.size(); ++element) { - auto copy_coords = coords; - MetaTexture meta{*sampler, array, depth_compare, aoffi, {}, {}, bias, lod, {}, element}; - values[element] = Operation(read_method, meta, std::move(copy_coords)); + MetaTexture meta{*sampler, array, depth_compare, aoffi, {}, {}, bias, + lod, {}, element, index_var}; + values[element] = Operation(opcode, meta, coords); } return values; @@ -589,7 +615,7 @@ Node4 ShaderIR::GetTexCode(Instruction instr, TextureType texture_type, aoffi = GetAoffiCoordinates(GetRegister(parameter_register++), coord_count, false); } - Node dc{}; + Node dc; if (depth_compare) { // Depth is always stored in the register signaled by gpr20 or in the next register if lod // or bias are used @@ -625,7 +651,7 @@ Node4 ShaderIR::GetTexsCode(Instruction instr, TextureType texture_type, const Node array = is_array ? GetRegister(array_register) : nullptr; - Node dc{}; + Node dc; if (depth_compare) { // Depth is always stored in the register signaled by gpr20 or in the next register if lod // or bias are used @@ -656,7 +682,8 @@ Node4 ShaderIR::GetTld4Code(Instruction instr, TextureType texture_type, bool de u64 parameter_register = instr.gpr20.Value(); const SamplerInfo info{texture_type, is_array, depth_compare, false}; - const Sampler* sampler = is_bindless ? GetBindlessSampler(parameter_register++, info) + Node index_var{}; + const Sampler* sampler = is_bindless ? 
GetBindlessSampler(parameter_register++, index_var, info) : GetSampler(instr.sampler, info); Node4 values; if (sampler == nullptr) { @@ -685,7 +712,8 @@ Node4 ShaderIR::GetTld4Code(Instruction instr, TextureType texture_type, bool de for (u32 element = 0; element < values.size(); ++element) { auto coords_copy = coords; MetaTexture meta{ - *sampler, GetRegister(array_register), dc, aoffi, ptp, {}, {}, {}, component, element}; + *sampler, GetRegister(array_register), dc, aoffi, ptp, {}, {}, {}, component, element, + index_var}; values[element] = Operation(OperationCode::TextureGather, meta, std::move(coords_copy)); } @@ -718,7 +746,7 @@ Node4 ShaderIR::GetTldCode(Tegra::Shader::Instruction instr) { Node4 values; for (u32 element = 0; element < values.size(); ++element) { auto coords_copy = coords; - MetaTexture meta{sampler, array_register, {}, {}, {}, {}, {}, lod, {}, element}; + MetaTexture meta{sampler, array_register, {}, {}, {}, {}, {}, lod, {}, element, {}}; values[element] = Operation(OperationCode::TexelFetch, meta, std::move(coords_copy)); } @@ -768,7 +796,7 @@ Node4 ShaderIR::GetTldsCode(Instruction instr, TextureType texture_type, bool is Node4 values; for (u32 element = 0; element < values.size(); ++element) { auto coords_copy = coords; - MetaTexture meta{sampler, array, {}, {}, {}, {}, {}, lod, {}, element}; + MetaTexture meta{sampler, array, {}, {}, {}, {}, {}, lod, {}, element, {}}; values[element] = Operation(OperationCode::TexelFetch, meta, std::move(coords_copy)); } return values; diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h index 075c7d07c..a0a7b9111 100644 --- a/src/video_core/shader/node.h +++ b/src/video_core/shader/node.h @@ -162,7 +162,7 @@ enum class OperationCode { AtomicImageXor, /// (MetaImage, int[N] coords) -> void AtomicImageExchange, /// (MetaImage, int[N] coords) -> void - UAtomicAdd, /// (smem, uint) -> uint + AtomicAdd, /// (memory, {u}int) -> {u}int Branch, /// (uint branch_target) -> void BranchIndirect, /// (uint branch_target) -> void @@ -212,6 +212,7 @@ enum class MetaStackClass { class OperationNode; class ConditionalNode; class GprNode; +class CustomVarNode; class ImmediateNode; class InternalFlagNode; class PredicateNode; @@ -223,26 +224,32 @@ class SmemNode; class GmemNode; class CommentNode; -using NodeData = std::variant<OperationNode, ConditionalNode, GprNode, ImmediateNode, +using NodeData = std::variant<OperationNode, ConditionalNode, GprNode, CustomVarNode, ImmediateNode, InternalFlagNode, PredicateNode, AbufNode, PatchNode, CbufNode, LmemNode, SmemNode, GmemNode, CommentNode>; using Node = std::shared_ptr<NodeData>; using Node4 = std::array<Node, 4>; using NodeBlock = std::vector<Node>; +class BindlessSamplerNode; +class ArraySamplerNode; + +using TrackSamplerData = std::variant<BindlessSamplerNode, ArraySamplerNode>; +using TrackSampler = std::shared_ptr<TrackSamplerData>; + class Sampler { public: /// This constructor is for bound samplers constexpr explicit Sampler(u32 index, u32 offset, Tegra::Shader::TextureType type, - bool is_array, bool is_shadow, bool is_buffer) + bool is_array, bool is_shadow, bool is_buffer, bool is_indexed) : index{index}, offset{offset}, type{type}, is_array{is_array}, is_shadow{is_shadow}, - is_buffer{is_buffer} {} + is_buffer{is_buffer}, is_indexed{is_indexed} {} /// This constructor is for bindless samplers constexpr explicit Sampler(u32 index, u32 offset, u32 buffer, Tegra::Shader::TextureType type, - bool is_array, bool is_shadow, bool is_buffer) + bool is_array, bool is_shadow, 
bool is_buffer, bool is_indexed) : index{index}, offset{offset}, buffer{buffer}, type{type}, is_array{is_array}, - is_shadow{is_shadow}, is_buffer{is_buffer}, is_bindless{true} {} + is_shadow{is_shadow}, is_buffer{is_buffer}, is_bindless{true}, is_indexed{is_indexed} {} constexpr u32 GetIndex() const { return index; @@ -276,16 +283,72 @@ public: return is_bindless; } + constexpr bool IsIndexed() const { + return is_indexed; + } + + constexpr u32 Size() const { + return size; + } + + constexpr void SetSize(u32 new_size) { + size = new_size; + } + private: u32 index{}; ///< Emulated index given for the this sampler. u32 offset{}; ///< Offset in the const buffer from where the sampler is being read. u32 buffer{}; ///< Buffer where the bindless sampler is being read (unused on bound samplers). + u32 size{}; ///< Size of the sampler if indexed. Tegra::Shader::TextureType type{}; ///< The type used to sample this texture (Texture2D, etc) bool is_array{}; ///< Whether the texture is being sampled as an array texture or not. bool is_shadow{}; ///< Whether the texture is being sampled as a depth texture or not. bool is_buffer{}; ///< Whether the texture is a texture buffer without sampler. bool is_bindless{}; ///< Whether this sampler belongs to a bindless texture or not. + bool is_indexed{}; ///< Whether this sampler is an indexed array of textures. +}; + +/// Represents a tracked bindless sampler into a direct const buffer +class ArraySamplerNode final { +public: + explicit ArraySamplerNode(u32 index, u32 base_offset, u32 bindless_var) + : index{index}, base_offset{base_offset}, bindless_var{bindless_var} {} + + constexpr u32 GetIndex() const { + return index; + } + + constexpr u32 GetBaseOffset() const { + return base_offset; + } + + constexpr u32 GetIndexVar() const { + return bindless_var; + } + +private: + u32 index; + u32 base_offset; + u32 bindless_var; +}; + +/// Represents a tracked bindless sampler into a direct const buffer +class BindlessSamplerNode final { +public: + explicit BindlessSamplerNode(u32 index, u32 offset) : index{index}, offset{offset} {} + + constexpr u32 GetIndex() const { + return index; + } + + constexpr u32 GetOffset() const { + return offset; + } + +private: + u32 index; + u32 offset; }; class Image final { @@ -380,8 +443,9 @@ struct MetaTexture { std::vector<Node> derivates; Node bias; Node lod; - Node component{}; + Node component; u32 element{}; + Node index; }; struct MetaImage { @@ -488,6 +552,19 @@ private: Tegra::Shader::Register index{}; }; +/// A custom variable +class CustomVarNode final { +public: + explicit constexpr CustomVarNode(u32 index) : index{index} {} + + constexpr u32 GetIndex() const { + return index; + } + +private: + u32 index{}; +}; + /// A 32-bits value that represents an immediate value class ImmediateNode final { public: diff --git a/src/video_core/shader/node_helper.h b/src/video_core/shader/node_helper.h index 0c2aa749b..11231bbea 100644 --- a/src/video_core/shader/node_helper.h +++ b/src/video_core/shader/node_helper.h @@ -45,6 +45,12 @@ Node MakeNode(Args&&... args) { return std::make_shared<NodeData>(T(std::forward<Args>(args)...)); } +template <typename T, typename... Args> +TrackSampler MakeTrackSampler(Args&&... args) { + static_assert(std::is_convertible_v<T, TrackSamplerData>); + return std::make_shared<TrackSamplerData>(T(std::forward<Args>(args)...)); +} + template <typename... Args> Node Operation(OperationCode code, Args&&... 
args) { if constexpr (sizeof...(args) == 0) { diff --git a/src/video_core/shader/shader_ir.cpp b/src/video_core/shader/shader_ir.cpp index 31eecb3f4..3a5d280a9 100644 --- a/src/video_core/shader/shader_ir.cpp +++ b/src/video_core/shader/shader_ir.cpp @@ -27,6 +27,7 @@ ShaderIR::ShaderIR(const ProgramCode& program_code, u32 main_offset, CompilerSet ConstBufferLocker& locker) : program_code{program_code}, main_offset{main_offset}, settings{settings}, locker{locker} { Decode(); + PostDecode(); } ShaderIR::~ShaderIR() = default; @@ -38,6 +39,10 @@ Node ShaderIR::GetRegister(Register reg) { return MakeNode<GprNode>(reg); } +Node ShaderIR::GetCustomVariable(u32 id) { + return MakeNode<CustomVarNode>(id); +} + Node ShaderIR::GetImmediate19(Instruction instr) { return Immediate(instr.alu.GetImm20_19()); } @@ -452,4 +457,8 @@ std::size_t ShaderIR::DeclareAmend(Node new_amend) { return id; } +u32 ShaderIR::NewCustomVariable() { + return num_custom_variables++; +} + } // namespace VideoCommon::Shader diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h index ba1db4c11..b0851c3be 100644 --- a/src/video_core/shader/shader_ir.h +++ b/src/video_core/shader/shader_ir.h @@ -180,6 +180,10 @@ public: return amend_code[index]; } + u32 GetNumCustomVariables() const { + return num_custom_variables; + } + private: friend class ASTDecoder; @@ -191,6 +195,7 @@ private: }; void Decode(); + void PostDecode(); NodeBlock DecodeRange(u32 begin, u32 end); void DecodeRangeInner(NodeBlock& bb, u32 begin, u32 end); @@ -235,6 +240,8 @@ private: /// Generates a node for a passed register. Node GetRegister(Tegra::Shader::Register reg); + /// Generates a node for a custom variable + Node GetCustomVariable(u32 id); /// Generates a node representing a 19-bit immediate value Node GetImmediate19(Tegra::Shader::Instruction instr); /// Generates a node representing a 32-bit immediate value @@ -321,7 +328,7 @@ private: std::optional<SamplerInfo> sampler_info = std::nullopt); /// Accesses a texture sampler for a bindless texture. - const Sampler* GetBindlessSampler(Tegra::Shader::Register reg, + const Sampler* GetBindlessSampler(Tegra::Shader::Register reg, Node& index_var, std::optional<SamplerInfo> sampler_info = std::nullopt); /// Accesses an image. @@ -387,6 +394,9 @@ private: std::tuple<Node, u32, u32> TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const; + std::tuple<Node, TrackSampler> TrackBindlessSampler(Node tracked, const NodeBlock& code, + s64 cursor); + std::optional<u32> TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) const; std::pair<Node, s64> TrackRegister(const GprNode* tracked, const NodeBlock& code, @@ -399,6 +409,8 @@ private: /// Register new amending code and obtain the reference id. 
std::size_t DeclareAmend(Node new_amend); + u32 NewCustomVariable(); + const ProgramCode& program_code; const u32 main_offset; const CompilerSettings settings; @@ -414,6 +426,7 @@ private: NodeBlock global_code; ASTManager program_manager{true, true}; std::vector<Node> amend_code; + u32 num_custom_variables{}; std::set<u32> used_registers; std::set<Tegra::Shader::Pred> used_predicates; @@ -431,6 +444,7 @@ private: bool uses_instance_id{}; bool uses_vertex_id{}; bool uses_warps{}; + bool uses_indexed_samplers{}; Tegra::Shader::Header header; }; diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp index 165c79330..face8c943 100644 --- a/src/video_core/shader/track.cpp +++ b/src/video_core/shader/track.cpp @@ -8,6 +8,7 @@ #include "common/common_types.h" #include "video_core/shader/node.h" +#include "video_core/shader/node_helper.h" #include "video_core/shader/shader_ir.h" namespace VideoCommon::Shader { @@ -35,8 +36,113 @@ std::pair<Node, s64> FindOperation(const NodeBlock& code, s64 cursor, } return {}; } + +std::optional<std::pair<Node, Node>> DecoupleIndirectRead(const OperationNode& operation) { + if (operation.GetCode() != OperationCode::UAdd) { + return std::nullopt; + } + Node gpr; + Node offset; + ASSERT(operation.GetOperandsCount() == 2); + for (std::size_t i = 0; i < operation.GetOperandsCount(); i++) { + Node operand = operation[i]; + if (std::holds_alternative<ImmediateNode>(*operand)) { + offset = operation[i]; + } else if (std::holds_alternative<GprNode>(*operand)) { + gpr = operation[i]; + } + } + if (offset && gpr) { + return std::make_pair(gpr, offset); + } + return std::nullopt; +} + +bool AmendNodeCv(std::size_t amend_index, Node node) { + if (const auto operation = std::get_if<OperationNode>(&*node)) { + operation->SetAmendIndex(amend_index); + return true; + } else if (const auto conditional = std::get_if<ConditionalNode>(&*node)) { + conditional->SetAmendIndex(amend_index); + return true; + } + return false; +} + } // Anonymous namespace +std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, const NodeBlock& code, + s64 cursor) { + if (const auto cbuf = std::get_if<CbufNode>(&*tracked)) { + // Constant buffer found, test if it's an immediate + const auto offset = cbuf->GetOffset(); + if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) { + auto track = + MakeTrackSampler<BindlessSamplerNode>(cbuf->GetIndex(), immediate->GetValue()); + return {tracked, track}; + } else if (const auto operation = std::get_if<OperationNode>(&*offset)) { + auto bound_buffer = locker.ObtainBoundBuffer(); + if (!bound_buffer) { + return {}; + } + if (*bound_buffer != cbuf->GetIndex()) { + return {}; + } + auto pair = DecoupleIndirectRead(*operation); + if (!pair) { + return {}; + } + auto [gpr, base_offset] = *pair; + const auto offset_inm = std::get_if<ImmediateNode>(&*base_offset); + auto gpu_driver = locker.AccessGuestDriverProfile(); + if (gpu_driver == nullptr) { + return {}; + } + const u32 bindless_cv = NewCustomVariable(); + const Node op = Operation(OperationCode::UDiv, NO_PRECISE, gpr, + Immediate(gpu_driver->GetTextureHandlerSize())); + + const Node cv_node = GetCustomVariable(bindless_cv); + Node amend_op = Operation(OperationCode::Assign, cv_node, std::move(op)); + const std::size_t amend_index = DeclareAmend(amend_op); + AmendNodeCv(amend_index, code[cursor]); + // TODO Implement Bindless Index custom variable + auto track = MakeTrackSampler<ArraySamplerNode>(cbuf->GetIndex(), + offset_inm->GetValue(), bindless_cv); + 
return {tracked, track}; + } + return {}; + } + if (const auto gpr = std::get_if<GprNode>(&*tracked)) { + if (gpr->GetIndex() == Tegra::Shader::Register::ZeroIndex) { + return {}; + } + // Reduce the cursor in one to avoid infinite loops when the instruction sets the same + // register that it uses as operand + const auto [source, new_cursor] = TrackRegister(gpr, code, cursor - 1); + if (!source) { + return {}; + } + return TrackBindlessSampler(source, code, new_cursor); + } + if (const auto operation = std::get_if<OperationNode>(&*tracked)) { + for (std::size_t i = operation->GetOperandsCount(); i > 0; --i) { + if (auto found = TrackBindlessSampler((*operation)[i - 1], code, cursor); + std::get<0>(found)) { + // Cbuf found in operand. + return found; + } + } + return {}; + } + if (const auto conditional = std::get_if<ConditionalNode>(&*tracked)) { + const auto& conditional_code = conditional->GetCode(); + return TrackBindlessSampler(tracked, conditional_code, + static_cast<s64>(conditional_code.size())); + } + return {}; +} + std::tuple<Node, u32, u32> ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const { if (const auto cbuf = std::get_if<CbufNode>(&*tracked)) { diff --git a/src/video_core/texture_cache/surface_base.cpp b/src/video_core/texture_cache/surface_base.cpp index 829268b4c..84469b7ba 100644 --- a/src/video_core/texture_cache/surface_base.cpp +++ b/src/video_core/texture_cache/surface_base.cpp @@ -135,7 +135,7 @@ std::vector<CopyParams> SurfaceBaseImpl::BreakDownLayered(const SurfaceParams& i for (u32 level = 0; level < mipmaps; level++) { const u32 width = SurfaceParams::IntersectWidth(params, in_params, level, level); const u32 height = SurfaceParams::IntersectHeight(params, in_params, level, level); - result.emplace_back(width, height, layer, level); + result.emplace_back(0, 0, layer, 0, 0, layer, level, level, width, height, 1); } } return result; diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index f4c015635..0d105d386 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -721,7 +721,6 @@ private: std::pair<TSurface, TView> GetSurface(const GPUVAddr gpu_addr, const CacheAddr cache_addr, const SurfaceParams& params, bool preserve_contents, bool is_render) { - // Step 1 // Check Level 1 Cache for a fast structural match. If candidate surface // matches at certain level we are pretty much done. 
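The texture_cache.h hunk that follows reworks the fast structural-match path in GetSurface: the old condition only rejected the case where the requested target was Texture3D and did not match, while the new code also inspects the cached surface's target, so the fast path is taken only when neither side is a 3D target or the targets match exactly. A minimal sketch of the resulting decision, using stand-in types; FastPath, Action and targets_match are illustrative names and are not part of the patch:

#include <optional>

// Sketch only: simplified stand-ins for the cache's types.
enum class MatchStructureResult { None, SemiMatch, FullMatch };
enum class SurfaceTarget { Texture2D, Texture3D };
enum class Action { ManageStructuralMatch, RebuildSurface };

struct SurfaceParams {
    SurfaceTarget target;
};

// Returns the fast-path action when a structural match may be reused, or
// std::nullopt when GetSurface has to fall through to the slower overlap
// handling. The 3D restriction now applies when either side is a 3D target.
std::optional<Action> FastPath(MatchStructureResult struct_result, const SurfaceParams& params,
                               const SurfaceParams& old_params, bool targets_match) {
    if (struct_result == MatchStructureResult::None) {
        return std::nullopt;
    }
    const bool not_3d = params.target != SurfaceTarget::Texture3D &&
                        old_params.target != SurfaceTarget::Texture3D;
    if (!not_3d && !targets_match) {
        return std::nullopt;
    }
    return struct_result == MatchStructureResult::FullMatch ? Action::ManageStructuralMatch
                                                            : Action::RebuildSurface;
}

In the actual hunk below the same check is written inline, with old_params fetched via current_surface->GetSurfaceParams() and targets_match corresponding to current_surface->MatchTarget(params.target).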
@@ -733,14 +732,18 @@ private: return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, topological_result); } + const auto struct_result = current_surface->MatchesStructure(params); - if (struct_result != MatchStructureResult::None && - (params.target != SurfaceTarget::Texture3D || - current_surface->MatchTarget(params.target))) { - if (struct_result == MatchStructureResult::FullMatch) { - return ManageStructuralMatch(current_surface, params, is_render); - } else { - return RebuildSurface(current_surface, params, is_render); + if (struct_result != MatchStructureResult::None) { + const auto& old_params = current_surface->GetSurfaceParams(); + const bool not_3d = params.target != SurfaceTarget::Texture3D && + old_params.target != SurfaceTarget::Texture3D; + if (not_3d || current_surface->MatchTarget(params.target)) { + if (struct_result == MatchStructureResult::FullMatch) { + return ManageStructuralMatch(current_surface, params, is_render); + } else { + return RebuildSurface(current_surface, params, is_render); + } } } } diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp index 8e947394c..a5f81a8a0 100644 --- a/src/video_core/video_core.cpp +++ b/src/video_core/video_core.cpp @@ -3,19 +3,32 @@ // Refer to the license.txt file included. #include <memory> +#include "common/logging/log.h" #include "core/core.h" #include "core/settings.h" #include "video_core/gpu_asynch.h" #include "video_core/gpu_synch.h" #include "video_core/renderer_base.h" #include "video_core/renderer_opengl/renderer_opengl.h" +#ifdef HAS_VULKAN +#include "video_core/renderer_vulkan/renderer_vulkan.h" +#endif #include "video_core/video_core.h" namespace VideoCore { std::unique_ptr<RendererBase> CreateRenderer(Core::Frontend::EmuWindow& emu_window, Core::System& system) { - return std::make_unique<OpenGL::RendererOpenGL>(emu_window, system); + switch (Settings::values.renderer_backend) { + case Settings::RendererBackend::OpenGL: + return std::make_unique<OpenGL::RendererOpenGL>(emu_window, system); +#ifdef HAS_VULKAN + case Settings::RendererBackend::Vulkan: + return std::make_unique<Vulkan::RendererVulkan>(emu_window, system); +#endif + default: + return nullptr; + } } std::unique_ptr<Tegra::GPU> CreateGPU(Core::System& system) { diff --git a/src/web_service/telemetry_json.cpp b/src/web_service/telemetry_json.cpp index 9156ce802..7538389bf 100644 --- a/src/web_service/telemetry_json.cpp +++ b/src/web_service/telemetry_json.cpp @@ -117,6 +117,7 @@ bool TelemetryJson::SubmitTestcase() { impl->SerializeSection(Telemetry::FieldType::Session, "Session"); impl->SerializeSection(Telemetry::FieldType::UserFeedback, "UserFeedback"); impl->SerializeSection(Telemetry::FieldType::UserSystem, "UserSystem"); + impl->SerializeSection(Telemetry::FieldType::UserConfig, "UserConfig"); auto content = impl->TopSection().dump(); Client client(impl->host, impl->username, impl->token); diff --git a/src/web_service/web_backend.cpp b/src/web_service/web_backend.cpp index 6683f459f..737ffe409 100644 --- a/src/web_service/web_backend.cpp +++ b/src/web_service/web_backend.cpp @@ -73,14 +73,12 @@ struct Client::Impl { if (!parsedUrl.GetPort(&port)) { port = HTTP_PORT; } - cli = std::make_unique<httplib::Client>(parsedUrl.m_Host.c_str(), port, - TIMEOUT_SECONDS); + cli = std::make_unique<httplib::Client>(parsedUrl.m_Host.c_str(), port); } else if (parsedUrl.m_Scheme == "https") { if (!parsedUrl.GetPort(&port)) { port = HTTPS_PORT; } - cli = std::make_unique<httplib::SSLClient>(parsedUrl.m_Host.c_str(), port, - 
TIMEOUT_SECONDS); + cli = std::make_unique<httplib::SSLClient>(parsedUrl.m_Host.c_str(), port); } else { LOG_ERROR(WebService, "Bad URL scheme {}", parsedUrl.m_Scheme); return Common::WebResult{Common::WebResult::Code::InvalidURL, "Bad URL scheme"}; @@ -90,6 +88,7 @@ struct Client::Impl { LOG_ERROR(WebService, "Invalid URL {}", host + path); return Common::WebResult{Common::WebResult::Code::InvalidURL, "Invalid URL"}; } + cli->set_timeout_sec(TIMEOUT_SECONDS); httplib::Headers params; if (!jwt.empty()) { diff --git a/src/yuzu/CMakeLists.txt b/src/yuzu/CMakeLists.txt index 11ae1e66e..b841e63fa 100644 --- a/src/yuzu/CMakeLists.txt +++ b/src/yuzu/CMakeLists.txt @@ -36,9 +36,6 @@ add_executable(yuzu configuration/configure_filesystem.cpp configuration/configure_filesystem.h configuration/configure_filesystem.ui - configuration/configure_gamelist.cpp - configuration/configure_gamelist.h - configuration/configure_gamelist.ui configuration/configure_general.cpp configuration/configure_general.h configuration/configure_general.ui @@ -75,6 +72,9 @@ add_executable(yuzu configuration/configure_touchscreen_advanced.cpp configuration/configure_touchscreen_advanced.h configuration/configure_touchscreen_advanced.ui + configuration/configure_ui.cpp + configuration/configure_ui.h + configuration/configure_ui.ui configuration/configure_web.cpp configuration/configure_web.h configuration/configure_web.ui @@ -200,3 +200,8 @@ if (MSVC) copy_yuzu_SDL_deps(yuzu) copy_yuzu_unicorn_deps(yuzu) endif() + +if (ENABLE_VULKAN) + target_include_directories(yuzu PRIVATE ../../externals/Vulkan-Headers/include) + target_compile_definitions(yuzu PRIVATE HAS_VULKAN) +endif() diff --git a/src/yuzu/bootmanager.cpp b/src/yuzu/bootmanager.cpp index 7490fb718..55a37fffa 100644 --- a/src/yuzu/bootmanager.cpp +++ b/src/yuzu/bootmanager.cpp @@ -2,19 +2,30 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include <glad/glad.h> + #include <QApplication> #include <QHBoxLayout> #include <QKeyEvent> +#include <QMessageBox> #include <QOffscreenSurface> #include <QOpenGLWindow> #include <QPainter> #include <QScreen> +#include <QStringList> #include <QWindow> +#ifdef HAS_VULKAN +#include <QVulkanWindow> +#endif + #include <fmt/format.h> + +#include "common/assert.h" #include "common/microprofile.h" #include "common/scm_rev.h" #include "core/core.h" #include "core/frontend/framebuffer_layout.h" +#include "core/frontend/scope_acquire_window_context.h" #include "core/settings.h" #include "input_common/keyboard.h" #include "input_common/main.h" @@ -114,19 +125,10 @@ private: QOpenGLContext context; }; -// This class overrides paintEvent and resizeEvent to prevent the GUI thread from stealing GL -// context. 
-// The corresponding functionality is handled in EmuThread instead -class GGLWidgetInternal : public QOpenGLWindow { +class GWidgetInternal : public QWindow { public: - GGLWidgetInternal(GRenderWindow* parent, QOpenGLContext* shared_context) - : QOpenGLWindow(shared_context), parent(parent) {} - - void paintEvent(QPaintEvent* ev) override { - if (do_painting) { - QPainter painter(this); - } - } + GWidgetInternal(GRenderWindow* parent) : parent(parent) {} + virtual ~GWidgetInternal() = default; void resizeEvent(QResizeEvent* ev) override { parent->OnClientAreaResized(ev->size().width(), ev->size().height()); @@ -182,11 +184,47 @@ public: do_painting = true; } + std::pair<unsigned, unsigned> GetSize() const { + return std::make_pair(width(), height()); + } + +protected: + bool IsPaintingEnabled() const { + return do_painting; + } + private: GRenderWindow* parent; - bool do_painting; + bool do_painting = false; +}; + +// This class overrides paintEvent and resizeEvent to prevent the GUI thread from stealing GL +// context. +// The corresponding functionality is handled in EmuThread instead +class GGLWidgetInternal final : public GWidgetInternal, public QOpenGLWindow { +public: + GGLWidgetInternal(GRenderWindow* parent, QOpenGLContext* shared_context) + : GWidgetInternal(parent), QOpenGLWindow(shared_context) {} + ~GGLWidgetInternal() override = default; + + void paintEvent(QPaintEvent* ev) override { + if (IsPaintingEnabled()) { + QPainter painter(this); + } + } }; +#ifdef HAS_VULKAN +class GVKWidgetInternal final : public GWidgetInternal { +public: + GVKWidgetInternal(GRenderWindow* parent, QVulkanInstance* instance) : GWidgetInternal(parent) { + setSurfaceType(QSurface::SurfaceType::VulkanSurface); + setVulkanInstance(instance); + } + ~GVKWidgetInternal() override = default; +}; +#endif + GRenderWindow::GRenderWindow(GMainWindow* parent, EmuThread* emu_thread) : QWidget(parent), emu_thread(emu_thread) { setWindowTitle(QStringLiteral("yuzu %1 | %2-%3") @@ -201,9 +239,15 @@ GRenderWindow::GRenderWindow(GMainWindow* parent, EmuThread* emu_thread) GRenderWindow::~GRenderWindow() { InputCommon::Shutdown(); + + // Avoid an unordered destruction that generates a segfault + delete child; } void GRenderWindow::moveContext() { + if (!context) { + return; + } DoneCurrent(); // If the thread started running, move the GL Context to the new thread. 
Otherwise, move it @@ -215,8 +259,9 @@ void GRenderWindow::moveContext() { } void GRenderWindow::SwapBuffers() { - context->swapBuffers(child); - + if (context) { + context->swapBuffers(child); + } if (!first_frame) { first_frame = true; emit FirstFrameDisplayed(); @@ -224,15 +269,38 @@ void GRenderWindow::SwapBuffers() { } void GRenderWindow::MakeCurrent() { - context->makeCurrent(child); + if (context) { + context->makeCurrent(child); + } } void GRenderWindow::DoneCurrent() { - context->doneCurrent(); + if (context) { + context->doneCurrent(); + } } void GRenderWindow::PollEvents() {} +bool GRenderWindow::IsShown() const { + return !isMinimized(); +} + +void GRenderWindow::RetrieveVulkanHandlers(void* get_instance_proc_addr, void* instance, + void* surface) const { +#ifdef HAS_VULKAN + const auto instance_proc_addr = vk_instance->getInstanceProcAddr("vkGetInstanceProcAddr"); + const VkInstance instance_copy = vk_instance->vkInstance(); + const VkSurfaceKHR surface_copy = vk_instance->surfaceForWindow(child); + + std::memcpy(get_instance_proc_addr, &instance_proc_addr, sizeof(instance_proc_addr)); + std::memcpy(instance, &instance_copy, sizeof(instance_copy)); + std::memcpy(surface, &surface_copy, sizeof(surface_copy)); +#else + UNREACHABLE_MSG("Executing Vulkan code without compiling Vulkan"); +#endif +} + // On Qt 5.0+, this correctly gets the size of the framebuffer (pixels). // // Older versions get the window size (density independent pixels), @@ -241,10 +309,9 @@ void GRenderWindow::PollEvents() {} void GRenderWindow::OnFramebufferSizeChanged() { // Screen changes potentially incur a change in screen DPI, hence we should update the // framebuffer size - const qreal pixel_ratio = GetWindowPixelRatio(); - const u32 width = child->QPaintDevice::width() * pixel_ratio; - const u32 height = child->QPaintDevice::height() * pixel_ratio; - UpdateCurrentFramebufferLayout(width, height); + const qreal pixelRatio{GetWindowPixelRatio()}; + const auto size{child->GetSize()}; + UpdateCurrentFramebufferLayout(size.first * pixelRatio, size.second * pixelRatio); } void GRenderWindow::ForwardKeyPressEvent(QKeyEvent* event) { @@ -290,7 +357,7 @@ qreal GRenderWindow::GetWindowPixelRatio() const { } std::pair<u32, u32> GRenderWindow::ScaleTouch(const QPointF pos) const { - const qreal pixel_ratio = GetWindowPixelRatio(); + const qreal pixel_ratio{GetWindowPixelRatio()}; return {static_cast<u32>(std::max(std::round(pos.x() * pixel_ratio), qreal{0.0})), static_cast<u32>(std::max(std::round(pos.y() * pixel_ratio), qreal{0.0}))}; } @@ -356,50 +423,46 @@ std::unique_ptr<Core::Frontend::GraphicsContext> GRenderWindow::CreateSharedCont return std::make_unique<GGLContext>(context.get()); } -void GRenderWindow::InitRenderTarget() { +bool GRenderWindow::InitRenderTarget() { shared_context.reset(); context.reset(); - - delete child; - child = nullptr; - - delete container; - container = nullptr; - - delete layout(); + if (child) { + delete child; + } + if (container) { + delete container; + } + if (layout()) { + delete layout(); + } first_frame = false; - // TODO: One of these flags might be interesting: WA_OpaquePaintEvent, WA_NoBackground, - // WA_DontShowOnScreen, WA_DeleteOnClose - QSurfaceFormat fmt; - fmt.setVersion(4, 3); - fmt.setProfile(QSurfaceFormat::CompatibilityProfile); - fmt.setOption(QSurfaceFormat::FormatOption::DeprecatedFunctions); - // TODO: expose a setting for buffer value (ie default/single/double/triple) - fmt.setSwapBehavior(QSurfaceFormat::DefaultSwapBehavior); - shared_context = 
std::make_unique<QOpenGLContext>(); - shared_context->setFormat(fmt); - shared_context->create(); - context = std::make_unique<QOpenGLContext>(); - context->setShareContext(shared_context.get()); - context->setFormat(fmt); - context->create(); - fmt.setSwapInterval(0); + switch (Settings::values.renderer_backend) { + case Settings::RendererBackend::OpenGL: + if (!InitializeOpenGL()) { + return false; + } + break; + case Settings::RendererBackend::Vulkan: + if (!InitializeVulkan()) { + return false; + } + break; + } - child = new GGLWidgetInternal(this, shared_context.get()); container = QWidget::createWindowContainer(child, this); - QBoxLayout* layout = new QHBoxLayout(this); + layout->addWidget(container); layout->setMargin(0); setLayout(layout); - // Reset minimum size to avoid unwanted resizes when this function is called for a second time. + // Reset minimum required size to avoid resizing issues on the main window after restarting. setMinimumSize(1, 1); - // Show causes the window to actually be created and the OpenGL context as well, but we don't - // want the widget to be shown yet, so immediately hide it. + // Show causes the window to actually be created and the gl context as well, but we don't want + // the widget to be shown yet, so immediately hide it. show(); hide(); @@ -410,9 +473,17 @@ void GRenderWindow::InitRenderTarget() { OnMinimalClientAreaChangeRequest(GetActiveConfig().min_client_area_size); OnFramebufferSizeChanged(); - NotifyClientAreaSizeChanged(std::pair<unsigned, unsigned>(child->width(), child->height())); + NotifyClientAreaSizeChanged(child->GetSize()); BackupGeometry(); + + if (Settings::values.renderer_backend == Settings::RendererBackend::OpenGL) { + if (!LoadOpenGL()) { + return false; + } + } + + return true; } void GRenderWindow::CaptureScreenshot(u32 res_scale, const QString& screenshot_path) { @@ -441,6 +512,113 @@ void GRenderWindow::OnMinimalClientAreaChangeRequest(std::pair<u32, u32> minimal setMinimumSize(minimal_size.first, minimal_size.second); } +bool GRenderWindow::InitializeOpenGL() { + // TODO: One of these flags might be interesting: WA_OpaquePaintEvent, WA_NoBackground, + // WA_DontShowOnScreen, WA_DeleteOnClose + QSurfaceFormat fmt; + fmt.setVersion(4, 3); + fmt.setProfile(QSurfaceFormat::CompatibilityProfile); + fmt.setOption(QSurfaceFormat::FormatOption::DeprecatedFunctions); + // TODO: expose a setting for buffer value (ie default/single/double/triple) + fmt.setSwapBehavior(QSurfaceFormat::DefaultSwapBehavior); + shared_context = std::make_unique<QOpenGLContext>(); + shared_context->setFormat(fmt); + shared_context->create(); + context = std::make_unique<QOpenGLContext>(); + context->setShareContext(shared_context.get()); + context->setFormat(fmt); + context->create(); + fmt.setSwapInterval(false); + + child = new GGLWidgetInternal(this, shared_context.get()); + return true; +} + +bool GRenderWindow::InitializeVulkan() { +#ifdef HAS_VULKAN + vk_instance = std::make_unique<QVulkanInstance>(); + vk_instance->setApiVersion(QVersionNumber(1, 1, 0)); + vk_instance->setFlags(QVulkanInstance::Flag::NoDebugOutputRedirect); + if (Settings::values.renderer_debug) { + const auto supported_layers{vk_instance->supportedLayers()}; + const bool found = + std::find_if(supported_layers.begin(), supported_layers.end(), [](const auto& layer) { + constexpr const char searched_layer[] = "VK_LAYER_LUNARG_standard_validation"; + return layer.name == searched_layer; + }); + if (found) { + vk_instance->setLayers(QByteArrayList() << 
"VK_LAYER_LUNARG_standard_validation"); + vk_instance->setExtensions(QByteArrayList() << VK_EXT_DEBUG_UTILS_EXTENSION_NAME); + } + } + if (!vk_instance->create()) { + QMessageBox::critical( + this, tr("Error while initializing Vulkan 1.1!"), + tr("Your OS doesn't seem to support Vulkan 1.1 instances, or you do not have the " + "latest graphics drivers.")); + return false; + } + + child = new GVKWidgetInternal(this, vk_instance.get()); + return true; +#else + QMessageBox::critical(this, tr("Vulkan not available!"), + tr("yuzu has not been compiled with Vulkan support.")); + return false; +#endif +} + +bool GRenderWindow::LoadOpenGL() { + Core::Frontend::ScopeAcquireWindowContext acquire_context{*this}; + if (!gladLoadGL()) { + QMessageBox::critical(this, tr("Error while initializing OpenGL 4.3!"), + tr("Your GPU may not support OpenGL 4.3, or you do not have the " + "latest graphics driver.")); + return false; + } + + QStringList unsupported_gl_extensions = GetUnsupportedGLExtensions(); + if (!unsupported_gl_extensions.empty()) { + QMessageBox::critical( + this, tr("Error while initializing OpenGL!"), + tr("Your GPU may not support one or more required OpenGL extensions. Please ensure you " + "have the latest graphics driver.<br><br>Unsupported extensions:<br>") + + unsupported_gl_extensions.join(QStringLiteral("<br>"))); + return false; + } + return true; +} + +QStringList GRenderWindow::GetUnsupportedGLExtensions() const { + QStringList unsupported_ext; + + if (!GLAD_GL_ARB_buffer_storage) + unsupported_ext.append(QStringLiteral("ARB_buffer_storage")); + if (!GLAD_GL_ARB_direct_state_access) + unsupported_ext.append(QStringLiteral("ARB_direct_state_access")); + if (!GLAD_GL_ARB_vertex_type_10f_11f_11f_rev) + unsupported_ext.append(QStringLiteral("ARB_vertex_type_10f_11f_11f_rev")); + if (!GLAD_GL_ARB_texture_mirror_clamp_to_edge) + unsupported_ext.append(QStringLiteral("ARB_texture_mirror_clamp_to_edge")); + if (!GLAD_GL_ARB_multi_bind) + unsupported_ext.append(QStringLiteral("ARB_multi_bind")); + if (!GLAD_GL_ARB_clip_control) + unsupported_ext.append(QStringLiteral("ARB_clip_control")); + + // Extensions required to support some texture formats. 
+ if (!GLAD_GL_EXT_texture_compression_s3tc) + unsupported_ext.append(QStringLiteral("EXT_texture_compression_s3tc")); + if (!GLAD_GL_ARB_texture_compression_rgtc) + unsupported_ext.append(QStringLiteral("ARB_texture_compression_rgtc")); + if (!GLAD_GL_ARB_depth_buffer_float) + unsupported_ext.append(QStringLiteral("ARB_depth_buffer_float")); + + for (const QString& ext : unsupported_ext) + LOG_CRITICAL(Frontend, "Unsupported GL extension: {}", ext.toStdString()); + + return unsupported_ext; +} + void GRenderWindow::OnEmulationStarting(EmuThread* emu_thread) { this->emu_thread = emu_thread; child->DisablePainting(); diff --git a/src/yuzu/bootmanager.h b/src/yuzu/bootmanager.h index 2fc64895f..71a2fa321 100644 --- a/src/yuzu/bootmanager.h +++ b/src/yuzu/bootmanager.h @@ -7,17 +7,28 @@ #include <atomic> #include <condition_variable> #include <mutex> + #include <QImage> #include <QThread> #include <QWidget> + +#include "common/thread.h" #include "core/core.h" #include "core/frontend/emu_window.h" class QKeyEvent; class QScreen; class QTouchEvent; +class QStringList; +class QSurface; +class QOpenGLContext; +#ifdef HAS_VULKAN +class QVulkanInstance; +#endif +class GWidgetInternal; class GGLWidgetInternal; +class GVKWidgetInternal; class GMainWindow; class GRenderWindow; class QSurface; @@ -123,6 +134,9 @@ public: void MakeCurrent() override; void DoneCurrent() override; void PollEvents() override; + bool IsShown() const override; + void RetrieveVulkanHandlers(void* get_instance_proc_addr, void* instance, + void* surface) const override; std::unique_ptr<Core::Frontend::GraphicsContext> CreateSharedContext() const override; void ForwardKeyPressEvent(QKeyEvent* event); @@ -142,7 +156,7 @@ public: void OnClientAreaResized(u32 width, u32 height); - void InitRenderTarget(); + bool InitRenderTarget(); void CaptureScreenshot(u32 res_scale, const QString& screenshot_path); @@ -165,10 +179,13 @@ private: void OnMinimalClientAreaChangeRequest(std::pair<u32, u32> minimal_size) override; - QWidget* container = nullptr; - GGLWidgetInternal* child = nullptr; + bool InitializeOpenGL(); + bool InitializeVulkan(); + bool LoadOpenGL(); + QStringList GetUnsupportedGLExtensions() const; - QByteArray geometry; + QWidget* container = nullptr; + GWidgetInternal* child = nullptr; EmuThread* emu_thread; // Context that backs the GGLWidgetInternal (and will be used by core to render) @@ -177,9 +194,14 @@ private: // current std::unique_ptr<QOpenGLContext> shared_context; +#ifdef HAS_VULKAN + std::unique_ptr<QVulkanInstance> vk_instance; +#endif + /// Temporary storage of the screenshot taken QImage screenshot_image; + QByteArray geometry; bool first_frame = false; protected: diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp index f92a4b3c3..6209fff75 100644 --- a/src/yuzu/configuration/config.cpp +++ b/src/yuzu/configuration/config.cpp @@ -10,6 +10,7 @@ #include "core/hle/service/acc/profile_manager.h" #include "core/hle/service/hid/controllers/npad.h" #include "input_common/main.h" +#include "input_common/udp/client.h" #include "yuzu/configuration/config.h" #include "yuzu/uisettings.h" @@ -429,6 +430,16 @@ void Config::ReadControlValues() { QStringLiteral("engine:motion_emu,update_period:100,sensitivity:0.01")) .toString() .toStdString(); + Settings::values.udp_input_address = + ReadSetting(QStringLiteral("udp_input_address"), + QString::fromUtf8(InputCommon::CemuhookUDP::DEFAULT_ADDR)) + .toString() + .toStdString(); + Settings::values.udp_input_port = static_cast<u16>( + 
ReadSetting(QStringLiteral("udp_input_port"), InputCommon::CemuhookUDP::DEFAULT_PORT) + .toInt()); + Settings::values.udp_pad_index = + static_cast<u8>(ReadSetting(QStringLiteral("udp_pad_index"), 0).toUInt()); qt_config->endGroup(); } @@ -613,8 +624,13 @@ void Config::ReadPathValues() { void Config::ReadRendererValues() { qt_config->beginGroup(QStringLiteral("Renderer")); + Settings::values.renderer_backend = + static_cast<Settings::RendererBackend>(ReadSetting(QStringLiteral("backend"), 0).toInt()); + Settings::values.renderer_debug = ReadSetting(QStringLiteral("debug"), false).toBool(); + Settings::values.vulkan_device = ReadSetting(QStringLiteral("vulkan_device"), 0).toInt(); Settings::values.resolution_factor = ReadSetting(QStringLiteral("resolution_factor"), 1.0).toFloat(); + Settings::values.aspect_ratio = ReadSetting(QStringLiteral("aspect_ratio"), 0).toInt(); Settings::values.use_frame_limit = ReadSetting(QStringLiteral("use_frame_limit"), true).toBool(); Settings::values.frame_limit = ReadSetting(QStringLiteral("frame_limit"), 100).toInt(); @@ -727,7 +743,6 @@ void Config::ReadUIValues() { void Config::ReadUIGamelistValues() { qt_config->beginGroup(QStringLiteral("UIGameList")); - UISettings::values.show_unknown = ReadSetting(QStringLiteral("show_unknown"), true).toBool(); UISettings::values.show_add_ons = ReadSetting(QStringLiteral("show_add_ons"), true).toBool(); UISettings::values.icon_size = ReadSetting(QStringLiteral("icon_size"), 64).toUInt(); UISettings::values.row_1_text_id = ReadSetting(QStringLiteral("row_1_text_id"), 3).toUInt(); @@ -911,6 +926,12 @@ void Config::SaveControlValues() { QString::fromStdString(Settings::values.motion_device), QStringLiteral("engine:motion_emu,update_period:100,sensitivity:0.01")); WriteSetting(QStringLiteral("keyboard_enabled"), Settings::values.keyboard_enabled, false); + WriteSetting(QStringLiteral("udp_input_address"), + QString::fromStdString(Settings::values.udp_input_address), + QString::fromUtf8(InputCommon::CemuhookUDP::DEFAULT_ADDR)); + WriteSetting(QStringLiteral("udp_input_port"), Settings::values.udp_input_port, + InputCommon::CemuhookUDP::DEFAULT_PORT); + WriteSetting(QStringLiteral("udp_pad_index"), Settings::values.udp_pad_index, 0); qt_config->endGroup(); } @@ -1039,8 +1060,12 @@ void Config::SavePathValues() { void Config::SaveRendererValues() { qt_config->beginGroup(QStringLiteral("Renderer")); + WriteSetting(QStringLiteral("backend"), static_cast<int>(Settings::values.renderer_backend), 0); + WriteSetting(QStringLiteral("debug"), Settings::values.renderer_debug, false); + WriteSetting(QStringLiteral("vulkan_device"), Settings::values.vulkan_device, 0); WriteSetting(QStringLiteral("resolution_factor"), static_cast<double>(Settings::values.resolution_factor), 1.0); + WriteSetting(QStringLiteral("aspect_ratio"), Settings::values.aspect_ratio, 0); WriteSetting(QStringLiteral("use_frame_limit"), Settings::values.use_frame_limit, true); WriteSetting(QStringLiteral("frame_limit"), Settings::values.frame_limit, 100); WriteSetting(QStringLiteral("use_disk_shader_cache"), Settings::values.use_disk_shader_cache, @@ -1135,7 +1160,6 @@ void Config::SaveUIValues() { void Config::SaveUIGamelistValues() { qt_config->beginGroup(QStringLiteral("UIGameList")); - WriteSetting(QStringLiteral("show_unknown"), UISettings::values.show_unknown, true); WriteSetting(QStringLiteral("show_add_ons"), UISettings::values.show_add_ons, true); WriteSetting(QStringLiteral("icon_size"), UISettings::values.icon_size, 64); 
WriteSetting(QStringLiteral("row_1_text_id"), UISettings::values.row_1_text_id, 3); diff --git a/src/yuzu/configuration/configure.ui b/src/yuzu/configuration/configure.ui index 372427ae2..67b990f1a 100644 --- a/src/yuzu/configuration/configure.ui +++ b/src/yuzu/configuration/configure.ui @@ -48,7 +48,7 @@ <string>General</string> </attribute> </widget> - <widget class="ConfigureGameList" name="gameListTab"> + <widget class="ConfigureUi" name="uiTab"> <attribute name="title"> <string>Game List</string> </attribute> @@ -166,9 +166,9 @@ <container>1</container> </customwidget> <customwidget> - <class>ConfigureGameList</class> + <class>ConfigureUi</class> <extends>QWidget</extends> - <header>configuration/configure_gamelist.h</header> + <header>configuration/configure_ui.h</header> <container>1</container> </customwidget> <customwidget> diff --git a/src/yuzu/configuration/configure_debug.cpp b/src/yuzu/configuration/configure_debug.cpp index 90c1f9459..9631059c7 100644 --- a/src/yuzu/configuration/configure_debug.cpp +++ b/src/yuzu/configuration/configure_debug.cpp @@ -36,6 +36,8 @@ void ConfigureDebug::SetConfiguration() { ui->homebrew_args_edit->setText(QString::fromStdString(Settings::values.program_args)); ui->reporting_services->setChecked(Settings::values.reporting_services); ui->quest_flag->setChecked(Settings::values.quest_flag); + ui->enable_graphics_debugging->setEnabled(!Core::System::GetInstance().IsPoweredOn()); + ui->enable_graphics_debugging->setChecked(Settings::values.renderer_debug); } void ConfigureDebug::ApplyConfiguration() { @@ -46,6 +48,7 @@ void ConfigureDebug::ApplyConfiguration() { Settings::values.program_args = ui->homebrew_args_edit->text().toStdString(); Settings::values.reporting_services = ui->reporting_services->isChecked(); Settings::values.quest_flag = ui->quest_flag->isChecked(); + Settings::values.renderer_debug = ui->enable_graphics_debugging->isChecked(); Debugger::ToggleConsole(); Log::Filter filter; filter.ParseFilterString(Settings::values.log_filter); diff --git a/src/yuzu/configuration/configure_debug.ui b/src/yuzu/configuration/configure_debug.ui index ce49569bb..e028c4c80 100644 --- a/src/yuzu/configuration/configure_debug.ui +++ b/src/yuzu/configuration/configure_debug.ui @@ -7,7 +7,7 @@ <x>0</x> <y>0</y> <width>400</width> - <height>474</height> + <height>467</height> </rect> </property> <property name="windowTitle"> @@ -103,6 +103,80 @@ </item> </layout> </item> + </layout> + </widget> + </item> + <item> + <widget class="QGroupBox" name="groupBox_3"> + <property name="title"> + <string>Homebrew</string> + </property> + <layout class="QVBoxLayout" name="verticalLayout_5"> + <item> + <layout class="QHBoxLayout" name="horizontalLayout_4"> + <item> + <widget class="QLabel" name="label_3"> + <property name="text"> + <string>Arguments String</string> + </property> + </widget> + </item> + <item> + <widget class="QLineEdit" name="homebrew_args_edit"/> + </item> + </layout> + </item> + </layout> + </widget> + </item> + <item> + <widget class="QGroupBox" name="groupBox_4"> + <property name="title"> + <string>Graphics</string> + </property> + <layout class="QVBoxLayout" name="verticalLayout_6"> + <item> + <widget class="QCheckBox" name="enable_graphics_debugging"> + <property name="enabled"> + <bool>true</bool> + </property> + <property name="whatsThis"> + <string>When checked, the graphics API enters in a slower debugging mode</string> + </property> + <property name="text"> + <string>Enable Graphics Debugging</string> + </property> + </widget> + </item> + 
</layout> + </widget> + </item> + <item> + <widget class="QGroupBox" name="groupBox_5"> + <property name="title"> + <string>Dump</string> + </property> + <layout class="QVBoxLayout" name="verticalLayout_6"> + <item> + <widget class="QCheckBox" name="dump_decompressed_nso"> + <property name="whatsThis"> + <string>When checked, any NSO yuzu tries to load or patch will be copied decompressed to the yuzu/dump directory.</string> + </property> + <property name="text"> + <string>Dump Decompressed NSOs</string> + </property> + </widget> + </item> + <item> + <widget class="QCheckBox" name="dump_exefs"> + <property name="whatsThis"> + <string>When checked, any game that yuzu loads will have its ExeFS dumped to the yuzu/dump directory.</string> + </property> + <property name="text"> + <string>Dump ExeFS</string> + </property> + </widget> + </item> <item> <widget class="QCheckBox" name="reporting_services"> <property name="text"> @@ -129,11 +203,11 @@ </widget> </item> <item> - <widget class="QGroupBox" name="groupBox_5"> + <widget class="QGroupBox" name="groupBox_6"> <property name="title"> <string>Advanced</string> </property> - <layout class="QVBoxLayout" name="verticalLayout"> + <layout class="QVBoxLayout" name="verticalLayout_7"> <item> <widget class="QCheckBox" name="quest_flag"> <property name="text"> @@ -145,29 +219,6 @@ </widget> </item> <item> - <widget class="QGroupBox" name="groupBox_3"> - <property name="title"> - <string>Homebrew</string> - </property> - <layout class="QVBoxLayout" name="verticalLayout_5"> - <item> - <layout class="QHBoxLayout" name="horizontalLayout_4"> - <item> - <widget class="QLabel" name="label_3"> - <property name="text"> - <string>Arguments String</string> - </property> - </widget> - </item> - <item> - <widget class="QLineEdit" name="homebrew_args_edit"/> - </item> - </layout> - </item> - </layout> - </widget> - </item> - <item> <spacer name="verticalSpacer"> <property name="orientation"> <enum>Qt::Vertical</enum> @@ -185,6 +236,19 @@ </item> </layout> </widget> + <tabstops> + <tabstop>toggle_gdbstub</tabstop> + <tabstop>gdbport_spinbox</tabstop> + <tabstop>log_filter_edit</tabstop> + <tabstop>toggle_console</tabstop> + <tabstop>open_log_button</tabstop> + <tabstop>homebrew_args_edit</tabstop> + <tabstop>enable_graphics_debugging</tabstop> + <tabstop>dump_decompressed_nso</tabstop> + <tabstop>dump_exefs</tabstop> + <tabstop>reporting_services</tabstop> + <tabstop>quest_flag</tabstop> + </tabstops> <resources/> <connections> <connection> diff --git a/src/yuzu/configuration/configure_dialog.cpp b/src/yuzu/configuration/configure_dialog.cpp index 8497eaa14..db3b19352 100644 --- a/src/yuzu/configuration/configure_dialog.cpp +++ b/src/yuzu/configuration/configure_dialog.cpp @@ -34,7 +34,7 @@ void ConfigureDialog::SetConfiguration() {} void ConfigureDialog::ApplyConfiguration() { ui->generalTab->ApplyConfiguration(); - ui->gameListTab->ApplyConfiguration(); + ui->uiTab->ApplyConfiguration(); ui->systemTab->ApplyConfiguration(); ui->profileManagerTab->ApplyConfiguration(); ui->filesystemTab->applyConfiguration(); @@ -74,7 +74,7 @@ Q_DECLARE_METATYPE(QList<QWidget*>); void ConfigureDialog::PopulateSelectionList() { const std::array<std::pair<QString, QList<QWidget*>>, 5> items{ - {{tr("General"), {ui->generalTab, ui->webTab, ui->debugTab, ui->gameListTab}}, + {{tr("General"), {ui->generalTab, ui->webTab, ui->debugTab, ui->uiTab}}, {tr("System"), {ui->systemTab, ui->profileManagerTab, ui->serviceTab, ui->filesystemTab}}, {tr("Graphics"), {ui->graphicsTab}}, {tr("Audio"), 
{ui->audioTab}}, @@ -108,7 +108,7 @@ void ConfigureDialog::UpdateVisibleTabs() { {ui->audioTab, tr("Audio")}, {ui->debugTab, tr("Debug")}, {ui->webTab, tr("Web")}, - {ui->gameListTab, tr("Game List")}, + {ui->uiTab, tr("UI")}, {ui->filesystemTab, tr("Filesystem")}, {ui->serviceTab, tr("Services")}, }; diff --git a/src/yuzu/configuration/configure_general.cpp b/src/yuzu/configuration/configure_general.cpp index 34e1d7fea..5ef927114 100644 --- a/src/yuzu/configuration/configure_general.cpp +++ b/src/yuzu/configuration/configure_general.cpp @@ -15,11 +15,6 @@ ConfigureGeneral::ConfigureGeneral(QWidget* parent) ui->setupUi(this); - for (const auto& theme : UISettings::themes) { - ui->theme_combobox->addItem(QString::fromUtf8(theme.first), - QString::fromUtf8(theme.second)); - } - SetConfiguration(); connect(ui->toggle_frame_limit, &QCheckBox::toggled, ui->frame_limit, &QSpinBox::setEnabled); @@ -30,7 +25,6 @@ ConfigureGeneral::~ConfigureGeneral() = default; void ConfigureGeneral::SetConfiguration() { ui->toggle_check_exit->setChecked(UISettings::values.confirm_before_closing); ui->toggle_user_on_boot->setChecked(UISettings::values.select_user_on_boot); - ui->theme_combobox->setCurrentIndex(ui->theme_combobox->findData(UISettings::values.theme)); ui->toggle_background_pause->setChecked(UISettings::values.pause_when_in_background); ui->toggle_frame_limit->setChecked(Settings::values.use_frame_limit); @@ -41,8 +35,6 @@ void ConfigureGeneral::SetConfiguration() { void ConfigureGeneral::ApplyConfiguration() { UISettings::values.confirm_before_closing = ui->toggle_check_exit->isChecked(); UISettings::values.select_user_on_boot = ui->toggle_user_on_boot->isChecked(); - UISettings::values.theme = - ui->theme_combobox->itemData(ui->theme_combobox->currentIndex()).toString(); UISettings::values.pause_when_in_background = ui->toggle_background_pause->isChecked(); Settings::values.use_frame_limit = ui->toggle_frame_limit->isChecked(); diff --git a/src/yuzu/configuration/configure_general.ui b/src/yuzu/configuration/configure_general.ui index 26b3486ff..857119bb3 100644 --- a/src/yuzu/configuration/configure_general.ui +++ b/src/yuzu/configuration/configure_general.ui @@ -65,39 +65,12 @@ </property> </widget> </item> - <item> - <widget class="QCheckBox" name="toggle_background_pause"> - <property name="text"> - <string>Pause emulation when in background</string> - </property> - </widget> - </item> - </layout> - </item> - </layout> - </widget> - </item> - <item> - <widget class="QGroupBox" name="theme_group_box"> - <property name="title"> - <string>Theme</string> - </property> - <layout class="QHBoxLayout" name="theme_qhbox_layout"> - <item> - <layout class="QVBoxLayout" name="theme_qvbox_layout"> <item> - <layout class="QHBoxLayout" name="theme_qhbox_layout_2"> - <item> - <widget class="QLabel" name="theme_label"> - <property name="text"> - <string>Theme:</string> - </property> - </widget> - </item> - <item> - <widget class="QComboBox" name="theme_combobox"/> - </item> - </layout> + <widget class="QCheckBox" name="toggle_background_pause"> + <property name="text"> + <string>Pause emulation when in background</string> + </property> + </widget> </item> </layout> </item> diff --git a/src/yuzu/configuration/configure_graphics.cpp b/src/yuzu/configuration/configure_graphics.cpp index 2c9e322c9..ea899c080 100644 --- a/src/yuzu/configuration/configure_graphics.cpp +++ b/src/yuzu/configuration/configure_graphics.cpp @@ -3,6 +3,13 @@ // Refer to the license.txt file included. 
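Note: the configure_graphics.cpp hunks that follow treat the index of the new "API" combo box as the Settings::RendererBackend value itself. A minimal sketch of that round trip, with the helper names being illustrative rather than part of the change:

#include <QComboBox>
#include "core/settings.h"

// Write the stored backend into the combo box (index 0 = OpenGL, 1 = Vulkan, as in the .ui file).
void LoadBackendIntoCombo(QComboBox* api) {
    api->setCurrentIndex(static_cast<int>(Settings::values.renderer_backend));
}

// Read the selection back as the enum; this mirrors GetCurrentGraphicsBackend() below.
Settings::RendererBackend BackendFromCombo(const QComboBox* api) {
    return static_cast<Settings::RendererBackend>(api->currentIndex());
}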
#include <QColorDialog> +#include <QComboBox> +#ifdef HAS_VULKAN +#include <QVulkanInstance> +#endif + +#include "common/common_types.h" +#include "common/logging/log.h" #include "core/core.h" #include "core/settings.h" #include "ui_configure_graphics.h" @@ -51,10 +58,18 @@ Resolution FromResolutionFactor(float factor) { ConfigureGraphics::ConfigureGraphics(QWidget* parent) : QWidget(parent), ui(new Ui::ConfigureGraphics) { + vulkan_device = Settings::values.vulkan_device; + RetrieveVulkanDevices(); + ui->setupUi(this); SetConfiguration(); + connect(ui->api, static_cast<void (QComboBox::*)(int)>(&QComboBox::currentIndexChanged), this, + [this] { UpdateDeviceComboBox(); }); + connect(ui->device, static_cast<void (QComboBox::*)(int)>(&QComboBox::activated), this, + [this](int device) { UpdateDeviceSelection(device); }); + connect(ui->bg_button, &QPushButton::clicked, this, [this] { const QColor new_bg_color = QColorDialog::getColor(bg_color); if (!new_bg_color.isValid()) { @@ -64,13 +79,25 @@ ConfigureGraphics::ConfigureGraphics(QWidget* parent) }); } +void ConfigureGraphics::UpdateDeviceSelection(int device) { + if (device == -1) { + return; + } + if (GetCurrentGraphicsBackend() == Settings::RendererBackend::Vulkan) { + vulkan_device = device; + } +} + ConfigureGraphics::~ConfigureGraphics() = default; void ConfigureGraphics::SetConfiguration() { const bool runtime_lock = !Core::System::GetInstance().IsPoweredOn(); + ui->api->setEnabled(runtime_lock); + ui->api->setCurrentIndex(static_cast<int>(Settings::values.renderer_backend)); ui->resolution_factor_combobox->setCurrentIndex( static_cast<int>(FromResolutionFactor(Settings::values.resolution_factor))); + ui->aspect_ratio_combobox->setCurrentIndex(Settings::values.aspect_ratio); ui->use_disk_shader_cache->setEnabled(runtime_lock); ui->use_disk_shader_cache->setChecked(Settings::values.use_disk_shader_cache); ui->use_accurate_gpu_emulation->setChecked(Settings::values.use_accurate_gpu_emulation); @@ -80,11 +107,15 @@ void ConfigureGraphics::SetConfiguration() { ui->force_30fps_mode->setChecked(Settings::values.force_30fps_mode); UpdateBackgroundColorButton(QColor::fromRgbF(Settings::values.bg_red, Settings::values.bg_green, Settings::values.bg_blue)); + UpdateDeviceComboBox(); } void ConfigureGraphics::ApplyConfiguration() { + Settings::values.renderer_backend = GetCurrentGraphicsBackend(); + Settings::values.vulkan_device = vulkan_device; Settings::values.resolution_factor = ToResolutionFactor(static_cast<Resolution>(ui->resolution_factor_combobox->currentIndex())); + Settings::values.aspect_ratio = ui->aspect_ratio_combobox->currentIndex(); Settings::values.use_disk_shader_cache = ui->use_disk_shader_cache->isChecked(); Settings::values.use_accurate_gpu_emulation = ui->use_accurate_gpu_emulation->isChecked(); Settings::values.use_asynchronous_gpu_emulation = @@ -116,3 +147,68 @@ void ConfigureGraphics::UpdateBackgroundColorButton(QColor color) { const QIcon color_icon(pixmap); ui->bg_button->setIcon(color_icon); } + +void ConfigureGraphics::UpdateDeviceComboBox() { + ui->device->clear(); + + bool enabled = false; + switch (GetCurrentGraphicsBackend()) { + case Settings::RendererBackend::OpenGL: + ui->device->addItem(tr("OpenGL Graphics Device")); + enabled = false; + break; + case Settings::RendererBackend::Vulkan: + for (const auto device : vulkan_devices) { + ui->device->addItem(device); + } + ui->device->setCurrentIndex(vulkan_device); + enabled = !vulkan_devices.empty(); + break; + } + ui->device->setEnabled(enabled && 
!Core::System::GetInstance().IsPoweredOn()); +} + +void ConfigureGraphics::RetrieveVulkanDevices() { +#ifdef HAS_VULKAN + QVulkanInstance instance; + instance.setApiVersion(QVersionNumber(1, 1, 0)); + if (!instance.create()) { + LOG_INFO(Frontend, "Vulkan 1.1 not available"); + return; + } + const auto vkEnumeratePhysicalDevices{reinterpret_cast<PFN_vkEnumeratePhysicalDevices>( + instance.getInstanceProcAddr("vkEnumeratePhysicalDevices"))}; + if (vkEnumeratePhysicalDevices == nullptr) { + LOG_INFO(Frontend, "Failed to get pointer to vkEnumeratePhysicalDevices"); + return; + } + u32 physical_device_count; + if (vkEnumeratePhysicalDevices(instance.vkInstance(), &physical_device_count, nullptr) != + VK_SUCCESS) { + LOG_INFO(Frontend, "Failed to get physical devices count"); + return; + } + std::vector<VkPhysicalDevice> physical_devices(physical_device_count); + if (vkEnumeratePhysicalDevices(instance.vkInstance(), &physical_device_count, + physical_devices.data()) != VK_SUCCESS) { + LOG_INFO(Frontend, "Failed to get physical devices"); + return; + } + + const auto vkGetPhysicalDeviceProperties{reinterpret_cast<PFN_vkGetPhysicalDeviceProperties>( + instance.getInstanceProcAddr("vkGetPhysicalDeviceProperties"))}; + if (vkGetPhysicalDeviceProperties == nullptr) { + LOG_INFO(Frontend, "Failed to get pointer to vkGetPhysicalDeviceProperties"); + return; + } + for (const auto physical_device : physical_devices) { + VkPhysicalDeviceProperties properties; + vkGetPhysicalDeviceProperties(physical_device, &properties); + vulkan_devices.push_back(QString::fromUtf8(properties.deviceName)); + } +#endif +} + +Settings::RendererBackend ConfigureGraphics::GetCurrentGraphicsBackend() const { + return static_cast<Settings::RendererBackend>(ui->api->currentIndex()); +} diff --git a/src/yuzu/configuration/configure_graphics.h b/src/yuzu/configuration/configure_graphics.h index fae28d98e..7e0596d9c 100644 --- a/src/yuzu/configuration/configure_graphics.h +++ b/src/yuzu/configuration/configure_graphics.h @@ -5,7 +5,10 @@ #pragma once #include <memory> +#include <vector> +#include <QString> #include <QWidget> +#include "core/settings.h" namespace Ui { class ConfigureGraphics; @@ -27,7 +30,16 @@ private: void SetConfiguration(); void UpdateBackgroundColorButton(QColor color); + void UpdateDeviceComboBox(); + void UpdateDeviceSelection(int device); + + void RetrieveVulkanDevices(); + + Settings::RendererBackend GetCurrentGraphicsBackend() const; std::unique_ptr<Ui::ConfigureGraphics> ui; QColor bg_color; + + std::vector<QString> vulkan_devices; + u32 vulkan_device{}; }; diff --git a/src/yuzu/configuration/configure_graphics.ui b/src/yuzu/configuration/configure_graphics.ui index 0309ee300..db60426ab 100644 --- a/src/yuzu/configuration/configure_graphics.ui +++ b/src/yuzu/configuration/configure_graphics.ui @@ -7,21 +7,69 @@ <x>0</x> <y>0</y> <width>400</width> - <height>300</height> + <height>321</height> </rect> </property> <property name="windowTitle"> <string>Form</string> </property> - <layout class="QVBoxLayout" name="verticalLayout"> + <layout class="QVBoxLayout" name="verticalLayout_1"> <item> - <layout class="QVBoxLayout" name="verticalLayout_3"> + <layout class="QVBoxLayout" name="verticalLayout_2"> + <item> + <widget class="QGroupBox" name="groupBox_2"> + <property name="title"> + <string>API Settings</string> + </property> + <layout class="QVBoxLayout" name="verticalLayout_3"> + <item> + <layout class="QHBoxLayout" name="horizontalLayout_4"> + <item> + <widget class="QLabel" name="label_2"> + <property 
name="text"> + <string>API:</string> + </property> + </widget> + </item> + <item> + <widget class="QComboBox" name="api"> + <item> + <property name="text"> + <string notr="true">OpenGL</string> + </property> + </item> + <item> + <property name="text"> + <string notr="true">Vulkan</string> + </property> + </item> + </widget> + </item> + </layout> + </item> + <item> + <layout class="QHBoxLayout" name="horizontalLayout_5"> + <item> + <widget class="QLabel" name="label_3"> + <property name="text"> + <string>Device:</string> + </property> + </widget> + </item> + <item> + <widget class="QComboBox" name="device"/> + </item> + </layout> + </item> + </layout> + </widget> + </item> <item> <widget class="QGroupBox" name="groupBox"> <property name="title"> - <string>Graphics</string> + <string>Graphics Settings</string> </property> - <layout class="QVBoxLayout" name="verticalLayout_2"> + <layout class="QVBoxLayout" name="verticalLayout_4"> <item> <widget class="QCheckBox" name="use_disk_shader_cache"> <property name="text"> @@ -30,16 +78,16 @@ </widget> </item> <item> - <widget class="QCheckBox" name="use_accurate_gpu_emulation"> + <widget class="QCheckBox" name="use_asynchronous_gpu_emulation"> <property name="text"> - <string>Use accurate GPU emulation (slow)</string> + <string>Use asynchronous GPU emulation</string> </property> </widget> </item> <item> - <widget class="QCheckBox" name="use_asynchronous_gpu_emulation"> + <widget class="QCheckBox" name="use_accurate_gpu_emulation"> <property name="text"> - <string>Use asynchronous GPU emulation</string> + <string>Use accurate GPU emulation (slow)</string> </property> </widget> </item> @@ -51,11 +99,11 @@ </widget> </item> <item> - <layout class="QHBoxLayout" name="horizontalLayout"> + <layout class="QHBoxLayout" name="horizontalLayout_2"> <item> <widget class="QLabel" name="label"> <property name="text"> - <string>Internal Resolution</string> + <string>Internal Resolution:</string> </property> </widget> </item> @@ -93,6 +141,41 @@ <item> <layout class="QHBoxLayout" name="horizontalLayout_6"> <item> + <widget class="QLabel" name="ar_label"> + <property name="text"> + <string>Aspect Ratio:</string> + </property> + </widget> + </item> + <item> + <widget class="QComboBox" name="aspect_ratio_combobox"> + <item> + <property name="text"> + <string>Default (16:9)</string> + </property> + </item> + <item> + <property name="text"> + <string>Force 4:3</string> + </property> + </item> + <item> + <property name="text"> + <string>Force 21:9</string> + </property> + </item> + <item> + <property name="text"> + <string>Stretch to Window</string> + </property> + </item> + </widget> + </item> + </layout> + </item> + <item> + <layout class="QHBoxLayout" name="horizontalLayout_3"> + <item> <widget class="QLabel" name="bg_label"> <property name="text"> <string>Background Color:</string> diff --git a/src/yuzu/configuration/configure_input_player.cpp b/src/yuzu/configuration/configure_input_player.cpp index 67c9a7c6d..96dec50e2 100644 --- a/src/yuzu/configuration/configure_input_player.cpp +++ b/src/yuzu/configuration/configure_input_player.cpp @@ -236,6 +236,8 @@ ConfigureInputPlayer::ConfigureInputPlayer(QWidget* parent, std::size_t player_i widget->setVisible(false); analog_map_stick = {ui->buttonLStickAnalog, ui->buttonRStickAnalog}; + analog_map_deadzone = {ui->sliderLStickDeadzone, ui->sliderRStickDeadzone}; + analog_map_deadzone_label = {ui->labelLStickDeadzone, ui->labelRStickDeadzone}; for (int button_id = 0; button_id < Settings::NativeButton::NumButtons; 
button_id++) { auto* const button = button_map[button_id]; @@ -326,6 +328,11 @@ ConfigureInputPlayer::ConfigureInputPlayer(QWidget* parent, std::size_t player_i InputCommon::Polling::DeviceType::Analog); } }); + connect(analog_map_deadzone[analog_id], &QSlider::valueChanged, [=] { + const float deadzone = analog_map_deadzone[analog_id]->value() / 100.0f; + analog_map_deadzone_label[analog_id]->setText(tr("Deadzone: %1").arg(deadzone)); + analogs_param[analog_id].Set("deadzone", deadzone); + }); } connect(ui->buttonClearAll, &QPushButton::clicked, [this] { ClearAll(); }); @@ -484,7 +491,7 @@ void ConfigureInputPlayer::ClearAll() { continue; } - analogs_param[analog_id].Erase(analog_sub_buttons[sub_button_id]); + analogs_param[analog_id].Clear(); } } @@ -508,6 +515,23 @@ void ConfigureInputPlayer::UpdateButtonLabels() { AnalogToText(analogs_param[analog_id], analog_sub_buttons[sub_button_id])); } analog_map_stick[analog_id]->setText(tr("Set Analog Stick")); + + auto& param = analogs_param[analog_id]; + auto* const analog_deadzone_slider = analog_map_deadzone[analog_id]; + auto* const analog_deadzone_label = analog_map_deadzone_label[analog_id]; + + if (param.Has("engine") && param.Get("engine", "") == "sdl") { + if (!param.Has("deadzone")) { + param.Set("deadzone", 0.1f); + } + + analog_deadzone_slider->setValue(static_cast<int>(param.Get("deadzone", 0.1f) * 100)); + analog_deadzone_slider->setVisible(true); + analog_deadzone_label->setVisible(true); + } else { + analog_deadzone_slider->setVisible(false); + analog_deadzone_label->setVisible(false); + } } } diff --git a/src/yuzu/configuration/configure_input_player.h b/src/yuzu/configuration/configure_input_player.h index c66027651..045704e47 100644 --- a/src/yuzu/configuration/configure_input_player.h +++ b/src/yuzu/configuration/configure_input_player.h @@ -97,6 +97,8 @@ private: /// Analog inputs are also represented each with a single button, used to configure with an /// actual analog stick std::array<QPushButton*, Settings::NativeAnalog::NumAnalogs> analog_map_stick; + std::array<QSlider*, Settings::NativeAnalog::NumAnalogs> analog_map_deadzone; + std::array<QLabel*, Settings::NativeAnalog::NumAnalogs> analog_map_deadzone_label; static const std::array<std::string, ANALOG_SUB_BUTTONS_NUM> analog_sub_buttons; diff --git a/src/yuzu/configuration/configure_input_player.ui b/src/yuzu/configuration/configure_input_player.ui index 42db020be..1556481d0 100644 --- a/src/yuzu/configuration/configure_input_player.ui +++ b/src/yuzu/configuration/configure_input_player.ui @@ -170,6 +170,44 @@ </item> </layout> </item> + <item row="4" column="0" colspan="2"> + <layout class="QVBoxLayout" name="sliderRStickDeadzoneVerticalLayout"> + <item> + <layout class="QHBoxLayout" name="sliderRStickDeadzoneHorizontalLayout"> + <item> + <widget class="QLabel" name="labelRStickDeadzone"> + <property name="text"> + <string>Deadzone: 0</string> + </property> + <property name="alignment"> + <enum>Qt::AlignHCenter</enum> + </property> + </widget> + </item> + </layout> + </item> + <item> + <widget class="QSlider" name="sliderRStickDeadzone"> + <property name="orientation"> + <enum>Qt::Horizontal</enum> + </property> + </widget> + </item> + </layout> + </item> + <item row="5" column="0"> + <spacer name="RStick_verticalSpacer"> + <property name="orientation"> + <enum>Qt::Vertical</enum> + </property> + <property name="sizeHint" stdset="0"> + <size> + <width>0</width> + <height>0</height> + </size> + </property> + </spacer> + </item> </layout> </widget> </item> @@ -745,6 
+783,47 @@ </item> </layout> </item> + <item row="5" column="1" colspan="2"> + <layout class="QVBoxLayout" name="sliderLStickDeadzoneVerticalLayout"> + <property name="sizeConstraint"> + <enum>QLayout::SetDefaultConstraint</enum> + </property> + <item> + <layout class="QHBoxLayout" name="sliderLStickDeadzoneHorizontalLayout"> + <item> + <widget class="QLabel" name="labelLStickDeadzone"> + <property name="text"> + <string>Deadzone: 0</string> + </property> + <property name="alignment"> + <enum>Qt::AlignHCenter</enum> + </property> + </widget> + </item> + </layout> + </item> + <item> + <widget class="QSlider" name="sliderLStickDeadzone"> + <property name="orientation"> + <enum>Qt::Horizontal</enum> + </property> + </widget> + </item> + </layout> + </item> + <item row="6" column="1"> + <spacer name="LStick_verticalSpacer"> + <property name="orientation"> + <enum>Qt::Vertical</enum> + </property> + <property name="sizeHint" stdset="0"> + <size> + <width>0</width> + <height>0</height> + </size> + </property> + </spacer> + </item> </layout> </widget> </item> diff --git a/src/yuzu/configuration/configure_gamelist.cpp b/src/yuzu/configuration/configure_ui.cpp index e43e84d39..94424ee44 100644 --- a/src/yuzu/configuration/configure_gamelist.cpp +++ b/src/yuzu/configuration/configure_ui.cpp @@ -7,8 +7,8 @@ #include "common/common_types.h" #include "core/settings.h" -#include "ui_configure_gamelist.h" -#include "yuzu/configuration/configure_gamelist.h" +#include "ui_configure_ui.h" +#include "yuzu/configuration/configure_ui.h" #include "yuzu/uisettings.h" namespace { @@ -26,36 +26,39 @@ constexpr std::array row_text_names{ }; } // Anonymous namespace -ConfigureGameList::ConfigureGameList(QWidget* parent) - : QWidget(parent), ui(new Ui::ConfigureGameList) { +ConfigureUi::ConfigureUi(QWidget* parent) : QWidget(parent), ui(new Ui::ConfigureUi) { ui->setupUi(this); + for (const auto& theme : UISettings::themes) { + ui->theme_combobox->addItem(QString::fromUtf8(theme.first), + QString::fromUtf8(theme.second)); + } + InitializeIconSizeComboBox(); InitializeRowComboBoxes(); SetConfiguration(); // Force game list reload if any of the relevant settings are changed. - connect(ui->show_unknown, &QCheckBox::stateChanged, this, - &ConfigureGameList::RequestGameListUpdate); connect(ui->icon_size_combobox, QOverload<int>::of(&QComboBox::currentIndexChanged), this, - &ConfigureGameList::RequestGameListUpdate); + &ConfigureUi::RequestGameListUpdate); connect(ui->row_1_text_combobox, QOverload<int>::of(&QComboBox::currentIndexChanged), this, - &ConfigureGameList::RequestGameListUpdate); + &ConfigureUi::RequestGameListUpdate); connect(ui->row_2_text_combobox, QOverload<int>::of(&QComboBox::currentIndexChanged), this, - &ConfigureGameList::RequestGameListUpdate); + &ConfigureUi::RequestGameListUpdate); // Update text ComboBoxes after user interaction. 
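Note: the new deadzone sliders in configure_input_player.cpp above store a 0.0 to 1.0 "deadzone" value into the analog ParamPackage (defaulting to 0.1 for SDL devices). How a backend applies that value is not part of this diff; a minimal sketch of the usual radial dead-zone math, under that assumption:

#include <algorithm>
#include <cmath>
#include <utility>

// Returns a filtered (x, y) stick position given a raw position in [-1, 1] and a deadzone in [0, 1).
std::pair<float, float> ApplyDeadzone(float x, float y, float deadzone) {
    const float magnitude = std::sqrt(x * x + y * y);
    if (magnitude < deadzone) {
        return {0.0f, 0.0f}; // inside the dead zone: report a centered stick
    }
    // Rescale so the output ramps from 0 just outside the dead zone up to 1 at full deflection.
    const float scale = (magnitude - deadzone) / (1.0f - deadzone) / magnitude;
    return {std::clamp(x * scale, -1.0f, 1.0f), std::clamp(y * scale, -1.0f, 1.0f)};
}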
connect(ui->row_1_text_combobox, QOverload<int>::of(&QComboBox::activated), - [=]() { ConfigureGameList::UpdateSecondRowComboBox(); }); + [=]() { ConfigureUi::UpdateSecondRowComboBox(); }); connect(ui->row_2_text_combobox, QOverload<int>::of(&QComboBox::activated), - [=]() { ConfigureGameList::UpdateFirstRowComboBox(); }); + [=]() { ConfigureUi::UpdateFirstRowComboBox(); }); } -ConfigureGameList::~ConfigureGameList() = default; +ConfigureUi::~ConfigureUi() = default; -void ConfigureGameList::ApplyConfiguration() { - UISettings::values.show_unknown = ui->show_unknown->isChecked(); +void ConfigureUi::ApplyConfiguration() { + UISettings::values.theme = + ui->theme_combobox->itemData(ui->theme_combobox->currentIndex()).toString(); UISettings::values.show_add_ons = ui->show_add_ons->isChecked(); UISettings::values.icon_size = ui->icon_size_combobox->currentData().toUInt(); UISettings::values.row_1_text_id = ui->row_1_text_combobox->currentData().toUInt(); @@ -63,18 +66,18 @@ void ConfigureGameList::ApplyConfiguration() { Settings::Apply(); } -void ConfigureGameList::RequestGameListUpdate() { +void ConfigureUi::RequestGameListUpdate() { UISettings::values.is_game_list_reload_pending.exchange(true); } -void ConfigureGameList::SetConfiguration() { - ui->show_unknown->setChecked(UISettings::values.show_unknown); +void ConfigureUi::SetConfiguration() { + ui->theme_combobox->setCurrentIndex(ui->theme_combobox->findData(UISettings::values.theme)); ui->show_add_ons->setChecked(UISettings::values.show_add_ons); ui->icon_size_combobox->setCurrentIndex( ui->icon_size_combobox->findData(UISettings::values.icon_size)); } -void ConfigureGameList::changeEvent(QEvent* event) { +void ConfigureUi::changeEvent(QEvent* event) { if (event->type() == QEvent::LanguageChange) { RetranslateUI(); } @@ -82,7 +85,7 @@ void ConfigureGameList::changeEvent(QEvent* event) { QWidget::changeEvent(event); } -void ConfigureGameList::RetranslateUI() { +void ConfigureUi::RetranslateUI() { ui->retranslateUi(this); for (int i = 0; i < ui->icon_size_combobox->count(); i++) { @@ -97,18 +100,18 @@ void ConfigureGameList::RetranslateUI() { } } -void ConfigureGameList::InitializeIconSizeComboBox() { +void ConfigureUi::InitializeIconSizeComboBox() { for (const auto& size : default_icon_sizes) { ui->icon_size_combobox->addItem(QString::fromUtf8(size.second), size.first); } } -void ConfigureGameList::InitializeRowComboBoxes() { +void ConfigureUi::InitializeRowComboBoxes() { UpdateFirstRowComboBox(true); UpdateSecondRowComboBox(true); } -void ConfigureGameList::UpdateFirstRowComboBox(bool init) { +void ConfigureUi::UpdateFirstRowComboBox(bool init) { const int currentIndex = init ? UISettings::values.row_1_text_id : ui->row_1_text_combobox->findData(ui->row_1_text_combobox->currentData()); @@ -127,7 +130,7 @@ void ConfigureGameList::UpdateFirstRowComboBox(bool init) { ui->row_1_text_combobox->findData(ui->row_2_text_combobox->currentData())); } -void ConfigureGameList::UpdateSecondRowComboBox(bool init) { +void ConfigureUi::UpdateSecondRowComboBox(bool init) { const int currentIndex = init ? 
UISettings::values.row_2_text_id : ui->row_2_text_combobox->findData(ui->row_2_text_combobox->currentData()); diff --git a/src/yuzu/configuration/configure_gamelist.h b/src/yuzu/configuration/configure_ui.h index ecd3fa174..d471afe99 100644 --- a/src/yuzu/configuration/configure_gamelist.h +++ b/src/yuzu/configuration/configure_ui.h @@ -8,15 +8,15 @@ #include <QWidget> namespace Ui { -class ConfigureGameList; +class ConfigureUi; } -class ConfigureGameList : public QWidget { +class ConfigureUi : public QWidget { Q_OBJECT public: - explicit ConfigureGameList(QWidget* parent = nullptr); - ~ConfigureGameList() override; + explicit ConfigureUi(QWidget* parent = nullptr); + ~ConfigureUi() override; void ApplyConfiguration(); @@ -34,5 +34,5 @@ private: void UpdateFirstRowComboBox(bool init = false); void UpdateSecondRowComboBox(bool init = false); - std::unique_ptr<Ui::ConfigureGameList> ui; + std::unique_ptr<Ui::ConfigureUi> ui; }; diff --git a/src/yuzu/configuration/configure_gamelist.ui b/src/yuzu/configuration/configure_ui.ui index 7a69377e7..bd5c5d3c2 100644 --- a/src/yuzu/configuration/configure_gamelist.ui +++ b/src/yuzu/configuration/configure_ui.ui @@ -1,7 +1,7 @@ <?xml version="1.0" encoding="UTF-8"?> <ui version="4.0"> - <class>ConfigureGameList</class> - <widget class="QWidget" name="ConfigureGameList"> + <class>ConfigureUi</class> + <widget class="QWidget" name="ConfigureUi"> <property name="geometry"> <rect> <x>0</x> @@ -21,22 +21,22 @@ <property name="title"> <string>General</string> </property> - <layout class="QHBoxLayout" name="GeneralHorizontalLayout"> + <layout class="QHBoxLayout" name="horizontalLayout"> <item> - <layout class="QVBoxLayout" name="GeneralVerticalLayout"> + <layout class="QVBoxLayout" name="verticalLayout"> <item> - <widget class="QCheckBox" name="show_unknown"> - <property name="text"> - <string>Show files with type 'Unknown'</string> - </property> - </widget> - </item> - <item> - <widget class="QCheckBox" name="show_add_ons"> - <property name="text"> - <string>Show Add-Ons Column</string> - </property> - </widget> + <layout class="QHBoxLayout" name="horizontalLayout_3"> + <item> + <widget class="QLabel" name="theme_label"> + <property name="text"> + <string>Theme:</string> + </property> + </widget> + </item> + <item> + <widget class="QComboBox" name="theme_combobox"/> + </item> + </layout> </item> </layout> </item> @@ -44,13 +44,20 @@ </widget> </item> <item> - <widget class="QGroupBox" name="IconSizeGroupBox"> + <widget class="QGroupBox" name="GameListGroupBox"> <property name="title"> - <string>Icon Size</string> + <string>Game List</string> </property> - <layout class="QHBoxLayout" name="icon_size_qhbox_layout"> + <layout class="QHBoxLayout" name="GameListHorizontalLayout"> <item> - <layout class="QVBoxLayout" name="icon_size_qvbox_layout"> + <layout class="QVBoxLayout" name="GeneralVerticalLayout"> + <item> + <widget class="QCheckBox" name="show_add_ons"> + <property name="text"> + <string>Show Add-Ons Column</string> + </property> + </widget> + </item> <item> <layout class="QHBoxLayout" name="icon_size_qhbox_layout_2"> <item> @@ -65,19 +72,6 @@ </item> </layout> </item> - </layout> - </item> - </layout> - </widget> - </item> - <item> - <widget class="QGroupBox" name="RowGroupBox"> - <property name="title"> - <string>Row Text</string> - </property> - <layout class="QHBoxLayout" name="RowHorizontalLayout"> - <item> - <layout class="QVBoxLayout" name="RowVerticalLayout"> <item> <layout class="QHBoxLayout" name="row_1_qhbox_layout"> <item> diff --git 
a/src/yuzu/debugger/wait_tree.cpp b/src/yuzu/debugger/wait_tree.cpp index 727bd8a94..3f1a94627 100644 --- a/src/yuzu/debugger/wait_tree.cpp +++ b/src/yuzu/debugger/wait_tree.cpp @@ -12,8 +12,8 @@ #include "core/hle/kernel/process.h" #include "core/hle/kernel/readable_event.h" #include "core/hle/kernel/scheduler.h" +#include "core/hle/kernel/synchronization_object.h" #include "core/hle/kernel/thread.h" -#include "core/hle/kernel/wait_object.h" #include "core/memory.h" WaitTreeItem::WaitTreeItem() = default; @@ -133,8 +133,9 @@ std::vector<std::unique_ptr<WaitTreeItem>> WaitTreeCallstack::GetChildren() cons return list; } -WaitTreeWaitObject::WaitTreeWaitObject(const Kernel::WaitObject& o) : object(o) {} -WaitTreeWaitObject::~WaitTreeWaitObject() = default; +WaitTreeSynchronizationObject::WaitTreeSynchronizationObject(const Kernel::SynchronizationObject& o) + : object(o) {} +WaitTreeSynchronizationObject::~WaitTreeSynchronizationObject() = default; WaitTreeExpandableItem::WaitTreeExpandableItem() = default; WaitTreeExpandableItem::~WaitTreeExpandableItem() = default; @@ -143,25 +144,26 @@ bool WaitTreeExpandableItem::IsExpandable() const { return true; } -QString WaitTreeWaitObject::GetText() const { +QString WaitTreeSynchronizationObject::GetText() const { return tr("[%1]%2 %3") .arg(object.GetObjectId()) .arg(QString::fromStdString(object.GetTypeName()), QString::fromStdString(object.GetName())); } -std::unique_ptr<WaitTreeWaitObject> WaitTreeWaitObject::make(const Kernel::WaitObject& object) { +std::unique_ptr<WaitTreeSynchronizationObject> WaitTreeSynchronizationObject::make( + const Kernel::SynchronizationObject& object) { switch (object.GetHandleType()) { case Kernel::HandleType::ReadableEvent: return std::make_unique<WaitTreeEvent>(static_cast<const Kernel::ReadableEvent&>(object)); case Kernel::HandleType::Thread: return std::make_unique<WaitTreeThread>(static_cast<const Kernel::Thread&>(object)); default: - return std::make_unique<WaitTreeWaitObject>(object); + return std::make_unique<WaitTreeSynchronizationObject>(object); } } -std::vector<std::unique_ptr<WaitTreeItem>> WaitTreeWaitObject::GetChildren() const { +std::vector<std::unique_ptr<WaitTreeItem>> WaitTreeSynchronizationObject::GetChildren() const { std::vector<std::unique_ptr<WaitTreeItem>> list; const auto& threads = object.GetWaitingThreads(); @@ -173,8 +175,8 @@ std::vector<std::unique_ptr<WaitTreeItem>> WaitTreeWaitObject::GetChildren() con return list; } -WaitTreeObjectList::WaitTreeObjectList(const std::vector<std::shared_ptr<Kernel::WaitObject>>& list, - bool w_all) +WaitTreeObjectList::WaitTreeObjectList( + const std::vector<std::shared_ptr<Kernel::SynchronizationObject>>& list, bool w_all) : object_list(list), wait_all(w_all) {} WaitTreeObjectList::~WaitTreeObjectList() = default; @@ -188,11 +190,12 @@ QString WaitTreeObjectList::GetText() const { std::vector<std::unique_ptr<WaitTreeItem>> WaitTreeObjectList::GetChildren() const { std::vector<std::unique_ptr<WaitTreeItem>> list(object_list.size()); std::transform(object_list.begin(), object_list.end(), list.begin(), - [](const auto& t) { return WaitTreeWaitObject::make(*t); }); + [](const auto& t) { return WaitTreeSynchronizationObject::make(*t); }); return list; } -WaitTreeThread::WaitTreeThread(const Kernel::Thread& thread) : WaitTreeWaitObject(thread) {} +WaitTreeThread::WaitTreeThread(const Kernel::Thread& thread) + : WaitTreeSynchronizationObject(thread) {} WaitTreeThread::~WaitTreeThread() = default; QString WaitTreeThread::GetText() const { @@ -241,7 +244,8 
@@ QString WaitTreeThread::GetText() const { const QString pc_info = tr(" PC = 0x%1 LR = 0x%2") .arg(context.pc, 8, 16, QLatin1Char{'0'}) .arg(context.cpu_registers[30], 8, 16, QLatin1Char{'0'}); - return QStringLiteral("%1%2 (%3) ").arg(WaitTreeWaitObject::GetText(), pc_info, status); + return QStringLiteral("%1%2 (%3) ") + .arg(WaitTreeSynchronizationObject::GetText(), pc_info, status); } QColor WaitTreeThread::GetColor() const { @@ -273,7 +277,7 @@ QColor WaitTreeThread::GetColor() const { } std::vector<std::unique_ptr<WaitTreeItem>> WaitTreeThread::GetChildren() const { - std::vector<std::unique_ptr<WaitTreeItem>> list(WaitTreeWaitObject::GetChildren()); + std::vector<std::unique_ptr<WaitTreeItem>> list(WaitTreeSynchronizationObject::GetChildren()); const auto& thread = static_cast<const Kernel::Thread&>(object); @@ -314,7 +318,7 @@ std::vector<std::unique_ptr<WaitTreeItem>> WaitTreeThread::GetChildren() const { } if (thread.GetStatus() == Kernel::ThreadStatus::WaitSynch) { - list.push_back(std::make_unique<WaitTreeObjectList>(thread.GetWaitObjects(), + list.push_back(std::make_unique<WaitTreeObjectList>(thread.GetSynchronizationObjects(), thread.IsSleepingOnWait())); } @@ -323,7 +327,8 @@ std::vector<std::unique_ptr<WaitTreeItem>> WaitTreeThread::GetChildren() const { return list; } -WaitTreeEvent::WaitTreeEvent(const Kernel::ReadableEvent& object) : WaitTreeWaitObject(object) {} +WaitTreeEvent::WaitTreeEvent(const Kernel::ReadableEvent& object) + : WaitTreeSynchronizationObject(object) {} WaitTreeEvent::~WaitTreeEvent() = default; WaitTreeThreadList::WaitTreeThreadList(const std::vector<std::shared_ptr<Kernel::Thread>>& list) diff --git a/src/yuzu/debugger/wait_tree.h b/src/yuzu/debugger/wait_tree.h index 631274a5f..8e3bc4b24 100644 --- a/src/yuzu/debugger/wait_tree.h +++ b/src/yuzu/debugger/wait_tree.h @@ -19,7 +19,7 @@ class EmuThread; namespace Kernel { class HandleTable; class ReadableEvent; -class WaitObject; +class SynchronizationObject; class Thread; } // namespace Kernel @@ -99,35 +99,37 @@ private: const Kernel::Thread& thread; }; -class WaitTreeWaitObject : public WaitTreeExpandableItem { +class WaitTreeSynchronizationObject : public WaitTreeExpandableItem { Q_OBJECT public: - explicit WaitTreeWaitObject(const Kernel::WaitObject& object); - ~WaitTreeWaitObject() override; + explicit WaitTreeSynchronizationObject(const Kernel::SynchronizationObject& object); + ~WaitTreeSynchronizationObject() override; - static std::unique_ptr<WaitTreeWaitObject> make(const Kernel::WaitObject& object); + static std::unique_ptr<WaitTreeSynchronizationObject> make( + const Kernel::SynchronizationObject& object); QString GetText() const override; std::vector<std::unique_ptr<WaitTreeItem>> GetChildren() const override; protected: - const Kernel::WaitObject& object; + const Kernel::SynchronizationObject& object; }; class WaitTreeObjectList : public WaitTreeExpandableItem { Q_OBJECT public: - WaitTreeObjectList(const std::vector<std::shared_ptr<Kernel::WaitObject>>& list, bool wait_all); + WaitTreeObjectList(const std::vector<std::shared_ptr<Kernel::SynchronizationObject>>& list, + bool wait_all); ~WaitTreeObjectList() override; QString GetText() const override; std::vector<std::unique_ptr<WaitTreeItem>> GetChildren() const override; private: - const std::vector<std::shared_ptr<Kernel::WaitObject>>& object_list; + const std::vector<std::shared_ptr<Kernel::SynchronizationObject>>& object_list; bool wait_all; }; -class WaitTreeThread : public WaitTreeWaitObject { +class WaitTreeThread : public 
WaitTreeSynchronizationObject { Q_OBJECT public: explicit WaitTreeThread(const Kernel::Thread& thread); @@ -138,7 +140,7 @@ public: std::vector<std::unique_ptr<WaitTreeItem>> GetChildren() const override; }; -class WaitTreeEvent : public WaitTreeWaitObject { +class WaitTreeEvent : public WaitTreeSynchronizationObject { Q_OBJECT public: explicit WaitTreeEvent(const Kernel::ReadableEvent& object); diff --git a/src/yuzu/game_list_worker.cpp b/src/yuzu/game_list_worker.cpp index 4c81ef12b..da2c27aa2 100644 --- a/src/yuzu/game_list_worker.cpp +++ b/src/yuzu/game_list_worker.cpp @@ -298,8 +298,7 @@ void GameListWorker::ScanFileSystem(ScanTarget target, const std::string& dir_pa } const auto file_type = loader->GetFileType(); - if ((file_type == Loader::FileType::Unknown || file_type == Loader::FileType::Error) && - !UISettings::values.show_unknown) { + if (file_type == Loader::FileType::Unknown || file_type == Loader::FileType::Error) { return true; } diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp index b5dd3e0d6..54ca2dc1d 100644 --- a/src/yuzu/main.cpp +++ b/src/yuzu/main.cpp @@ -454,7 +454,6 @@ void GMainWindow::InitializeWidgets() { // Create status bar message_label = new QLabel(); // Configured separately for left alignment - message_label->setVisible(false); message_label->setFrameStyle(QFrame::NoFrame); message_label->setContentsMargins(4, 0, 4, 0); message_label->setAlignment(Qt::AlignLeft); @@ -476,8 +475,73 @@ void GMainWindow::InitializeWidgets() { label->setVisible(false); label->setFrameStyle(QFrame::NoFrame); label->setContentsMargins(4, 0, 4, 0); - statusBar()->addPermanentWidget(label, 0); + statusBar()->addPermanentWidget(label); } + + // Setup Dock button + dock_status_button = new QPushButton(); + dock_status_button->setObjectName(QStringLiteral("TogglableStatusBarButton")); + dock_status_button->setFocusPolicy(Qt::NoFocus); + connect(dock_status_button, &QPushButton::clicked, [&] { + Settings::values.use_docked_mode = !Settings::values.use_docked_mode; + dock_status_button->setChecked(Settings::values.use_docked_mode); + OnDockedModeChanged(!Settings::values.use_docked_mode, Settings::values.use_docked_mode); + }); + dock_status_button->setText(tr("DOCK")); + dock_status_button->setCheckable(true); + dock_status_button->setChecked(Settings::values.use_docked_mode); + statusBar()->insertPermanentWidget(0, dock_status_button); + + // Setup ASync button + async_status_button = new QPushButton(); + async_status_button->setObjectName(QStringLiteral("TogglableStatusBarButton")); + async_status_button->setFocusPolicy(Qt::NoFocus); + connect(async_status_button, &QPushButton::clicked, [&] { + if (emulation_running) { + return; + } + Settings::values.use_asynchronous_gpu_emulation = + !Settings::values.use_asynchronous_gpu_emulation; + async_status_button->setChecked(Settings::values.use_asynchronous_gpu_emulation); + Settings::Apply(); + }); + async_status_button->setText(tr("ASYNC")); + async_status_button->setCheckable(true); + async_status_button->setChecked(Settings::values.use_asynchronous_gpu_emulation); + statusBar()->insertPermanentWidget(0, async_status_button); + + // Setup Renderer API button + renderer_status_button = new QPushButton(); + renderer_status_button->setObjectName(QStringLiteral("RendererStatusBarButton")); + renderer_status_button->setCheckable(true); + renderer_status_button->setFocusPolicy(Qt::NoFocus); + connect(renderer_status_button, &QPushButton::toggled, [=](bool checked) { + renderer_status_button->setText(checked ? 
tr("VULKAN") : tr("OPENGL")); + }); + renderer_status_button->toggle(); + +#ifndef HAS_VULKAN + renderer_status_button->setChecked(false); + renderer_status_button->setCheckable(false); + renderer_status_button->setDisabled(true); +#else + renderer_status_button->setChecked(Settings::values.renderer_backend == + Settings::RendererBackend::Vulkan); + connect(renderer_status_button, &QPushButton::clicked, [=] { + if (emulation_running) { + return; + } + if (renderer_status_button->isChecked()) { + Settings::values.renderer_backend = Settings::RendererBackend::Vulkan; + } else { + Settings::values.renderer_backend = Settings::RendererBackend::OpenGL; + } + + Settings::Apply(); + }); +#endif // HAS_VULKAN + statusBar()->insertPermanentWidget(0, renderer_status_button); + statusBar()->setVisible(true); setStyleSheet(QStringLiteral("QStatusBar::item{border: none;}")); } @@ -640,6 +704,7 @@ void GMainWindow::InitializeHotkeys() { Settings::values.use_docked_mode = !Settings::values.use_docked_mode; OnDockedModeChanged(!Settings::values.use_docked_mode, Settings::values.use_docked_mode); + dock_status_button->setChecked(Settings::values.use_docked_mode); }); } @@ -806,70 +871,12 @@ void GMainWindow::AllowOSSleep() { #endif } -QStringList GMainWindow::GetUnsupportedGLExtensions() { - QStringList unsupported_ext; - - if (!GLAD_GL_ARB_buffer_storage) { - unsupported_ext.append(QStringLiteral("ARB_buffer_storage")); - } - if (!GLAD_GL_ARB_direct_state_access) { - unsupported_ext.append(QStringLiteral("ARB_direct_state_access")); - } - if (!GLAD_GL_ARB_vertex_type_10f_11f_11f_rev) { - unsupported_ext.append(QStringLiteral("ARB_vertex_type_10f_11f_11f_rev")); - } - if (!GLAD_GL_ARB_texture_mirror_clamp_to_edge) { - unsupported_ext.append(QStringLiteral("ARB_texture_mirror_clamp_to_edge")); - } - if (!GLAD_GL_ARB_multi_bind) { - unsupported_ext.append(QStringLiteral("ARB_multi_bind")); - } - if (!GLAD_GL_ARB_clip_control) { - unsupported_ext.append(QStringLiteral("ARB_clip_control")); - } - - // Extensions required to support some texture formats. - if (!GLAD_GL_EXT_texture_compression_s3tc) { - unsupported_ext.append(QStringLiteral("EXT_texture_compression_s3tc")); - } - if (!GLAD_GL_ARB_texture_compression_rgtc) { - unsupported_ext.append(QStringLiteral("ARB_texture_compression_rgtc")); - } - if (!GLAD_GL_ARB_depth_buffer_float) { - unsupported_ext.append(QStringLiteral("ARB_depth_buffer_float")); - } - - for (const QString& ext : unsupported_ext) { - LOG_CRITICAL(Frontend, "Unsupported GL extension: {}", ext.toStdString()); - } - - return unsupported_ext; -} - bool GMainWindow::LoadROM(const QString& filename) { // Shutdown previous session if the emu thread is still active... if (emu_thread != nullptr) ShutdownGame(); - render_window->InitRenderTarget(); - - { - Core::Frontend::ScopeAcquireWindowContext acquire_context{*render_window}; - if (!gladLoadGL()) { - QMessageBox::critical(this, tr("Error while initializing OpenGL 4.3 Core!"), - tr("Your GPU may not support OpenGL 4.3, or you do not " - "have the latest graphics driver.")); - return false; - } - } - - const QStringList unsupported_gl_extensions = GetUnsupportedGLExtensions(); - if (!unsupported_gl_extensions.empty()) { - QMessageBox::critical(this, tr("Error while initializing OpenGL Core!"), - tr("Your GPU may not support one or more required OpenGL" - "extensions. 
Please ensure you have the latest graphics " - "driver.<br><br>Unsupported extensions:<br>") + - unsupported_gl_extensions.join(QStringLiteral("<br>"))); + if (!render_window->InitRenderTarget()) { return false; } @@ -980,7 +987,9 @@ void GMainWindow::BootGame(const QString& filename) { // Create and start the emulation thread emu_thread = std::make_unique<EmuThread>(render_window); emit EmulationStarting(emu_thread.get()); - render_window->moveContext(); + if (Settings::values.renderer_backend == Settings::RendererBackend::OpenGL) { + render_window->moveContext(); + } emu_thread->start(); connect(render_window, &GRenderWindow::Closed, this, &GMainWindow::OnStopGame); @@ -1000,6 +1009,8 @@ void GMainWindow::BootGame(const QString& filename) { game_list_placeholder->hide(); } status_bar_update_timer.start(2000); + async_status_button->setDisabled(true); + renderer_status_button->setDisabled(true); const u64 title_id = Core::System::GetInstance().CurrentProcess()->GetTitleID(); @@ -1065,10 +1076,13 @@ void GMainWindow::ShutdownGame() { // Disable status bar updates status_bar_update_timer.stop(); - message_label->setVisible(false); emu_speed_label->setVisible(false); game_fps_label->setVisible(false); emu_frametime_label->setVisible(false); + async_status_button->setEnabled(true); +#ifdef HAS_VULKAN + renderer_status_button->setEnabled(true); +#endif emulation_running = false; @@ -1836,6 +1850,13 @@ void GMainWindow::OnConfigure() { } config->Save(); + + dock_status_button->setChecked(Settings::values.use_docked_mode); + async_status_button->setChecked(Settings::values.use_asynchronous_gpu_emulation); +#ifdef HAS_VULKAN + renderer_status_button->setChecked(Settings::values.renderer_backend == + Settings::RendererBackend::Vulkan); +#endif } void GMainWindow::OnLoadAmiibo() { @@ -2028,7 +2049,6 @@ void GMainWindow::OnCoreError(Core::System::ResultStatus result, std::string det if (emu_thread) { emu_thread->SetRunning(true); message_label->setText(status_message); - message_label->setVisible(true); } } } @@ -2195,6 +2215,18 @@ void GMainWindow::closeEvent(QCloseEvent* event) { QWidget::closeEvent(event); } +void GMainWindow::keyPressEvent(QKeyEvent* event) { + if (render_window) { + render_window->ForwardKeyPressEvent(event); + } +} + +void GMainWindow::keyReleaseEvent(QKeyEvent* event) { + if (render_window) { + render_window->ForwardKeyReleaseEvent(event); + } +} + static bool IsSingleFileDropEvent(QDropEvent* event) { const QMimeData* mimeData = event->mimeData(); return mimeData->hasUrls() && mimeData->urls().length() == 1; @@ -2227,18 +2259,6 @@ void GMainWindow::dragMoveEvent(QDragMoveEvent* event) { event->acceptProposedAction(); } -void GMainWindow::keyPressEvent(QKeyEvent* event) { - if (render_window) { - render_window->ForwardKeyPressEvent(event); - } -} - -void GMainWindow::keyReleaseEvent(QKeyEvent* event) { - if (render_window) { - render_window->ForwardKeyReleaseEvent(event); - } -} - bool GMainWindow::ConfirmChangeGame() { if (emu_thread == nullptr) return true; @@ -2290,8 +2310,16 @@ void GMainWindow::UpdateUITheme() { QStringList theme_paths(default_theme_paths); if (is_default_theme || current_theme.isEmpty()) { - qApp->setStyleSheet({}); - setStyleSheet({}); + const QString theme_uri(QStringLiteral(":default/style.qss")); + QFile f(theme_uri); + if (f.open(QFile::ReadOnly | QFile::Text)) { + QTextStream ts(&f); + qApp->setStyleSheet(ts.readAll()); + setStyleSheet(ts.readAll()); + } else { + qApp->setStyleSheet({}); + setStyleSheet({}); + } theme_paths.append(default_icons); 
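Note: the DOCK, ASYNC and renderer buttons added in GMainWindow::InitializeWidgets above all follow the same togglable status-bar button pattern. Reduced here to a single boolean setting (helper name and signature are illustrative only):

#include <QPushButton>
#include <QStatusBar>
#include <QString>

QPushButton* MakeToggleButton(QStatusBar* bar, const QString& text, bool* setting) {
    auto* button = new QPushButton(text);
    button->setObjectName(QStringLiteral("TogglableStatusBarButton")); // picked up by the style sheet
    button->setCheckable(true);
    button->setChecked(*setting);
    button->setFocusPolicy(Qt::NoFocus);
    QObject::connect(button, &QPushButton::clicked, [button, setting] {
        *setting = !*setting;         // flip the backing setting...
        button->setChecked(*setting); // ...and keep the button state in sync with it
    });
    bar->insertPermanentWidget(0, button);
    return button;
}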
QIcon::setThemeName(default_icons); } else { diff --git a/src/yuzu/main.h b/src/yuzu/main.h index a56f9a981..8eba2172c 100644 --- a/src/yuzu/main.h +++ b/src/yuzu/main.h @@ -27,6 +27,7 @@ class LoadingScreen; class MicroProfileDialog; class ProfilerWidget; class QLabel; +class QPushButton; class WaitTreeWidget; enum class GameListOpenTarget; class GameListPlaceholder; @@ -130,7 +131,6 @@ private: void PreventOSSleep(); void AllowOSSleep(); - QStringList GetUnsupportedGLExtensions(); bool LoadROM(const QString& filename); void BootGame(const QString& filename); void ShutdownGame(); @@ -229,6 +229,9 @@ private: QLabel* emu_speed_label = nullptr; QLabel* game_fps_label = nullptr; QLabel* emu_frametime_label = nullptr; + QPushButton* async_status_button = nullptr; + QPushButton* renderer_status_button = nullptr; + QPushButton* dock_status_button = nullptr; QTimer status_bar_update_timer; std::unique_ptr<Config> config; diff --git a/src/yuzu/uisettings.h b/src/yuzu/uisettings.h index bc7725a01..a675ecf4d 100644 --- a/src/yuzu/uisettings.h +++ b/src/yuzu/uisettings.h @@ -89,7 +89,6 @@ struct Values { int profile_index; // Game List - bool show_unknown; bool show_add_ons; uint32_t icon_size; uint8_t row_1_text_id; diff --git a/src/yuzu_cmd/CMakeLists.txt b/src/yuzu_cmd/CMakeLists.txt index b5f06ab9e..a15719a0f 100644 --- a/src/yuzu_cmd/CMakeLists.txt +++ b/src/yuzu_cmd/CMakeLists.txt @@ -8,11 +8,22 @@ add_executable(yuzu-cmd emu_window/emu_window_sdl2_gl.h emu_window/emu_window_sdl2.cpp emu_window/emu_window_sdl2.h + emu_window/emu_window_sdl2_gl.cpp + emu_window/emu_window_sdl2_gl.h resource.h yuzu.cpp yuzu.rc ) +if (ENABLE_VULKAN) + target_sources(yuzu-cmd PRIVATE + emu_window/emu_window_sdl2_vk.cpp + emu_window/emu_window_sdl2_vk.h) + + target_include_directories(yuzu-cmd PRIVATE ../../externals/Vulkan-Headers/include) + target_compile_definitions(yuzu-cmd PRIVATE HAS_VULKAN) +endif() + create_target_directory_groups(yuzu-cmd) target_link_libraries(yuzu-cmd PRIVATE common core input_common) diff --git a/src/yuzu_cmd/config.cpp b/src/yuzu_cmd/config.cpp index 1a812cb87..96f1ce3af 100644 --- a/src/yuzu_cmd/config.cpp +++ b/src/yuzu_cmd/config.cpp @@ -12,6 +12,7 @@ #include "core/hle/service/acc/profile_manager.h" #include "core/settings.h" #include "input_common/main.h" +#include "input_common/udp/client.h" #include "yuzu_cmd/config.h" #include "yuzu_cmd/default_ini.h" @@ -297,6 +298,10 @@ void Config::ReadValues() { sdl2_config->GetInteger("ControlsGeneral", "touch_diameter_x", 15); Settings::values.touchscreen.diameter_y = sdl2_config->GetInteger("ControlsGeneral", "touch_diameter_y", 15); + Settings::values.udp_input_address = + sdl2_config->Get("Controls", "udp_input_address", InputCommon::CemuhookUDP::DEFAULT_ADDR); + Settings::values.udp_input_port = static_cast<u16>(sdl2_config->GetInteger( + "Controls", "udp_input_port", InputCommon::CemuhookUDP::DEFAULT_PORT)); std::transform(keyboard_keys.begin(), keyboard_keys.end(), Settings::values.keyboard_keys.begin(), InputCommon::GenerateKeyboardParam); @@ -366,8 +371,16 @@ void Config::ReadValues() { Settings::values.use_multi_core = sdl2_config->GetBoolean("Core", "use_multi_core", false); // Renderer + const int renderer_backend = sdl2_config->GetInteger( + "Renderer", "backend", static_cast<int>(Settings::RendererBackend::OpenGL)); + Settings::values.renderer_backend = static_cast<Settings::RendererBackend>(renderer_backend); + Settings::values.renderer_debug = sdl2_config->GetBoolean("Renderer", "debug", false); + 
Settings::values.vulkan_device = sdl2_config->GetInteger("Renderer", "vulkan_device", 0); + Settings::values.resolution_factor = static_cast<float>(sdl2_config->GetReal("Renderer", "resolution_factor", 1.0)); + Settings::values.aspect_ratio = + static_cast<int>(sdl2_config->GetInteger("Renderer", "aspect_ratio", 0)); Settings::values.use_frame_limit = sdl2_config->GetBoolean("Renderer", "use_frame_limit", true); Settings::values.frame_limit = static_cast<u16>(sdl2_config->GetInteger("Renderer", "frame_limit", 100)); diff --git a/src/yuzu_cmd/default_ini.h b/src/yuzu_cmd/default_ini.h index 8d18a4a5a..8a2b658cd 100644 --- a/src/yuzu_cmd/default_ini.h +++ b/src/yuzu_cmd/default_ini.h @@ -69,18 +69,46 @@ rstick= # - "motion_emu" (default) for emulating motion input from mouse input. Required parameters: # - "update_period": update period in milliseconds (default to 100) # - "sensitivity": the coefficient converting mouse movement to tilting angle (default to 0.01) +# - "cemuhookudp" reads motion input from a udp server that uses cemuhook's udp protocol motion_device= # for touch input, the following devices are available: # - "emu_window" (default) for emulating touch input from mouse input to the emulation window. No parameters required +# - "cemuhookudp" reads touch input from a udp server that uses cemuhook's udp protocol +# - "min_x", "min_y", "max_x", "max_y": defines the udp device's touch screen coordinate system touch_device= +# Most desktop operating systems do not expose a way to poll the motion state of the controllers +# so as a way around it, cemuhook created a udp client/server protocol to broadcast the data directly +# from a controller device to the client program. Citra has a client that can connect and read +# from any cemuhook compatible motion program. + +# IPv4 address of the udp input server (Default "127.0.0.1") +udp_input_address= + +# Port of the udp input server. (Default 26760) +udp_input_port= + +# The pad to request data on. Should be between 0 (Pad 1) and 3 (Pad 4). (Default 0) +udp_pad_index= + [Core] # Whether to use multi-core for CPU emulation # 0 (default): Disabled, 1: Enabled use_multi_core= [Renderer] +# Which backend API to use. +# 0 (default): OpenGL, 1: Vulkan +backend = + +# Enable graphics API debugging mode. +# 0 (default): Disabled, 1: Enabled +debug = + +# Which Vulkan physical device to use (defaults to 0) +vulkan_device = + # Whether to use software or hardware rendering. # 0: Software, 1 (default): Hardware use_hw_renderer = @@ -94,6 +122,10 @@ use_shader_jit = # factor for the Switch resolution resolution_factor = +# Aspect ratio +# 0: Default (16:9), 1: Force 4:3, 2: Force 21:9, 3: Stretch to Window +aspect_ratio = + # Whether to enable V-Sync (caps the framerate at 60FPS) or not. 
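Note: the new aspect_ratio key documented above is stored as a plain index (0-3); how the renderer turns it into an actual ratio is not shown in this diff. A sketch of the mapping implied by the UI strings, with the function name and the stretch handling being assumptions:

// Illustrative only: map the aspect_ratio setting to a width/height ratio.
float AspectRatioFromSetting(int aspect_ratio, float window_width, float window_height) {
    switch (aspect_ratio) {
    case 1:
        return 4.0f / 3.0f;                  // Force 4:3
    case 2:
        return 21.0f / 9.0f;                 // Force 21:9
    case 3:
        return window_width / window_height; // Stretch to Window
    default:
        return 16.0f / 9.0f;                 // Default (16:9)
    }
}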
# 0 (default): Off, 1: On use_vsync = diff --git a/src/yuzu_cmd/emu_window/emu_window_sdl2.cpp b/src/yuzu_cmd/emu_window/emu_window_sdl2.cpp index b1c512db1..e96139885 100644 --- a/src/yuzu_cmd/emu_window/emu_window_sdl2.cpp +++ b/src/yuzu_cmd/emu_window/emu_window_sdl2.cpp @@ -89,6 +89,10 @@ bool EmuWindow_SDL2::IsOpen() const { return is_open; } +bool EmuWindow_SDL2::IsShown() const { + return is_shown; +} + void EmuWindow_SDL2::OnResize() { int width, height; SDL_GetWindowSize(render_window, &width, &height); diff --git a/src/yuzu_cmd/emu_window/emu_window_sdl2.h b/src/yuzu_cmd/emu_window/emu_window_sdl2.h index eaa971f77..b38f56661 100644 --- a/src/yuzu_cmd/emu_window/emu_window_sdl2.h +++ b/src/yuzu_cmd/emu_window/emu_window_sdl2.h @@ -21,6 +21,9 @@ public: /// Whether the window is still open, and a close request hasn't yet been sent bool IsOpen() const; + /// Returns if window is shown (not minimized) + bool IsShown() const override; + protected: /// Called by PollEvents when a key is pressed or released. void OnKeyEvent(int key, u8 state); diff --git a/src/yuzu_cmd/emu_window/emu_window_sdl2_gl.cpp b/src/yuzu_cmd/emu_window/emu_window_sdl2_gl.cpp index 6fde694a2..7ffa0ac09 100644 --- a/src/yuzu_cmd/emu_window/emu_window_sdl2_gl.cpp +++ b/src/yuzu_cmd/emu_window/emu_window_sdl2_gl.cpp @@ -9,6 +9,7 @@ #include <SDL.h> #include <fmt/format.h> #include <glad/glad.h> +#include "common/assert.h" #include "common/logging/log.h" #include "common/scm_rev.h" #include "common/string_util.h" @@ -151,6 +152,12 @@ void EmuWindow_SDL2_GL::DoneCurrent() { SDL_GL_MakeCurrent(render_window, nullptr); } +void EmuWindow_SDL2_GL::RetrieveVulkanHandlers(void* get_instance_proc_addr, void* instance, + void* surface) const { + // Should not have been called from OpenGL + UNREACHABLE(); +} + std::unique_ptr<Core::Frontend::GraphicsContext> EmuWindow_SDL2_GL::CreateSharedContext() const { return std::make_unique<SDLGLContext>(); } diff --git a/src/yuzu_cmd/emu_window/emu_window_sdl2_gl.h b/src/yuzu_cmd/emu_window/emu_window_sdl2_gl.h index 630deba93..c753085a8 100644 --- a/src/yuzu_cmd/emu_window/emu_window_sdl2_gl.h +++ b/src/yuzu_cmd/emu_window/emu_window_sdl2_gl.h @@ -22,6 +22,10 @@ public: /// Releases the GL context from the caller thread void DoneCurrent() override; + /// Ignored in OpenGL + void RetrieveVulkanHandlers(void* get_instance_proc_addr, void* instance, + void* surface) const override; + std::unique_ptr<Core::Frontend::GraphicsContext> CreateSharedContext() const override; private: diff --git a/src/yuzu_cmd/emu_window/emu_window_sdl2_vk.cpp b/src/yuzu_cmd/emu_window/emu_window_sdl2_vk.cpp new file mode 100644 index 000000000..a203f0da9 --- /dev/null +++ b/src/yuzu_cmd/emu_window/emu_window_sdl2_vk.cpp @@ -0,0 +1,162 @@ +// Copyright 2018 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
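Note: RetrieveVulkanHandlers, stubbed out for the OpenGL window above and implemented by the new Vulkan window below, hands the instance handles back through untyped pointers. A hypothetical caller on the renderer side would pass typed storage, assuming the method is declared on Core::Frontend::EmuWindow as the overrides suggest:

#include <vulkan/vulkan.h>
#include "core/frontend/emu_window.h"

void AcquireVulkanHandles(const Core::Frontend::EmuWindow& window) {
    PFN_vkGetInstanceProcAddr get_instance_proc_addr{};
    VkInstance instance{};
    VkSurfaceKHR surface{};
    // The window memcpy's each handle into the storage these pointers refer to.
    window.RetrieveVulkanHandlers(&get_instance_proc_addr, &instance, &surface);
    // get_instance_proc_addr can now load further instance-level entry points.
}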
+
+#include <algorithm>
+#include <string>
+#include <vector>
+#include <SDL.h>
+#include <SDL_vulkan.h>
+#include <fmt/format.h>
+#include <vulkan/vulkan.h>
+#include "common/assert.h"
+#include "common/logging/log.h"
+#include "common/scm_rev.h"
+#include "core/settings.h"
+#include "yuzu_cmd/emu_window/emu_window_sdl2_vk.h"
+
+EmuWindow_SDL2_VK::EmuWindow_SDL2_VK(bool fullscreen) : EmuWindow_SDL2(fullscreen) {
+    if (SDL_Vulkan_LoadLibrary(nullptr) != 0) {
+        LOG_CRITICAL(Frontend, "SDL failed to load the Vulkan library: {}", SDL_GetError());
+        exit(EXIT_FAILURE);
+    }
+
+    vkGetInstanceProcAddr =
+        reinterpret_cast<PFN_vkGetInstanceProcAddr>(SDL_Vulkan_GetVkGetInstanceProcAddr());
+    if (vkGetInstanceProcAddr == nullptr) {
+        LOG_CRITICAL(Frontend, "Failed to retrieve Vulkan function pointer!");
+        exit(EXIT_FAILURE);
+    }
+
+    const std::string window_title = fmt::format("yuzu {} | {}-{} (Vulkan)", Common::g_build_name,
+                                                 Common::g_scm_branch, Common::g_scm_desc);
+    render_window =
+        SDL_CreateWindow(window_title.c_str(),
+                         SDL_WINDOWPOS_UNDEFINED, // x position
+                         SDL_WINDOWPOS_UNDEFINED, // y position
+                         Layout::ScreenUndocked::Width, Layout::ScreenUndocked::Height,
+                         SDL_WINDOW_RESIZABLE | SDL_WINDOW_ALLOW_HIGHDPI | SDL_WINDOW_VULKAN);
+
+    const bool use_standard_layers = UseStandardLayers(vkGetInstanceProcAddr);
+
+    u32 extra_ext_count{};
+    if (!SDL_Vulkan_GetInstanceExtensions(render_window, &extra_ext_count, NULL)) {
+        LOG_CRITICAL(Frontend, "Failed to query Vulkan extensions count from SDL! {}",
+                     SDL_GetError());
+        exit(1);
+    }
+
+    auto extra_ext_names = std::make_unique<const char* []>(extra_ext_count);
+    if (!SDL_Vulkan_GetInstanceExtensions(render_window, &extra_ext_count, extra_ext_names.get())) {
+        LOG_CRITICAL(Frontend, "Failed to query Vulkan extensions from SDL! {}", SDL_GetError());
+        exit(1);
+    }
+    std::vector<const char*> enabled_extensions;
+    enabled_extensions.insert(enabled_extensions.begin(), extra_ext_names.get(),
+                              extra_ext_names.get() + extra_ext_count);
+
+    std::vector<const char*> enabled_layers;
+    if (use_standard_layers) {
+        enabled_extensions.push_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME);
+        enabled_layers.push_back("VK_LAYER_LUNARG_standard_validation");
+    }
+
+    VkApplicationInfo app_info{};
+    app_info.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
+    app_info.apiVersion = VK_API_VERSION_1_1;
+    app_info.applicationVersion = VK_MAKE_VERSION(0, 1, 0);
+    app_info.pApplicationName = "yuzu-emu";
+    app_info.engineVersion = VK_MAKE_VERSION(0, 1, 0);
+    app_info.pEngineName = "yuzu-emu";
+
+    VkInstanceCreateInfo instance_ci{};
+    instance_ci.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
+    instance_ci.pApplicationInfo = &app_info;
+    instance_ci.enabledExtensionCount = static_cast<u32>(enabled_extensions.size());
+    instance_ci.ppEnabledExtensionNames = enabled_extensions.data();
+    if (Settings::values.renderer_debug) {
+        instance_ci.enabledLayerCount = static_cast<u32>(enabled_layers.size());
+        instance_ci.ppEnabledLayerNames = enabled_layers.data();
+    }
+
+    const auto vkCreateInstance =
+        reinterpret_cast<PFN_vkCreateInstance>(vkGetInstanceProcAddr(nullptr, "vkCreateInstance"));
+    if (vkCreateInstance == nullptr ||
+        vkCreateInstance(&instance_ci, nullptr, &vk_instance) != VK_SUCCESS) {
+        LOG_CRITICAL(Frontend, "Failed to create Vulkan instance!");
+        exit(EXIT_FAILURE);
+    }
+
+    vkDestroyInstance = reinterpret_cast<PFN_vkDestroyInstance>(
+        vkGetInstanceProcAddr(vk_instance, "vkDestroyInstance"));
+    if (vkDestroyInstance == nullptr) {
+        LOG_CRITICAL(Frontend, "Failed to retrieve Vulkan function pointer!");
+        exit(EXIT_FAILURE);
+    }
+
+    if (!SDL_Vulkan_CreateSurface(render_window, vk_instance, &vk_surface)) {
+        LOG_CRITICAL(Frontend, "Failed to create Vulkan surface! {}", SDL_GetError());
+        exit(EXIT_FAILURE);
+    }
+
+    OnResize();
+    OnMinimalClientAreaChangeRequest(GetActiveConfig().min_client_area_size);
+    SDL_PumpEvents();
+    LOG_INFO(Frontend, "yuzu Version: {} | {}-{} (Vulkan)", Common::g_build_name,
+             Common::g_scm_branch, Common::g_scm_desc);
+}
+
+EmuWindow_SDL2_VK::~EmuWindow_SDL2_VK() {
+    vkDestroyInstance(vk_instance, nullptr);
+}
+
+void EmuWindow_SDL2_VK::SwapBuffers() {}
+
+void EmuWindow_SDL2_VK::MakeCurrent() {
+    // Unused on Vulkan
+}
+
+void EmuWindow_SDL2_VK::DoneCurrent() {
+    // Unused on Vulkan
+}
+
+void EmuWindow_SDL2_VK::RetrieveVulkanHandlers(void* get_instance_proc_addr, void* instance,
+                                               void* surface) const {
+    const auto instance_proc_addr = vkGetInstanceProcAddr;
+    std::memcpy(get_instance_proc_addr, &instance_proc_addr, sizeof(instance_proc_addr));
+    std::memcpy(instance, &vk_instance, sizeof(vk_instance));
+    std::memcpy(surface, &vk_surface, sizeof(vk_surface));
+}
+
+std::unique_ptr<Core::Frontend::GraphicsContext> EmuWindow_SDL2_VK::CreateSharedContext() const {
+    return nullptr;
+}
+
+bool EmuWindow_SDL2_VK::UseStandardLayers(PFN_vkGetInstanceProcAddr vkGetInstanceProcAddr) const {
+    if (!Settings::values.renderer_debug) {
+        return false;
+    }
+
+    const auto vkEnumerateInstanceLayerProperties =
+        reinterpret_cast<PFN_vkEnumerateInstanceLayerProperties>(
+            vkGetInstanceProcAddr(nullptr, "vkEnumerateInstanceLayerProperties"));
+    if (vkEnumerateInstanceLayerProperties == nullptr) {
+        LOG_CRITICAL(Frontend, "Failed to retrieve Vulkan function pointer!");
+        return false;
+    }
+
+    u32 available_layers_count{};
+    if (vkEnumerateInstanceLayerProperties(&available_layers_count, nullptr) != VK_SUCCESS) {
+        LOG_CRITICAL(Frontend, "Failed to enumerate Vulkan validation layers!");
+        return false;
+    }
+    std::vector<VkLayerProperties> layers(available_layers_count);
+    if (vkEnumerateInstanceLayerProperties(&available_layers_count, layers.data()) != VK_SUCCESS) {
+        LOG_CRITICAL(Frontend, "Failed to enumerate Vulkan validation layers!");
+        return false;
+    }
+
+    return std::find_if(layers.begin(), layers.end(), [&](const auto& layer) {
+               return layer.layerName == std::string("VK_LAYER_LUNARG_standard_validation");
+           }) != layers.end();
+}
diff --git a/src/yuzu_cmd/emu_window/emu_window_sdl2_vk.h b/src/yuzu_cmd/emu_window/emu_window_sdl2_vk.h new file mode 100644 index 000000000..2a7c06a24 --- /dev/null +++ b/src/yuzu_cmd/emu_window/emu_window_sdl2_vk.h @@ -0,0 +1,39 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <vulkan/vulkan.h>
+#include "core/frontend/emu_window.h"
+#include "yuzu_cmd/emu_window/emu_window_sdl2.h"
+
+class EmuWindow_SDL2_VK final : public EmuWindow_SDL2 {
+public:
+    explicit EmuWindow_SDL2_VK(bool fullscreen);
+    ~EmuWindow_SDL2_VK();
+
+    /// Swap buffers to display the next frame
+    void SwapBuffers() override;
+
+    /// Makes the graphics context current for the caller thread
+    void MakeCurrent() override;
+
+    /// Releases the GL context from the caller thread
+    void DoneCurrent() override;
+
+    /// Retrieves Vulkan specific handlers from the window
+    void RetrieveVulkanHandlers(void* get_instance_proc_addr, void* instance,
+                                void* surface) const override;
+
+    std::unique_ptr<Core::Frontend::GraphicsContext> CreateSharedContext() const override;
+
+private:
+    bool UseStandardLayers(PFN_vkGetInstanceProcAddr vkGetInstanceProcAddr) const;
+
+    VkInstance vk_instance{};
+    VkSurfaceKHR vk_surface{};
+
+    PFN_vkGetInstanceProcAddr vkGetInstanceProcAddr{};
+    PFN_vkDestroyInstance vkDestroyInstance{};
+};
diff --git a/src/yuzu_cmd/yuzu.cpp b/src/yuzu_cmd/yuzu.cpp index 3ee088a91..325795321 100644 --- a/src/yuzu_cmd/yuzu.cpp +++ b/src/yuzu_cmd/yuzu.cpp @@ -32,6 +32,9 @@
 #include "yuzu_cmd/config.h"
 #include "yuzu_cmd/emu_window/emu_window_sdl2.h"
 #include "yuzu_cmd/emu_window/emu_window_sdl2_gl.h"
+#ifdef HAS_VULKAN
+#include "yuzu_cmd/emu_window/emu_window_sdl2_vk.h"
+#endif
 
 #include "core/file_sys/registered_cache.h"
@@ -174,7 +177,20 @@ int main(int argc, char** argv) {
     Settings::values.use_gdbstub = use_gdbstub;
     Settings::Apply();
 
-    std::unique_ptr<EmuWindow_SDL2> emu_window{std::make_unique<EmuWindow_SDL2_GL>(fullscreen)};
+    std::unique_ptr<EmuWindow_SDL2> emu_window;
+    switch (Settings::values.renderer_backend) {
+    case Settings::RendererBackend::OpenGL:
+        emu_window = std::make_unique<EmuWindow_SDL2_GL>(fullscreen);
+        break;
+    case Settings::RendererBackend::Vulkan:
+#ifdef HAS_VULKAN
+        emu_window = std::make_unique<EmuWindow_SDL2_VK>(fullscreen);
+        break;
+#else
+        LOG_CRITICAL(Frontend, "Vulkan backend has not been compiled!");
+        return 1;
+#endif
+    }
 
     if (!Settings::values.use_multi_core) {
         // Single core mode must acquire OpenGL context for entire emulation session
diff --git a/src/yuzu_tester/config.cpp b/src/yuzu_tester/config.cpp index 84ab4d687..0ac93b62a 100644 --- a/src/yuzu_tester/config.cpp +++ b/src/yuzu_tester/config.cpp @@ -118,6 +118,8 @@ void Config::ReadValues() {
     // Renderer
     Settings::values.resolution_factor =
         static_cast<float>(sdl2_config->GetReal("Renderer", "resolution_factor", 1.0));
+    Settings::values.aspect_ratio =
+        static_cast<int>(sdl2_config->GetInteger("Renderer", "aspect_ratio", 0));
     Settings::values.use_frame_limit = false;
     Settings::values.frame_limit = 100;
     Settings::values.use_disk_shader_cache =
diff --git a/src/yuzu_tester/default_ini.h b/src/yuzu_tester/default_ini.h index 9a3e86d68..8d93f7b88 100644 --- a/src/yuzu_tester/default_ini.h +++ b/src/yuzu_tester/default_ini.h @@ -26,6 +26,10 @@ use_shader_jit =
 # factor for the Switch resolution
 resolution_factor =
 
+# Aspect ratio
+# 0: Default (16:9), 1: Force 4:3, 2: Force 21:9, 3: Stretch to Window
+aspect_ratio =
+
 # Whether to enable V-Sync (caps the framerate at 60FPS) or not.
 # 0 (default): Off, 1: On
 use_vsync =
diff --git a/src/yuzu_tester/emu_window/emu_window_sdl2_hide.cpp b/src/yuzu_tester/emu_window/emu_window_sdl2_hide.cpp index e7fe8decf..f2cc4a797 100644 --- a/src/yuzu_tester/emu_window/emu_window_sdl2_hide.cpp +++ b/src/yuzu_tester/emu_window/emu_window_sdl2_hide.cpp @@ -5,10 +5,15 @@
 #include <algorithm>
 #include <cstdlib>
 #include <string>
+
+#include <fmt/format.h>
+
 #define SDL_MAIN_HANDLED
 #include <SDL.h>
-#include <fmt/format.h>
+
 #include <glad/glad.h>
+
+#include "common/assert.h"
 #include "common/logging/log.h"
 #include "common/scm_rev.h"
 #include "core/settings.h"
@@ -120,3 +125,11 @@ void EmuWindow_SDL2_Hide::MakeCurrent() {
 void EmuWindow_SDL2_Hide::DoneCurrent() {
     SDL_GL_MakeCurrent(render_window, nullptr);
 }
+
+bool EmuWindow_SDL2_Hide::IsShown() const {
+    return false;
+}
+
+void EmuWindow_SDL2_Hide::RetrieveVulkanHandlers(void*, void*, void*) const {
+    UNREACHABLE();
+}
diff --git a/src/yuzu_tester/emu_window/emu_window_sdl2_hide.h b/src/yuzu_tester/emu_window/emu_window_sdl2_hide.h index 1a8953c75..c7fccc002 100644 --- a/src/yuzu_tester/emu_window/emu_window_sdl2_hide.h +++ b/src/yuzu_tester/emu_window/emu_window_sdl2_hide.h @@ -25,6 +25,13 @@ public:
     /// Releases the GL context from the caller thread
     void DoneCurrent() override;
 
+    /// Whether the screen is being shown or not.
+    bool IsShown() const override;
+
+    /// Retrieves Vulkan specific handlers from the window
+    void RetrieveVulkanHandlers(void* get_instance_proc_addr, void* instance,
+                                void* surface) const override;
+
     /// Whether the window is still open, and a close request hasn't yet been sent
     bool IsOpen() const;
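For context on the pattern the new EmuWindow_SDL2_VK frontend follows, below is a minimal standalone sketch of SDL2-driven Vulkan bootstrap: SDL loads the Vulkan library and hands out vkGetInstanceProcAddr (so the frontend never links the loader directly), reports which instance extensions the window surface needs, and creates the VkSurfaceKHR once the instance exists. This sketch is illustrative only and is not part of the commit; the window title, dimensions, and bare-bones error handling are placeholder choices.

// Illustrative sketch only (not part of the commit): SDL2 + Vulkan instance/surface bootstrap.
// All Vulkan entry points are fetched through SDL's vkGetInstanceProcAddr, as the commit does.
#define SDL_MAIN_HANDLED
#include <cstdio>
#include <vector>
#include <SDL.h>
#include <SDL_vulkan.h>
#include <vulkan/vulkan.h>

int main() {
    if (SDL_Init(SDL_INIT_VIDEO) != 0 || SDL_Vulkan_LoadLibrary(nullptr) != 0) {
        std::fprintf(stderr, "SDL/Vulkan init failed: %s\n", SDL_GetError());
        return 1;
    }

    // SDL owns the Vulkan loader; pull the entry point out instead of linking vulkan-1.
    const auto vkGetInstanceProcAddr =
        reinterpret_cast<PFN_vkGetInstanceProcAddr>(SDL_Vulkan_GetVkGetInstanceProcAddr());

    SDL_Window* window = SDL_CreateWindow("vk-sketch", SDL_WINDOWPOS_UNDEFINED,
                                          SDL_WINDOWPOS_UNDEFINED, 1280, 720, SDL_WINDOW_VULKAN);

    // Ask SDL which instance extensions its surface implementation requires.
    unsigned int ext_count = 0;
    SDL_Vulkan_GetInstanceExtensions(window, &ext_count, nullptr);
    std::vector<const char*> extensions(ext_count);
    SDL_Vulkan_GetInstanceExtensions(window, &ext_count, extensions.data());

    VkApplicationInfo app_info{};
    app_info.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
    app_info.apiVersion = VK_API_VERSION_1_1;

    VkInstanceCreateInfo instance_ci{};
    instance_ci.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
    instance_ci.pApplicationInfo = &app_info;
    instance_ci.enabledExtensionCount = static_cast<uint32_t>(extensions.size());
    instance_ci.ppEnabledExtensionNames = extensions.data();

    const auto vkCreateInstance =
        reinterpret_cast<PFN_vkCreateInstance>(vkGetInstanceProcAddr(nullptr, "vkCreateInstance"));
    VkInstance instance = VK_NULL_HANDLE;
    if (vkCreateInstance == nullptr ||
        vkCreateInstance(&instance_ci, nullptr, &instance) != VK_SUCCESS) {
        std::fprintf(stderr, "vkCreateInstance failed\n");
        return 1;
    }

    // The surface is what the renderer's swapchain will later present to.
    VkSurfaceKHR surface = VK_NULL_HANDLE;
    if (!SDL_Vulkan_CreateSurface(window, instance, &surface)) {
        std::fprintf(stderr, "SDL_Vulkan_CreateSurface failed: %s\n", SDL_GetError());
        return 1;
    }

    // Teardown; a real frontend keeps the instance and surface alive for the renderer.
    const auto vkDestroySurfaceKHR = reinterpret_cast<PFN_vkDestroySurfaceKHR>(
        vkGetInstanceProcAddr(instance, "vkDestroySurfaceKHR"));
    const auto vkDestroyInstance = reinterpret_cast<PFN_vkDestroyInstance>(
        vkGetInstanceProcAddr(instance, "vkDestroyInstance"));
    vkDestroySurfaceKHR(instance, surface, nullptr);
    vkDestroyInstance(instance, nullptr);
    SDL_DestroyWindow(window);
    SDL_Vulkan_UnloadLibrary();
    SDL_Quit();
    return 0;
}

Because every Vulkan call above goes through the function pointer returned by SDL, the sketch (like the new window class) needs no link-time dependency on the Vulkan loader, only on SDL2 built with Vulkan support.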