20 files changed, 273 insertions, 65 deletions
diff --git a/src/core/hle/kernel/thread.h b/src/core/hle/kernel/thread.h
index 20f50458b..cb57ee78a 100644
--- a/src/core/hle/kernel/thread.h
+++ b/src/core/hle/kernel/thread.h
@@ -15,6 +15,12 @@
 #include "core/hle/kernel/wait_object.h"
 #include "core/hle/result.h"
 
+namespace Kernel {
+
+class KernelCore;
+class Process;
+class Scheduler;
+
 enum ThreadPriority : u32 {
     THREADPRIO_HIGHEST = 0,       ///< Highest thread priority
     THREADPRIO_USERLAND_MAX = 24, ///< Highest thread priority for userland apps
@@ -54,12 +60,6 @@ enum class ThreadWakeupReason {
     Timeout // The thread was woken up due to a wait timeout.
 };
 
-namespace Kernel {
-
-class KernelCore;
-class Process;
-class Scheduler;
-
 class Thread final : public WaitObject {
 public:
     /**
diff --git a/src/core/hle/service/acc/acc_su.h b/src/core/hle/service/acc/acc_su.h
index a3eb885bf..fcced063a 100644
--- a/src/core/hle/service/acc/acc_su.h
+++ b/src/core/hle/service/acc/acc_su.h
@@ -6,8 +6,7 @@
 
 #include "core/hle/service/acc/acc.h"
 
-namespace Service {
-namespace Account {
+namespace Service::Account {
 
 class ACC_SU final : public Module::Interface {
 public:
@@ -16,5 +15,4 @@ public:
     ~ACC_SU() override;
 };
 
-} // namespace Account
-} // namespace Service
+} // namespace Service::Account
diff --git a/src/core/hle/service/ns/pl_u.cpp b/src/core/hle/service/ns/pl_u.cpp
index ac0eaaa8f..447689a1a 100644
--- a/src/core/hle/service/ns/pl_u.cpp
+++ b/src/core/hle/service/ns/pl_u.cpp
@@ -253,14 +253,16 @@ PL_U::PL_U() : ServiceFramework("pl:u"), impl{std::make_unique<Impl>()} {
             LOG_WARNING(Service_NS,
                         "Shared Font file missing. Loading open source replacement from memory");
 
+            // clang-format off
             const std::vector<std::vector<u8>> open_source_shared_fonts_ttf = {
                 {std::begin(FontChineseSimplified), std::end(FontChineseSimplified)},
                 {std::begin(FontChineseTraditional), std::end(FontChineseTraditional)},
-                {std::begin(FontExtendedChineseSimplified),
-                 std::end(FontExtendedChineseSimplified)},
+                {std::begin(FontExtendedChineseSimplified), std::end(FontExtendedChineseSimplified)},
+                {std::begin(FontKorean), std::end(FontKorean)},
                 {std::begin(FontNintendoExtended), std::end(FontNintendoExtended)},
                 {std::begin(FontStandard), std::end(FontStandard)},
             };
+            // clang-format on
 
             for (const std::vector<u8>& font_ttf : open_source_shared_fonts_ttf) {
                 const FontRegion region{static_cast<u32>(offset + 8),
diff --git a/src/core/hle/service/nvflinger/buffer_queue.cpp b/src/core/hle/service/nvflinger/buffer_queue.cpp
index 34f98fe5a..fd98d541d 100644
--- a/src/core/hle/service/nvflinger/buffer_queue.cpp
+++ b/src/core/hle/service/nvflinger/buffer_queue.cpp
@@ -9,8 +9,7 @@
 #include "core/core.h"
 #include "core/hle/service/nvflinger/buffer_queue.h"
 
-namespace Service {
-namespace NVFlinger {
+namespace Service::NVFlinger {
 
 BufferQueue::BufferQueue(u32 id, u64 layer_id) : id(id), layer_id(layer_id) {
     auto& kernel = Core::System::GetInstance().Kernel();
@@ -104,5 +103,4 @@ u32 BufferQueue::Query(QueryType type) {
     return 0;
 }
 
-} // namespace NVFlinger
-} // namespace Service
+} // namespace Service::NVFlinger
diff --git a/src/core/hle/service/nvflinger/buffer_queue.h b/src/core/hle/service/nvflinger/buffer_queue.h
index 17c81928a..50b767732 100644
--- a/src/core/hle/service/nvflinger/buffer_queue.h
+++ b/src/core/hle/service/nvflinger/buffer_queue.h
@@ -15,8 +15,7 @@ namespace CoreTiming {
 struct EventType;
 }
 
-namespace Service {
-namespace NVFlinger {
+namespace Service::NVFlinger {
 
 struct IGBPBuffer {
     u32_le magic;
@@ -98,5 +97,4 @@ private:
     Kernel::SharedPtr<Kernel::Event> buffer_wait_event;
 };
 
-} // namespace NVFlinger
-} // namespace Service
+} // namespace Service::NVFlinger
diff --git a/src/core/hle/service/vi/vi.cpp b/src/core/hle/service/vi/vi.cpp
index 85244ac3b..cf94b00e6 100644
--- a/src/core/hle/service/vi/vi.cpp
+++ b/src/core/hle/service/vi/vi.cpp
@@ -514,7 +514,7 @@ private:
                 ctx.SleepClientThread(
                     Kernel::GetCurrentThread(), "IHOSBinderDriver::DequeueBuffer", -1,
                     [=](Kernel::SharedPtr<Kernel::Thread> thread, Kernel::HLERequestContext& ctx,
-                        ThreadWakeupReason reason) {
+                        Kernel::ThreadWakeupReason reason) {
                         // Repeat TransactParcel DequeueBuffer when a buffer is available
                         auto buffer_queue = nv_flinger->GetBufferQueue(id);
                         boost::optional<u32> slot = buffer_queue->DequeueBuffer(width, height);
diff --git a/src/core/loader/nro.cpp b/src/core/loader/nro.cpp
index bb89a9da3..c49ec34ab 100644
--- a/src/core/loader/nro.cpp
+++ b/src/core/loader/nro.cpp
@@ -191,7 +191,7 @@ ResultStatus AppLoader_NRO::Load(Kernel::SharedPtr<Kernel::Process>& process) {
     process->svc_access_mask.set();
     process->resource_limit =
         kernel.ResourceLimitForCategory(Kernel::ResourceLimitCategory::APPLICATION);
-    process->Run(base_addr, THREADPRIO_DEFAULT, Memory::DEFAULT_STACK_SIZE);
+    process->Run(base_addr, Kernel::THREADPRIO_DEFAULT, Memory::DEFAULT_STACK_SIZE);
 
     is_loaded = true;
     return ResultStatus::Success;
diff --git a/src/core/loader/nso.cpp b/src/core/loader/nso.cpp
index 082a95d40..3c6306818 100644
--- a/src/core/loader/nso.cpp
+++ b/src/core/loader/nso.cpp
@@ -157,7 +157,8 @@ ResultStatus AppLoader_NSO::Load(Kernel::SharedPtr<Kernel::Process>& process) {
     process->svc_access_mask.set();
     process->resource_limit =
         kernel.ResourceLimitForCategory(Kernel::ResourceLimitCategory::APPLICATION);
-    process->Run(Memory::PROCESS_IMAGE_VADDR, THREADPRIO_DEFAULT, Memory::DEFAULT_STACK_SIZE);
+    process->Run(Memory::PROCESS_IMAGE_VADDR, Kernel::THREADPRIO_DEFAULT,
+                 Memory::DEFAULT_STACK_SIZE);
 
     is_loaded = true;
     return ResultStatus::Success;
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 65b5f57c3..4a79ce39c 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -5,6 +5,8 @@ add_library(video_core STATIC
     debug_utils/debug_utils.h
     engines/fermi_2d.cpp
     engines/fermi_2d.h
+    engines/kepler_memory.cpp
+    engines/kepler_memory.h
     engines/maxwell_3d.cpp
     engines/maxwell_3d.h
     engines/maxwell_compute.cpp
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 2625ddfdc..f1aa6091b 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -14,6 +14,7 @@
 #include "core/tracer/recorder.h"
 #include "video_core/command_processor.h"
 #include "video_core/engines/fermi_2d.h"
+#include "video_core/engines/kepler_memory.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/maxwell_compute.h"
 #include "video_core/engines/maxwell_dma.h"
@@ -69,6 +70,9 @@ void GPU::ProcessCommandLists(const std::vector<CommandListHeader>& commands) {
         case EngineID::MAXWELL_DMA_COPY_A:
             maxwell_dma->WriteReg(method, value);
             break;
+        case EngineID::KEPLER_INLINE_TO_MEMORY_B:
+            kepler_memory->WriteReg(method, value);
+            break;
         default:
             UNIMPLEMENTED_MSG("Unimplemented engine");
         }
diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp
new file mode 100644
index 000000000..66ae6332d
--- /dev/null
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -0,0 +1,45 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/logging/log.h"
+#include "core/memory.h"
+#include "video_core/engines/kepler_memory.h"
+
+namespace Tegra::Engines {
+
+KeplerMemory::KeplerMemory(MemoryManager& memory_manager) : memory_manager(memory_manager) {}
+KeplerMemory::~KeplerMemory() = default;
+
+void KeplerMemory::WriteReg(u32 method, u32 value) {
+    ASSERT_MSG(method < Regs::NUM_REGS,
+               "Invalid KeplerMemory register, increase the size of the Regs structure");
+
+    regs.reg_array[method] = value;
+
+    switch (method) {
+    case KEPLERMEMORY_REG_INDEX(exec): {
+        state.write_offset = 0;
+        break;
+    }
+    case KEPLERMEMORY_REG_INDEX(data): {
+        ProcessData(value);
+        break;
+    }
+    }
+}
+
+void KeplerMemory::ProcessData(u32 data) {
+    ASSERT_MSG(regs.exec.linear, "Non-linear uploads are not supported");
+    ASSERT(regs.dest.x == 0 && regs.dest.y == 0 && regs.dest.z == 0);
+
+    GPUVAddr address = regs.dest.Address();
+    VAddr dest_address =
+        *memory_manager.GpuToCpuAddress(address + state.write_offset * sizeof(u32));
+
+    Memory::Write32(dest_address, data);
+
+    state.write_offset++;
+}
+
+} // namespace Tegra::Engines
diff --git a/src/video_core/engines/kepler_memory.h b/src/video_core/engines/kepler_memory.h
new file mode 100644
index 000000000..b0d0078cf
--- /dev/null
+++ b/src/video_core/engines/kepler_memory.h
@@ -0,0 +1,90 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include "common/assert.h"
+#include "common/bit_field.h"
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+#include "video_core/memory_manager.h"
+
+namespace Tegra::Engines {
+
+#define KEPLERMEMORY_REG_INDEX(field_name)                                                         \
+    (offsetof(Tegra::Engines::KeplerMemory::Regs, field_name) / sizeof(u32))
+
+class KeplerMemory final {
+public:
+    KeplerMemory(MemoryManager& memory_manager);
+    ~KeplerMemory();
+
+    /// Write the value to the register identified by method.
+    void WriteReg(u32 method, u32 value);
+
+    struct Regs {
+        static constexpr size_t NUM_REGS = 0x7F;
+
+        union {
+            struct {
+                INSERT_PADDING_WORDS(0x60);
+
+                u32 line_length_in;
+                u32 line_count;
+
+                struct {
+                    u32 address_high;
+                    u32 address_low;
+                    u32 pitch;
+                    u32 block_dimensions;
+                    u32 width;
+                    u32 height;
+                    u32 depth;
+                    u32 z;
+                    u32 x;
+                    u32 y;
+
+                    GPUVAddr Address() const {
+                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+                                                     address_low);
+                    }
+                } dest;
+
+                struct {
+                    union {
+                        BitField<0, 1, u32> linear;
+                    };
+                } exec;
+
+                u32 data;
+
+                INSERT_PADDING_WORDS(0x11);
+            };
+            std::array<u32, NUM_REGS> reg_array;
+        };
+    } regs{};
+
+    struct {
+        u32 write_offset = 0;
+    } state{};
+
+private:
+    MemoryManager& memory_manager;
+
+    void ProcessData(u32 data);
+};
+
+#define ASSERT_REG_POSITION(field_name, position)                                                  \
+    static_assert(offsetof(KeplerMemory::Regs, field_name) == position * 4,                        \
+                  "Field " #field_name " has invalid position")
+
+ASSERT_REG_POSITION(line_length_in, 0x60);
+ASSERT_REG_POSITION(line_count, 0x61);
+ASSERT_REG_POSITION(dest, 0x62);
+ASSERT_REG_POSITION(exec, 0x6C);
+ASSERT_REG_POSITION(data, 0x6D);
+#undef ASSERT_REG_POSITION
+
+} // namespace Tegra::Engines
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index 58f2904ce..d6e2397f2 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -67,6 +67,13 @@ private:
     u64 value{};
 };
 
+enum class AttributeSize : u64 {
+    Word = 0,
+    DoubleWord = 1,
+    TripleWord = 2,
+    QuadWord = 3,
+};
+
 union Attribute {
     Attribute() = default;
 
@@ -87,9 +94,10 @@ union Attribute {
     };
 
     union {
+        BitField<20, 10, u64> immediate;
         BitField<22, 2, u64> element;
         BitField<24, 6, Index> index;
-        BitField<47, 3, u64> size;
+        BitField<47, 3, AttributeSize> size;
     } fmt20;
 
     union {
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 86a809f86..baa8b63b7 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -4,6 +4,7 @@
 
 #include "common/assert.h"
 #include "video_core/engines/fermi_2d.h"
+#include "video_core/engines/kepler_memory.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/maxwell_compute.h"
 #include "video_core/engines/maxwell_dma.h"
@@ -27,6 +28,7 @@ GPU::GPU(VideoCore::RasterizerInterface& rasterizer) {
     fermi_2d = std::make_unique<Engines::Fermi2D>(*memory_manager);
     maxwell_compute = std::make_unique<Engines::MaxwellCompute>();
     maxwell_dma = std::make_unique<Engines::MaxwellDMA>(*memory_manager);
+    kepler_memory = std::make_unique<Engines::KeplerMemory>(*memory_manager);
 }
 
 GPU::~GPU() = default;
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index 589a59b4f..7329ca766 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -102,6 +102,7 @@ class Fermi2D;
 class Maxwell3D;
 class MaxwellCompute;
 class MaxwellDMA;
+class KeplerMemory;
 } // namespace Engines
 
 enum class EngineID {
@@ -146,6 +147,8 @@ private:
     std::unique_ptr<Engines::MaxwellCompute> maxwell_compute;
     /// DMA engine
     std::unique_ptr<Engines::MaxwellDMA> maxwell_dma;
+    /// Inline memory engine
+    std::unique_ptr<Engines::KeplerMemory> kepler_memory;
 };
 
 } // namespace Tegra
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index 32001e44b..63bbcb666 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -167,6 +167,7 @@ static constexpr std::array<FormatTuple, SurfaceParams::MaxPixelFormat> tex_form
     {GL_RG8, GL_RG, GL_BYTE, ComponentType::SNorm, false},                                // RG8S
     {GL_RG32UI, GL_RG_INTEGER, GL_UNSIGNED_INT, ComponentType::UInt, false},              // RG32UI
     {GL_R32UI, GL_RED_INTEGER, GL_UNSIGNED_INT, ComponentType::UInt, false},              // R32UI
+    {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_8X8
 
     // Depth formats
     {GL_DEPTH_COMPONENT32F, GL_DEPTH_COMPONENT, GL_FLOAT, ComponentType::Float, false}, // Z32F
@@ -213,6 +214,7 @@ static const FormatTuple& GetFormatTuple(PixelFormat pixel_format, ComponentType
 static bool IsPixelFormatASTC(PixelFormat format) {
     switch (format) {
     case PixelFormat::ASTC_2D_4X4:
+    case PixelFormat::ASTC_2D_8X8:
         return true;
     default:
         return false;
@@ -223,6 +225,8 @@ static std::pair<u32, u32> GetASTCBlockSize(PixelFormat format) {
     switch (format) {
     case PixelFormat::ASTC_2D_4X4:
         return {4, 4};
+    case PixelFormat::ASTC_2D_8X8:
+        return {8, 8};
     default:
         LOG_CRITICAL(HW_GPU, "Unhandled format: {}", static_cast<u32>(format));
         UNREACHABLE();
@@ -327,6 +331,7 @@ static constexpr std::array<void (*)(u32, u32, u32, u8*, size_t, VAddr),
         MortonCopy<true, PixelFormat::RG8S>,
         MortonCopy<true, PixelFormat::RG32UI>,
         MortonCopy<true, PixelFormat::R32UI>,
+        MortonCopy<true, PixelFormat::ASTC_2D_8X8>,
         MortonCopy<true, PixelFormat::Z32F>,
         MortonCopy<true, PixelFormat::Z16>,
         MortonCopy<true, PixelFormat::Z24S8>,
@@ -386,6 +391,7 @@ static constexpr std::array<void (*)(u32, u32, u32, u8*, size_t, VAddr),
         MortonCopy<false, PixelFormat::RG8S>,
         MortonCopy<false, PixelFormat::RG32UI>,
         MortonCopy<false, PixelFormat::R32UI>,
+        nullptr,
         MortonCopy<false, PixelFormat::Z32F>,
         MortonCopy<false, PixelFormat::Z16>,
         MortonCopy<false, PixelFormat::Z24S8>,
@@ -544,7 +550,8 @@ static void ConvertG8R8ToR8G8(std::vector<u8>& data, u32 width, u32 height) {
 static void ConvertFormatAsNeeded_LoadGLBuffer(std::vector<u8>& data, PixelFormat pixel_format,
                                                u32 width, u32 height) {
     switch (pixel_format) {
-    case PixelFormat::ASTC_2D_4X4: {
+    case PixelFormat::ASTC_2D_4X4:
+    case PixelFormat::ASTC_2D_8X8: {
         // Convert ASTC pixel formats to RGBA8, as most desktop GPUs do not support ASTC.
         u32 block_width{};
         u32 block_height{};
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
index 57ea8593b..0525e9a94 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
@@ -70,19 +70,20 @@ struct SurfaceParams {
         RG8S = 42,
         RG32UI = 43,
         R32UI = 44,
+        ASTC_2D_8X8 = 45,
 
         MaxColorFormat,
 
         // Depth formats
-        Z32F = 45,
-        Z16 = 46,
+        Z32F = 46,
+        Z16 = 47,
 
         MaxDepthFormat,
 
         // DepthStencil formats
-        Z24S8 = 47,
-        S8Z24 = 48,
-        Z32FS8 = 49,
+        Z24S8 = 48,
+        S8Z24 = 49,
+        Z32FS8 = 50,
 
         MaxDepthStencilFormat,
 
@@ -192,6 +193,7 @@ struct SurfaceParams {
             1, // RG8S
             1, // RG32UI
             1, // R32UI
+            4, // ASTC_2D_8X8
             1, // Z32F
             1, // Z16
             1, // Z24S8
@@ -253,6 +255,7 @@ struct SurfaceParams {
             16,  // RG8S
             64,  // RG32UI
             32,  // R32UI
+            16,  // ASTC_2D_8X8
             32,  // Z32F
             16,  // Z16
             32,  // Z24S8
@@ -522,6 +525,8 @@ struct SurfaceParams {
             return PixelFormat::BC6H_SF16;
         case Tegra::Texture::TextureFormat::ASTC_2D_4X4:
             return PixelFormat::ASTC_2D_4X4;
+        case Tegra::Texture::TextureFormat::ASTC_2D_8X8:
+            return PixelFormat::ASTC_2D_8X8;
         case Tegra::Texture::TextureFormat::R16_G16:
             switch (component_type) {
             case Tegra::Texture::ComponentType::FLOAT:
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 2d56370c7..81c0662d0 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -1772,13 +1772,34 @@ private:
         case OpCode::Type::Memory: {
             switch (opcode->GetId()) {
             case OpCode::Id::LD_A: {
-                ASSERT_MSG(instr.attribute.fmt20.size == 0, "untested");
                 // Note: Shouldn't this be interp mode flat? As in no interpolation made.
+                ASSERT_MSG(instr.gpr8.Value() == Register::ZeroIndex,
+                           "Indirect attribute loads are not supported");
+                ASSERT_MSG((instr.attribute.fmt20.immediate.Value() % sizeof(u32)) == 0,
+                           "Unaligned attribute loads are not supported");
 
                 Tegra::Shader::IpaMode input_mode{Tegra::Shader::IpaInterpMode::Perspective,
                                                   Tegra::Shader::IpaSampleMode::Default};
-                regs.SetRegisterToInputAttibute(instr.gpr0, instr.attribute.fmt20.element,
-                                                instr.attribute.fmt20.index, input_mode);
+
+                u32 next_element = instr.attribute.fmt20.element;
+                u32 next_index = static_cast<u32>(instr.attribute.fmt20.index.Value());
+
+                const auto LoadNextElement = [&](u32 reg_offset) {
+                    regs.SetRegisterToInputAttibute(instr.gpr0.Value() + reg_offset, next_element,
+                                                    static_cast<Attribute::Index>(next_index),
+                                                    input_mode);
+
+                    // Load the next attribute element into the following register. If the element
+                    // to load goes beyond the vec4 size, load the first element of the next
+                    // attribute.
+                    next_element = (next_element + 1) % 4;
+                    next_index = next_index + (next_element == 0 ? 1 : 0);
+                };
+
+                const u32 num_words = static_cast<u32>(instr.attribute.fmt20.size.Value()) + 1;
+                for (u32 reg_offset = 0; reg_offset < num_words; ++reg_offset) {
+                    LoadNextElement(reg_offset);
+                }
                 break;
             }
             case OpCode::Id::LD_C: {
@@ -1820,9 +1841,31 @@ private:
                 break;
             }
             case OpCode::Id::ST_A: {
-                ASSERT_MSG(instr.attribute.fmt20.size == 0, "untested");
-                regs.SetOutputAttributeToRegister(instr.attribute.fmt20.index,
-                                                  instr.attribute.fmt20.element, instr.gpr0);
+                ASSERT_MSG(instr.gpr8.Value() == Register::ZeroIndex,
+                           "Indirect attribute loads are not supported");
+                ASSERT_MSG((instr.attribute.fmt20.immediate.Value() % sizeof(u32)) == 0,
+                           "Unaligned attribute loads are not supported");
+
+                u32 next_element = instr.attribute.fmt20.element;
+                u32 next_index = static_cast<u32>(instr.attribute.fmt20.index.Value());
+
+                const auto StoreNextElement = [&](u32 reg_offset) {
+                    regs.SetOutputAttributeToRegister(static_cast<Attribute::Index>(next_index),
+                                                      next_element,
+                                                      instr.gpr0.Value() + reg_offset);
+
+                    // Load the next attribute element into the following register. If the element
+                    // to load goes beyond the vec4 size, load the first element of the next
+                    // attribute.
+                    next_element = (next_element + 1) % 4;
+                    next_index = next_index + (next_element == 0 ? 1 : 0);
+                };
+
+                const u32 num_words = static_cast<u32>(instr.attribute.fmt20.size.Value()) + 1;
+                for (u32 reg_offset = 0; reg_offset < num_words; ++reg_offset) {
+                    StoreNextElement(reg_offset);
+                }
+
                 break;
             }
             case OpCode::Id::TEX: {
diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp
index 272294c62..58b65de1a 100644
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -63,6 +63,7 @@ u32 BytesPerPixel(TextureFormat format) {
     case TextureFormat::R32_G32_B32:
         return 12;
     case TextureFormat::ASTC_2D_4X4:
+    case TextureFormat::ASTC_2D_8X8:
     case TextureFormat::A8R8G8B8:
     case TextureFormat::A2B10G10R10:
     case TextureFormat::BF10GF11RF11:
@@ -111,6 +112,7 @@ std::vector<u8> DecodeTexture(const std::vector<u8>& texture_data, TextureFormat
     case TextureFormat::BC6H_UF16:
     case TextureFormat::BC6H_SF16:
     case TextureFormat::ASTC_2D_4X4:
+    case TextureFormat::ASTC_2D_8X8:
     case TextureFormat::A8R8G8B8:
     case TextureFormat::A2B10G10R10:
     case TextureFormat::A1B5G5R5:
diff --git a/src/yuzu/debugger/wait_tree.cpp b/src/yuzu/debugger/wait_tree.cpp
index 6c2cd967e..dc1023113 100644
--- a/src/yuzu/debugger/wait_tree.cpp
+++ b/src/yuzu/debugger/wait_tree.cpp
@@ -213,35 +213,35 @@ QString WaitTreeThread::GetText() const {
     const auto& thread = static_cast<const Kernel::Thread&>(object);
     QString status;
     switch (thread.status) {
-    case ThreadStatus::Running:
+    case Kernel::ThreadStatus::Running:
         status = tr("running");
         break;
-    case ThreadStatus::Ready:
+    case Kernel::ThreadStatus::Ready:
         status = tr("ready");
         break;
-    case ThreadStatus::WaitHLEEvent:
+    case Kernel::ThreadStatus::WaitHLEEvent:
         status = tr("waiting for HLE return");
         break;
-    case ThreadStatus::WaitSleep:
+    case Kernel::ThreadStatus::WaitSleep:
         status = tr("sleeping");
         break;
-    case ThreadStatus::WaitIPC:
+    case Kernel::ThreadStatus::WaitIPC:
         status = tr("waiting for IPC reply");
         break;
-    case ThreadStatus::WaitSynchAll:
-    case ThreadStatus::WaitSynchAny:
+    case Kernel::ThreadStatus::WaitSynchAll:
+    case Kernel::ThreadStatus::WaitSynchAny:
         status = tr("waiting for objects");
         break;
-    case ThreadStatus::WaitMutex:
+    case Kernel::ThreadStatus::WaitMutex:
         status = tr("waiting for mutex");
         break;
-    case ThreadStatus::WaitArb:
+    case Kernel::ThreadStatus::WaitArb:
         status = tr("waiting for address arbiter");
         break;
-    case ThreadStatus::Dormant:
+    case Kernel::ThreadStatus::Dormant:
         status = tr("dormant");
         break;
-    case ThreadStatus::Dead:
+    case Kernel::ThreadStatus::Dead:
         status = tr("dead");
         break;
     }
@@ -254,23 +254,23 @@ QString WaitTreeThread::GetText() const {
 QColor WaitTreeThread::GetColor() const {
     const auto& thread = static_cast<const Kernel::Thread&>(object);
     switch (thread.status) {
-    case ThreadStatus::Running:
+    case Kernel::ThreadStatus::Running:
         return QColor(Qt::GlobalColor::darkGreen);
-    case ThreadStatus::Ready:
+    case Kernel::ThreadStatus::Ready:
         return QColor(Qt::GlobalColor::darkBlue);
-    case ThreadStatus::WaitHLEEvent:
-    case ThreadStatus::WaitIPC:
+    case Kernel::ThreadStatus::WaitHLEEvent:
+    case Kernel::ThreadStatus::WaitIPC:
         return QColor(Qt::GlobalColor::darkRed);
-    case ThreadStatus::WaitSleep:
+    case Kernel::ThreadStatus::WaitSleep:
         return QColor(Qt::GlobalColor::darkYellow);
-    case ThreadStatus::WaitSynchAll:
-    case ThreadStatus::WaitSynchAny:
-    case ThreadStatus::WaitMutex:
-    case ThreadStatus::WaitArb:
+    case Kernel::ThreadStatus::WaitSynchAll:
+    case Kernel::ThreadStatus::WaitSynchAny:
+    case Kernel::ThreadStatus::WaitMutex:
+    case Kernel::ThreadStatus::WaitArb:
         return QColor(Qt::GlobalColor::red);
-    case ThreadStatus::Dormant:
+    case Kernel::ThreadStatus::Dormant:
         return QColor(Qt::GlobalColor::darkCyan);
-    case ThreadStatus::Dead:
+    case Kernel::ThreadStatus::Dead:
         return QColor(Qt::GlobalColor::gray);
     default:
         return WaitTreeItem::GetColor();
@@ -284,13 +284,13 @@ std::vector<std::unique_ptr<WaitTreeItem>> WaitTreeThread::GetChildren() const {
 
     QString processor;
     switch (thread.processor_id) {
-    case ThreadProcessorId::THREADPROCESSORID_DEFAULT:
+    case Kernel::ThreadProcessorId::THREADPROCESSORID_DEFAULT:
         processor = tr("default");
         break;
-    case ThreadProcessorId::THREADPROCESSORID_0:
-    case ThreadProcessorId::THREADPROCESSORID_1:
-    case ThreadProcessorId::THREADPROCESSORID_2:
-    case ThreadProcessorId::THREADPROCESSORID_3:
+    case Kernel::ThreadProcessorId::THREADPROCESSORID_0:
+    case Kernel::ThreadProcessorId::THREADPROCESSORID_1:
+    case Kernel::ThreadProcessorId::THREADPROCESSORID_2:
+    case Kernel::ThreadProcessorId::THREADPROCESSORID_3:
         processor = tr("core %1").arg(thread.processor_id);
         break;
     default:
@@ -314,8 +314,8 @@ std::vector<std::unique_ptr<WaitTreeItem>> WaitTreeThread::GetChildren() const {
     else
         list.push_back(std::make_unique<WaitTreeText>(tr("not waiting for mutex")));
 
-    if (thread.status == ThreadStatus::WaitSynchAny ||
-        thread.status == ThreadStatus::WaitSynchAll) {
+    if (thread.status == Kernel::ThreadStatus::WaitSynchAny ||
+        thread.status == Kernel::ThreadStatus::WaitSynchAll) {
         list.push_back(std::make_unique<WaitTreeObjectList>(thread.wait_objects,
                                                             thread.IsSleepingOnWaitAll()));
     }