15 files changed, 559 insertions, 151 deletions
diff --git a/src/core/hle/service/nvflinger/buffer_queue.cpp b/src/core/hle/service/nvflinger/buffer_queue.cpp
index 32b6f4b27..f1e3d832a 100644
--- a/src/core/hle/service/nvflinger/buffer_queue.cpp
+++ b/src/core/hle/service/nvflinger/buffer_queue.cpp
@@ -28,6 +28,7 @@ void BufferQueue::SetPreallocatedBuffer(u32 slot, const IGBPBuffer& igbp_buffer)
     buffer.slot = slot;
     buffer.igbp_buffer = igbp_buffer;
     buffer.status = Buffer::Status::Free;
+    free_buffers.push_back(slot);
 
     queue.emplace_back(buffer);
     buffer_wait_event.writable->Signal();
@@ -35,16 +36,37 @@ void BufferQueue::SetPreallocatedBuffer(u32 slot, const IGBPBuffer& igbp_buffer)
 
 std::optional<std::pair<u32, Service::Nvidia::MultiFence*>> BufferQueue::DequeueBuffer(u32 width,
                                                                                        u32 height) {
-    auto itr = std::find_if(queue.begin(), queue.end(), [&](const Buffer& buffer) {
-        // Only consider free buffers. Buffers become free once again after they've been Acquired
-        // and Released by the compositor, see the NVFlinger::Compose method.
-        if (buffer.status != Buffer::Status::Free) {
-            return false;
-        }
 
-        // Make sure that the parameters match.
-        return buffer.igbp_buffer.width == width && buffer.igbp_buffer.height == height;
-    });
+    if (free_buffers.empty()) {
+        return {};
+    }
+
+    auto f_itr = free_buffers.begin();
+    auto itr = queue.end();
+
+    while (f_itr != free_buffers.end()) {
+        auto slot = *f_itr;
+        itr = std::find_if(queue.begin(), queue.end(), [&](const Buffer& buffer) {
+            // Only consider free buffers. Buffers become free once again after they've been
+            // Acquired and Released by the compositor, see the NVFlinger::Compose method.
+            if (buffer.status != Buffer::Status::Free) {
+                return false;
+            }
+
+            if (buffer.slot != slot) {
+                return false;
+            }
+
+            // Make sure that the parameters match.
+            return buffer.igbp_buffer.width == width && buffer.igbp_buffer.height == height;
+        });
+
+        if (itr != queue.end()) {
+            free_buffers.erase(f_itr);
+            break;
+        }
+        ++f_itr;
+    }
 
     if (itr == queue.end()) {
         return {};
@@ -99,10 +121,22 @@ void BufferQueue::ReleaseBuffer(u32 slot) {
     ASSERT(itr != queue.end());
     ASSERT(itr->status == Buffer::Status::Acquired);
     itr->status = Buffer::Status::Free;
+    free_buffers.push_back(slot);
 
     buffer_wait_event.writable->Signal();
 }
 
+/// Drops every buffer and queued sequence entry, then sets id and layer_id back to 1.
+void BufferQueue::Disconnect() {
+    // The slots listed as free refer to buffers in `queue`, which is emptied below. Forget them
+    // too, otherwise they linger and are listed twice once the slots are preallocated again.
+    free_buffers.clear();
+    queue.clear();
+    queue_sequence.clear();
+    id = 1;
+    layer_id = 1;
+}
+
 u32 BufferQueue::Query(QueryType type) {
     LOG_WARNING(Service, "(STUBBED) called type={}", static_cast<u32>(type));
 
diff --git a/src/core/hle/service/nvflinger/buffer_queue.h b/src/core/hle/service/nvflinger/buffer_queue.h
index f4bbfd945..d5f31e567 100644
--- a/src/core/hle/service/nvflinger/buffer_queue.h
+++ b/src/core/hle/service/nvflinger/buffer_queue.h
@@ -87,6 +87,7 @@ public:
                      Service::Nvidia::MultiFence& multi_fence);
     std::optional<std::reference_wrapper<const Buffer>> AcquireBuffer();
     void ReleaseBuffer(u32 slot);
+    void Disconnect();
     u32 Query(QueryType type);
 
     u32 GetId() const {
@@ -101,6 +102,7 @@ private:
     u32 id;
     u64 layer_id;
 
+    std::list<u32> free_buffers;
     std::vector<Buffer> queue;
     std::list<u32> queue_sequence;
     Kernel::EventPair buffer_wait_event;
diff --git a/src/core/hle/service/vi/vi.cpp b/src/core/hle/service/vi/vi.cpp
index 519da74e0..fdc62d05b 100644
--- a/src/core/hle/service/vi/vi.cpp
+++ b/src/core/hle/service/vi/vi.cpp
@@ -513,7 +513,8 @@ private:
 
         auto& buffer_queue = nv_flinger->FindBufferQueue(id);
 
-        if (transaction == TransactionId::Connect) {
+        switch (transaction) {
+        case TransactionId::Connect: {
             IGBPConnectRequestParcel request{ctx.ReadBuffer()};
             IGBPConnectResponseParcel response{
                 static_cast<u32>(static_cast<u32>(DisplayResolution::UndockedWidth) *
@@ -521,14 +522,18 @@ private:
                 static_cast<u32>(static_cast<u32>(DisplayResolution::UndockedHeight) *
                                  Settings::values.resolution_factor)};
             ctx.WriteBuffer(response.Serialize());
-        } else if (transaction == TransactionId::SetPreallocatedBuffer) {
+            break;
+        }
+        case TransactionId::SetPreallocatedBuffer: {
             IGBPSetPreallocatedBufferRequestParcel request{ctx.ReadBuffer()};
 
             buffer_queue.SetPreallocatedBuffer(request.data.slot, request.buffer);
 
             IGBPSetPreallocatedBufferResponseParcel response{};
             ctx.WriteBuffer(response.Serialize());
-        } else if (transaction == TransactionId::DequeueBuffer) {
+            break;
+        }
+        case TransactionId::DequeueBuffer: {
             IGBPDequeueBufferRequestParcel request{ctx.ReadBuffer()};
             const u32 width{request.data.width};
             const u32 height{request.data.height};
@@ -556,14 +561,18 @@ private:
                     },
                     buffer_queue.GetWritableBufferWaitEvent());
             }
-        } else if (transaction == TransactionId::RequestBuffer) {
+            break;
+        }
+        case TransactionId::RequestBuffer: {
             IGBPRequestBufferRequestParcel request{ctx.ReadBuffer()};
 
             auto& buffer = buffer_queue.RequestBuffer(request.slot);
 
             IGBPRequestBufferResponseParcel response{buffer};
             ctx.WriteBuffer(response.Serialize());
-        } else if (transaction == TransactionId::QueueBuffer) {
+            break;
+        }
+        case TransactionId::QueueBuffer: {
             IGBPQueueBufferRequestParcel request{ctx.ReadBuffer()};
 
             buffer_queue.QueueBuffer(request.data.slot, request.data.transform,
@@ -572,7 +581,9 @@ private:
 
             IGBPQueueBufferResponseParcel response{1280, 720};
             ctx.WriteBuffer(response.Serialize());
-        } else if (transaction == TransactionId::Query) {
+            break;
+        }
+        case TransactionId::Query: {
             IGBPQueryRequestParcel request{ctx.ReadBuffer()};
 
             const u32 value =
@@ -580,15 +591,30 @@ private:
 
             IGBPQueryResponseParcel response{value};
             ctx.WriteBuffer(response.Serialize());
-        } else if (transaction == TransactionId::CancelBuffer) {
+            break;
+        }
+        case TransactionId::CancelBuffer: {
             LOG_CRITICAL(Service_VI, "(STUBBED) called, transaction=CancelBuffer");
-        } else if (transaction == TransactionId::Disconnect ||
-                   transaction == TransactionId::DetachBuffer) {
+            break;
+        }
+        case TransactionId::Disconnect: {
+            LOG_WARNING(Service_VI, "(STUBBED) called, transaction=Disconnect");
+            const auto buffer = ctx.ReadBuffer();
+
+            buffer_queue.Disconnect();
+
+            IGBPEmptyResponseParcel response{};
+            ctx.WriteBuffer(response.Serialize());
+            break;
+        }
+        case TransactionId::DetachBuffer: {
             const auto buffer = ctx.ReadBuffer();
 
             IGBPEmptyResponseParcel response{};
             ctx.WriteBuffer(response.Serialize());
-        } else {
+            break;
+        }
+        default:
             ASSERT_MSG(false, "Unimplemented");
         }
 
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index d24c9f657..4637ddabd 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -312,6 +312,35 @@ public:
             }
         };
 
+        struct MsaaSampleLocation { // Four {x, y} sample positions packed into one 32-bit word
+            union {
+                BitField<0, 4, u32> x0;
+                BitField<4, 4, u32> y0;
+                BitField<8, 4, u32> x1;
+                BitField<12, 4, u32> y1;
+                BitField<16, 4, u32> x2;
+                BitField<20, 4, u32> y2;
+                BitField<24, 4, u32> x3;
+                BitField<28, 4, u32> y3;
+            };
+            // Returns the {x, y} fields of sample `index`; only indices 0 to 3 have fields
+            constexpr std::pair<u32, u32> Location(int index) const {
+                switch (index) {
+                case 0:
+                    return {x0, y0};
+                case 1:
+                    return {x1, y1};
+                case 2:
+                    return {x2, y2};
+                case 3:
+                    return {x3, y3};
+                default:
+                    UNREACHABLE(); // Any other index is a caller error
+                    return {0, 0};
+                }
+            }
+        };
+
         enum class DepthMode : u32 {
             MinusOneToOne = 0,
             ZeroToOne = 1,
@@ -793,7 +822,13 @@ public:
 
                 u32 rt_separate_frag_data;
 
-                INSERT_UNION_PADDING_WORDS(0xC);
+                INSERT_UNION_PADDING_WORDS(0x1);
+
+                u32 multisample_raster_enable;
+                u32 multisample_raster_samples;
+                std::array<u32, 4> multisample_sample_mask;
+
+                INSERT_UNION_PADDING_WORDS(0x5);
 
                 struct {
                     u32 address_high;
@@ -830,7 +865,16 @@ public:
 
                 std::array<VertexAttribute, NumVertexAttributes> vertex_attrib_format;
 
-                INSERT_UNION_PADDING_WORDS(0xF);
+                std::array<MsaaSampleLocation, 4> multisample_sample_locations;
+
+                INSERT_UNION_PADDING_WORDS(0x2);
+
+                union {
+                    BitField<0, 1, u32> enable;
+                    BitField<4, 3, u32> target;
+                } multisample_coverage_to_color;
+
+                INSERT_UNION_PADDING_WORDS(0x8);
 
                 struct {
                     union {
@@ -943,7 +987,7 @@ public:
 
                 CounterReset counter_reset;
 
-                INSERT_UNION_PADDING_WORDS(0x1);
+                u32 multisample_enable;
 
                 u32 zeta_enable;
 
@@ -1007,7 +1051,11 @@ public:
 
                 float polygon_offset_units;
 
-                INSERT_UNION_PADDING_WORDS(0x11);
+                INSERT_UNION_PADDING_WORDS(0x4);
+
+                Tegra::Texture::MsaaMode multisample_mode;
+
+                INSERT_UNION_PADDING_WORDS(0xC);
 
                 union {
                     BitField<2, 1, u32> coord_origin;
@@ -1507,12 +1555,17 @@ ASSERT_REG_POSITION(stencil_back_func_ref, 0x3D5);
 ASSERT_REG_POSITION(stencil_back_mask, 0x3D6);
 ASSERT_REG_POSITION(stencil_back_func_mask, 0x3D7);
 ASSERT_REG_POSITION(color_mask_common, 0x3E4);
-ASSERT_REG_POSITION(rt_separate_frag_data, 0x3EB);
 ASSERT_REG_POSITION(depth_bounds, 0x3E7);
+ASSERT_REG_POSITION(rt_separate_frag_data, 0x3EB);
+ASSERT_REG_POSITION(multisample_raster_enable, 0x3ED);
+ASSERT_REG_POSITION(multisample_raster_samples, 0x3EE);
+ASSERT_REG_POSITION(multisample_sample_mask, 0x3EF);
 ASSERT_REG_POSITION(zeta, 0x3F8);
 ASSERT_REG_POSITION(clear_flags, 0x43E);
 ASSERT_REG_POSITION(fill_rectangle, 0x44F);
 ASSERT_REG_POSITION(vertex_attrib_format, 0x458);
+ASSERT_REG_POSITION(multisample_sample_locations, 0x478);
+ASSERT_REG_POSITION(multisample_coverage_to_color, 0x47E);
 ASSERT_REG_POSITION(rt_control, 0x487);
 ASSERT_REG_POSITION(zeta_width, 0x48a);
 ASSERT_REG_POSITION(zeta_height, 0x48b);
@@ -1545,11 +1598,12 @@ ASSERT_REG_POSITION(samplecnt_enable, 0x545);
 ASSERT_REG_POSITION(point_size, 0x546);
 ASSERT_REG_POSITION(point_sprite_enable, 0x548);
 ASSERT_REG_POSITION(counter_reset, 0x54C);
+ASSERT_REG_POSITION(multisample_enable, 0x54D);
 ASSERT_REG_POSITION(zeta_enable, 0x54E);
 ASSERT_REG_POSITION(multisample_control, 0x54F);
 ASSERT_REG_POSITION(condition, 0x554);
 ASSERT_REG_POSITION(tsc, 0x557);
-ASSERT_REG_POSITION(polygon_offset_factor, 0x55b);
+ASSERT_REG_POSITION(polygon_offset_factor, 0x55B);
 ASSERT_REG_POSITION(tic, 0x55D);
 ASSERT_REG_POSITION(stencil_two_side_enable, 0x565);
 ASSERT_REG_POSITION(stencil_back_op_fail, 0x566);
@@ -1558,6 +1612,7 @@ ASSERT_REG_POSITION(stencil_back_op_zpass, 0x568);
 ASSERT_REG_POSITION(stencil_back_func_func, 0x569);
 ASSERT_REG_POSITION(framebuffer_srgb, 0x56E);
 ASSERT_REG_POSITION(polygon_offset_units, 0x56F);
+ASSERT_REG_POSITION(multisample_mode, 0x574);
 ASSERT_REG_POSITION(point_coord_replace, 0x581);
 ASSERT_REG_POSITION(code_address, 0x582);
 ASSERT_REG_POSITION(draw, 0x585);
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index 498936f0c..c66c66f6c 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -290,6 +290,23 @@ enum class VmadShr : u64 {
     Shr15 = 2,
 };
 
+enum class VmnmxType : u64 { // Operand widths produced by the vmnmx SourceFormatA/B() decoders
+    Bits8,
+    Bits16,
+    Bits32,
+};
+
+enum class VmnmxOperation : u64 { // Values of the 3-bit vmnmx `operation` field
+    Mrg_16H = 0, // DecodeVMNMX inserts the result into the third operand: offset 16, 16 bits
+    Mrg_16L = 1, // DecodeVMNMX inserts the result into the third operand: offset 0, 16 bits
+    Mrg_8B0 = 2, // DecodeVMNMX inserts the result into the third operand: offset 0, 8 bits
+    Mrg_8B2 = 3, // DecodeVMNMX inserts the result into the third operand: offset 16, 8 bits
+    Acc = 4, // DecodeVMNMX adds the third operand to the result
+    Min = 5, // DecodeVMNMX takes the minimum of the result and the third operand
+    Max = 6, // DecodeVMNMX takes the maximum of the result and the third operand
+    Nop = 7, // DecodeVMNMX leaves the result as it is
+};
+
 enum class XmadMode : u64 {
     None = 0,
     CLo = 1,
@@ -1651,6 +1668,42 @@ union Instruction {
     } vmad;
 
     union {
+        BitField<54, 1, u64> is_dest_signed;
+        BitField<48, 1, u64> is_src_a_signed;
+        BitField<49, 1, u64> is_src_b_signed;
+        BitField<37, 2, u64> src_format_a; // Decoded by SourceFormatA()
+        BitField<29, 2, u64> src_format_b; // Decoded by SourceFormatB()
+        BitField<56, 1, u64> mx; // DecodeVMNMX picks IMax when set, IMin otherwise
+        BitField<55, 1, u64> sat;
+        BitField<36, 2, u64> selector_a; // NOTE(review): shares bit 37 with src_format_a, confirm
+        BitField<28, 2, u64> selector_b; // NOTE(review): shares bit 29 with src_format_b, confirm
+        BitField<50, 1, u64> is_op_b_register;
+        BitField<51, 3, VmnmxOperation> operation;
+
+        VmnmxType SourceFormatA() const { // Width of operand A taken from src_format_a
+            switch (src_format_a) {
+            case 0b11:
+                return VmnmxType::Bits32;
+            case 0b10:
+                return VmnmxType::Bits16;
+            default: // 0b00 and 0b01
+                return VmnmxType::Bits8;
+            }
+        }
+
+        VmnmxType SourceFormatB() const { // Width of operand B taken from src_format_b
+            switch (src_format_b) {
+            case 0b11:
+                return VmnmxType::Bits32;
+            case 0b10:
+                return VmnmxType::Bits16;
+            default: // 0b00 and 0b01
+                return VmnmxType::Bits8;
+            }
+        }
+    } vmnmx;
+
+    union {
         BitField<20, 16, u64> imm20_16;
         BitField<35, 1, u64> high_b_rr; // used on RR
         BitField<36, 1, u64> product_shift_left;
@@ -1763,6 +1816,7 @@ public:
         MEMBAR,
         VMAD,
         VSETP,
+        VMNMX,
         FFMA_IMM, // Fused Multiply and Add
         FFMA_CR,
         FFMA_RC,
@@ -2070,6 +2124,7 @@ private:
             INST("1110111110011---", Id::MEMBAR, Type::Trivial, "MEMBAR"),
             INST("01011111--------", Id::VMAD, Type::Video, "VMAD"),
             INST("0101000011110---", Id::VSETP, Type::Video, "VSETP"),
+            INST("0011101---------", Id::VMNMX, Type::Video, "VMNMX"),
             INST("0011001-1-------", Id::FFMA_IMM, Type::Ffma, "FFMA_IMM"),
             INST("010010011-------", Id::FFMA_CR, Type::Ffma, "FFMA_CR"),
             INST("010100011-------", Id::FFMA_RC, Type::Ffma, "FFMA_RC"),
@@ -2170,7 +2225,7 @@ private:
             INST("0011011-11111---", Id::SHF_LEFT_IMM, Type::Shift, "SHF_LEFT_IMM"),
             INST("0100110011100---", Id::I2I_C, Type::Conversion, "I2I_C"),
             INST("0101110011100---", Id::I2I_R, Type::Conversion, "I2I_R"),
-            INST("0011101-11100---", Id::I2I_IMM, Type::Conversion, "I2I_IMM"),
+            INST("0011100-11100---", Id::I2I_IMM, Type::Conversion, "I2I_IMM"),
             INST("0100110010111---", Id::I2F_C, Type::Conversion, "I2F_C"),
             INST("0101110010111---", Id::I2F_R, Type::Conversion, "I2F_R"),
             INST("0011100-10111---", Id::I2F_IMM, Type::Conversion, "I2F_IMM"),
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 160ae4340..1f1f01313 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -1819,15 +1819,18 @@ private:
     }
 
     Expression HMergeH0(Operation operation) {
-        std::string dest = VisitOperand(operation, 0).AsUint();
-        std::string src = VisitOperand(operation, 1).AsUint();
-        return {fmt::format("(({} & 0x0000FFFFU) | ({} & 0xFFFF0000U))", src, dest), Type::Uint};
+        // Replaces the lower 16 bits of dest with the lower 16 bits of src
+        const std::string dest = VisitOperand(operation, 0).AsUint();
+        const std::string src = VisitOperand(operation, 1).AsUint();
+        return {fmt::format("bitfieldInsert({}, {}, 0, 16)", dest, src), Type::Uint};
     }
 
     Expression HMergeH1(Operation operation) {
-        std::string dest = VisitOperand(operation, 0).AsUint();
-        std::string src = VisitOperand(operation, 1).AsUint();
-        return {fmt::format("(({} & 0x0000FFFFU) | ({} & 0xFFFF0000U))", dest, src), Type::Uint};
+        // Replaces the upper 16 bits of dest with the upper 16 bits of src. bitfieldInsert
+        // reads the lowest bits of its insert operand, so src is shifted down by 16 first.
+        const std::string dest = VisitOperand(operation, 0).AsUint();
+        const std::string src = VisitOperand(operation, 1).AsUint();
+        return {fmt::format("bitfieldInsert({}, ({}) >> 16, 16, 16)", dest, src), Type::Uint};
     }
 
     Expression HPack2(Operation operation) {
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index 36590a6d0..0b4d999d7 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -411,14 +411,13 @@ CachedSurfaceView::~CachedSurfaceView() = default;
 void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {
     ASSERT(params.num_levels == 1);
 
-    const GLuint texture = surface.GetTexture();
     if (params.num_layers > 1) {
         // Layered framebuffer attachments
         UNIMPLEMENTED_IF(params.base_layer != 0);
 
         switch (params.target) {
         case SurfaceTarget::Texture2DArray:
-            glFramebufferTexture(target, attachment, texture, params.base_level);
+            glFramebufferTexture(target, attachment, GetTexture(), params.base_level);
             break;
         default:
             UNIMPLEMENTED();
@@ -427,6 +426,7 @@ void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {
     }
 
     const GLenum view_target = surface.GetTarget();
+    const GLuint texture = surface.GetTexture();
     switch (surface.GetSurfaceParams().target) {
     case SurfaceTarget::Texture1D:
         glFramebufferTexture1D(target, attachment, view_target, texture, params.base_level);
diff --git a/src/video_core/shader/decode/conversion.cpp b/src/video_core/shader/decode/conversion.cpp
index c72690b2b..b9989c88c 100644
--- a/src/video_core/shader/decode/conversion.cpp
+++ b/src/video_core/shader/decode/conversion.cpp
@@ -2,6 +2,10 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <limits>
+#include <optional>
+#include <utility>
+
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "video_core/engines/shader_bytecode.h"
@@ -15,9 +19,49 @@ using Tegra::Shader::OpCode;
 using Tegra::Shader::Register;
 
 namespace {
+
 constexpr OperationCode GetFloatSelector(u64 selector) {
     return selector == 0 ? OperationCode::FCastHalf0 : OperationCode::FCastHalf1;
 }
+
+/// Width in bits of a register size; any other value yields zero.
+constexpr u32 SizeInBits(Register::Size size) {
+    if (size == Register::Size::Byte) {
+        return 8;
+    }
+    if (size == Register::Size::Short) {
+        return 16;
+    }
+    if (size == Register::Size::Word) {
+        return 32;
+    }
+    return size == Register::Size::Long ? 64 : 0;
+}
+
+/// Bounds to clamp a saturating integer conversion to, nullopt when no clamp is required.
+constexpr std::optional<std::pair<s32, s32>> IntegerSaturateBounds(Register::Size src_size,
+                                                                   Register::Size dst_size,
+                                                                   bool src_signed,
+                                                                   bool dst_signed) {
+    using Limits = std::numeric_limits<s32>;
+    if (src_size == Register::Size::Word && dst_size == Register::Size::Word) {
+        if (src_signed == dst_signed) {
+            return std::nullopt;
+        }
+        return std::make_pair(0, Limits::max());
+    }
+    const u32 dst_bits = SizeInBits(dst_size);
+    if (dst_bits >= 32) {
+        // The shifts below overflow s32 from 31 bits on; -1 is the unsigned maximum bit pattern
+        return std::make_pair(dst_signed ? Limits::min() : 0, dst_signed ? Limits::max() : -1);
+    }
+    if (dst_signed) {
+        // Signed destination, clamp to [-128, 127] for instance
+        return std::make_pair(-(1 << (dst_bits - 1)), (1 << (dst_bits - 1)) - 1);
+    }
+    return std::make_pair(0, (1 << dst_bits) - 1);
+}
+
 } // Anonymous namespace
 
 u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
@@ -28,14 +72,13 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
     case OpCode::Id::I2I_R:
     case OpCode::Id::I2I_C:
     case OpCode::Id::I2I_IMM: {
-        UNIMPLEMENTED_IF(instr.conversion.int_src.selector != 0);
-        UNIMPLEMENTED_IF(instr.conversion.dst_size != Register::Size::Word);
-        UNIMPLEMENTED_IF(instr.alu.saturate_d);
+        const bool src_signed = instr.conversion.is_input_signed;
+        const bool dst_signed = instr.conversion.is_output_signed;
+        const Register::Size src_size = instr.conversion.src_size;
+        const Register::Size dst_size = instr.conversion.dst_size;
+        const u32 selector = static_cast<u32>(instr.conversion.int_src.selector);
 
-        const bool input_signed = instr.conversion.is_input_signed;
-        const bool output_signed = instr.conversion.is_output_signed;
-
-        Node value = [&]() {
+        Node value = [this, instr, opcode] {
             switch (opcode->get().GetId()) {
             case OpCode::Id::I2I_R:
                 return GetRegister(instr.gpr20);
@@ -48,16 +91,60 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
                 return Immediate(0);
             }
         }();
-        value = ConvertIntegerSize(value, instr.conversion.src_size, input_signed);
 
-        value = GetOperandAbsNegInteger(value, instr.conversion.abs_a, instr.conversion.negate_a,
-                                        input_signed);
-        if (input_signed != output_signed) {
-            value = SignedOperation(OperationCode::ICastUnsigned, output_signed, NO_PRECISE, value);
+        // Ensure the source selector is valid
+        switch (instr.conversion.src_size) {
+        case Register::Size::Byte:
+            break;
+        case Register::Size::Short:
+            ASSERT(selector == 0 || selector == 2);
+            break;
+        default:
+            ASSERT(selector == 0);
+            break;
+        }
+
+        if (src_size != Register::Size::Word || selector != 0) {
+            value = SignedOperation(OperationCode::IBitfieldExtract, src_signed, std::move(value),
+                                    Immediate(selector * 8), Immediate(SizeInBits(src_size)));
+        }
+
+        value = GetOperandAbsNegInteger(std::move(value), instr.conversion.abs_a,
+                                        instr.conversion.negate_a, src_signed);
+
+        if (instr.alu.saturate_d) {
+            if (src_signed && !dst_signed) {
+                Node is_negative = Operation(OperationCode::LogicalUGreaterEqual, value,
+                                             Immediate(1 << (SizeInBits(src_size) - 1)));
+                value = Operation(OperationCode::Select, std::move(is_negative), Immediate(0),
+                                  std::move(value));
+
+                // Simplify generated expressions, this can be removed without semantic impact
+                SetTemporary(bb, 0, std::move(value));
+                value = GetTemporary(0);
+
+                if (dst_size != Register::Size::Word) {
+                    const Node limit = Immediate((1 << SizeInBits(dst_size)) - 1);
+                    Node is_large =
+                        Operation(OperationCode::LogicalUGreaterThan, std::move(value), limit);
+                    value = Operation(OperationCode::Select, std::move(is_large), limit,
+                                      std::move(value));
+                }
+            } else if (const std::optional bounds =
+                           IntegerSaturateBounds(src_size, dst_size, src_signed, dst_signed)) {
+                value = SignedOperation(OperationCode::IMax, src_signed, std::move(value),
+                                        Immediate(bounds->first));
+                value = SignedOperation(OperationCode::IMin, src_signed, std::move(value),
+                                        Immediate(bounds->second));
+            }
+        } else if (dst_size != Register::Size::Word) {
+            // No saturation, we only have to mask the result
+            Node mask = Immediate((1 << SizeInBits(dst_size)) - 1);
+            value = Operation(OperationCode::UBitwiseAnd, std::move(value), std::move(mask));
         }
 
         SetInternalFlagsFromInteger(bb, value, instr.generates_cc);
-        SetRegister(bb, instr.gpr0, value);
+        SetRegister(bb, instr.gpr0, std::move(value));
         break;
     }
     case OpCode::Id::I2F_R:
diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp
index 48350e042..6c4a1358b 100644
--- a/src/video_core/shader/decode/texture.cpp
+++ b/src/video_core/shader/decode/texture.cpp
@@ -780,20 +780,6 @@ Node4 ShaderIR::GetTldsCode(Instruction instr, TextureType texture_type, bool is
     // When lod is used always is in gpr20
     const Node lod = lod_enabled ? GetRegister(instr.gpr20) : Immediate(0);
 
-    // Fill empty entries from the guest sampler
-    const std::size_t entry_coord_count = GetCoordCount(sampler.GetType());
-    if (type_coord_count != entry_coord_count) {
-        LOG_WARNING(HW_GPU, "Bound and built texture types mismatch");
-
-        // When the size is higher we insert zeroes
-        for (std::size_t i = type_coord_count; i < entry_coord_count; ++i) {
-            coords.push_back(GetRegister(Register::ZeroIndex));
-        }
-
-        // Then we ensure the size matches the number of entries (dropping unused values)
-        coords.resize(entry_coord_count);
-    }
-
     Node4 values;
     for (u32 element = 0; element < values.size(); ++element) {
         auto coords_copy = coords;
diff --git a/src/video_core/shader/decode/video.cpp b/src/video_core/shader/decode/video.cpp
index b047cf870..64ba60ea2 100644
--- a/src/video_core/shader/decode/video.cpp
+++ b/src/video_core/shader/decode/video.cpp
@@ -10,16 +10,24 @@
 
 namespace VideoCommon::Shader {
 
+using std::move;
 using Tegra::Shader::Instruction;
 using Tegra::Shader::OpCode;
 using Tegra::Shader::Pred;
 using Tegra::Shader::VideoType;
 using Tegra::Shader::VmadShr;
+using Tegra::Shader::VmnmxOperation;
+using Tegra::Shader::VmnmxType;
 
 u32 ShaderIR::DecodeVideo(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
     const auto opcode = OpCode::Decode(instr);
 
+    if (opcode->get().GetId() == OpCode::Id::VMNMX) {
+        DecodeVMNMX(bb, instr);
+        return pc;
+    }
+
     const Node op_a =
         GetVideoOperand(GetRegister(instr.gpr8), instr.video.is_byte_chunk_a, instr.video.signed_a,
                         instr.video.type_a, instr.video.byte_height_a);
@@ -109,4 +117,54 @@ Node ShaderIR::GetVideoOperand(Node op, bool is_chunk, bool is_signed,
     }
 }
 
+void ShaderIR::DecodeVMNMX(NodeBlock& bb, Tegra::Shader::Instruction instr) {
+    // Encodings outside of what is decoded below are flagged as unimplemented
+    UNIMPLEMENTED_IF(!instr.vmnmx.is_op_b_register);
+    UNIMPLEMENTED_IF(instr.vmnmx.SourceFormatA() != VmnmxType::Bits32);
+    UNIMPLEMENTED_IF(instr.vmnmx.SourceFormatB() != VmnmxType::Bits32);
+    UNIMPLEMENTED_IF(instr.vmnmx.is_src_a_signed != instr.vmnmx.is_src_b_signed);
+    UNIMPLEMENTED_IF(instr.vmnmx.sat);
+    UNIMPLEMENTED_IF(instr.generates_cc);
+    const bool first_signed = instr.vmnmx.is_src_a_signed; // Stubbed
+    const bool second_signed = instr.vmnmx.is_dest_signed;
+
+    Node src_a = GetRegister(instr.gpr8);
+    Node src_b = GetRegister(instr.gpr20);
+    Node src_c = GetRegister(instr.gpr39);
+
+    // First stage: IMax of both sources when mx is set, IMin otherwise
+    const OperationCode first_op = instr.vmnmx.mx ? OperationCode::IMax : OperationCode::IMin;
+    Node result = SignedOperation(first_op, first_signed, move(src_a), move(src_b));
+    // Second stage: fold the third source in as the operation field dictates
+    switch (instr.vmnmx.operation) {
+    case VmnmxOperation::Mrg_16H:
+        result = BitfieldInsert(move(src_c), move(result), 16, 16);
+        break;
+    case VmnmxOperation::Mrg_16L:
+        result = BitfieldInsert(move(src_c), move(result), 0, 16);
+        break;
+    case VmnmxOperation::Mrg_8B0:
+        result = BitfieldInsert(move(src_c), move(result), 0, 8);
+        break;
+    case VmnmxOperation::Mrg_8B2:
+        result = BitfieldInsert(move(src_c), move(result), 16, 8);
+        break;
+    case VmnmxOperation::Acc:
+        result = Operation(OperationCode::IAdd, move(result), move(src_c));
+        break;
+    case VmnmxOperation::Min:
+        result = SignedOperation(OperationCode::IMin, second_signed, move(result), move(src_c));
+        break;
+    case VmnmxOperation::Max:
+        result = SignedOperation(OperationCode::IMax, second_signed, move(result), move(src_c));
+        break;
+    case VmnmxOperation::Nop:
+        break;
+    default:
+        UNREACHABLE();
+        break;
+    }
+    SetRegister(bb, instr.gpr0, move(result));
+}
+
 } // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index ca6c976c9..c6e7bdf50 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -354,6 +354,9 @@ private:
     /// Marks the usage of a input or output attribute.
     void MarkAttributeUsage(Tegra::Shader::Attribute::Index index, u64 element);
 
+    /// Decodes VMNMX instruction and inserts its code into the passed basic block.
+    void DecodeVMNMX(NodeBlock& bb, Tegra::Shader::Instruction instr);
+
     void WriteTexInstructionFloat(NodeBlock& bb, Tegra::Shader::Instruction instr,
                                   const Node4& components);
 
diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp
index 062b4f252..365bde2f1 100644
--- a/src/video_core/textures/astc.cpp
+++ b/src/video_core/textures/astc.cpp
@@ -20,6 +20,8 @@
 #include <cstring>
 #include <vector>
 
+#include <boost/container/static_vector.hpp>
+
 #include "common/common_types.h"
 
 #include "video_core/textures/astc.h"
@@ -39,25 +41,25 @@ constexpr u32 Popcnt(u32 n) {
 
 class InputBitStream {
 public:
-    explicit InputBitStream(const u8* ptr, std::size_t start_offset = 0)
-        : m_CurByte(ptr), m_NextBit(start_offset % 8) {}
+    constexpr explicit InputBitStream(const u8* ptr, std::size_t start_offset = 0)
+        : cur_byte{ptr}, next_bit{start_offset % 8} {}
 
-    std::size_t GetBitsRead() const {
-        return m_BitsRead;
+    constexpr std::size_t GetBitsRead() const {
+        return bits_read;
     }
 
-    u32 ReadBit() {
-        u32 bit = *m_CurByte >> m_NextBit++;
-        while (m_NextBit >= 8) {
-            m_NextBit -= 8;
-            m_CurByte++;
+    constexpr bool ReadBit() {
+        const bool bit = (*cur_byte >> next_bit++) & 1;
+        while (next_bit >= 8) {
+            next_bit -= 8;
+            cur_byte++;
         }
 
-        m_BitsRead++;
-        return bit & 1;
+        bits_read++;
+        return bit;
     }
 
-    u32 ReadBits(std::size_t nBits) {
+    constexpr u32 ReadBits(std::size_t nBits) {
         u32 ret = 0;
         for (std::size_t i = 0; i < nBits; ++i) {
             ret |= (ReadBit() & 1) << i;
@@ -66,7 +68,7 @@ public:
     }
 
     template <std::size_t nBits>
-    u32 ReadBits() {
+    constexpr u32 ReadBits() {
         u32 ret = 0;
         for (std::size_t i = 0; i < nBits; ++i) {
             ret |= (ReadBit() & 1) << i;
@@ -75,64 +77,61 @@ public:
     }
 
 private:
-    const u8* m_CurByte;
-    std::size_t m_NextBit = 0;
-    std::size_t m_BitsRead = 0;
+    const u8* cur_byte;
+    std::size_t next_bit = 0;
+    std::size_t bits_read = 0;
 };
 
 class OutputBitStream {
 public:
-    explicit OutputBitStream(u8* ptr, s32 nBits = 0, s32 start_offset = 0)
-        : m_NumBits(nBits), m_CurByte(ptr), m_NextBit(start_offset % 8) {}
-
-    ~OutputBitStream() = default;
+    constexpr explicit OutputBitStream(u8* ptr, std::size_t bits = 0, std::size_t start_offset = 0)
+        : cur_byte{ptr}, num_bits{bits}, next_bit{start_offset % 8} {}
 
-    s32 GetBitsWritten() const {
-        return m_BitsWritten;
+    constexpr std::size_t GetBitsWritten() const {
+        return bits_written;
     }
 
-    void WriteBitsR(u32 val, u32 nBits) {
+    constexpr void WriteBitsR(u32 val, u32 nBits) {
         for (u32 i = 0; i < nBits; i++) {
             WriteBit((val >> (nBits - i - 1)) & 1);
         }
     }
 
-    void WriteBits(u32 val, u32 nBits) {
+    constexpr void WriteBits(u32 val, u32 nBits) {
         for (u32 i = 0; i < nBits; i++) {
             WriteBit((val >> i) & 1);
         }
     }
 
 private:
-    void WriteBit(s32 b) {
-
-        if (done)
+    constexpr void WriteBit(bool b) {
+        if (bits_written >= num_bits) {
             return;
+        }
 
-        const u32 mask = 1 << m_NextBit++;
+        const u32 mask = 1 << next_bit++;
 
         // clear the bit
-        *m_CurByte &= static_cast<u8>(~mask);
+        *cur_byte &= static_cast<u8>(~mask);
 
         // Write the bit, if necessary
         if (b)
-            *m_CurByte |= static_cast<u8>(mask);
+            *cur_byte |= static_cast<u8>(mask);
 
         // Next byte?
-        if (m_NextBit >= 8) {
-            m_CurByte += 1;
-            m_NextBit = 0;
+        if (next_bit >= 8) {
+            cur_byte += 1;
+            next_bit = 0;
         }
-
-        done = done || ++m_BitsWritten >= m_NumBits;
+
+        // Count the bit so the num_bits guard above and GetBitsWritten() stay accurate.
+        ++bits_written;
     }
 
-    s32 m_BitsWritten = 0;
-    const s32 m_NumBits;
-    u8* m_CurByte;
-    s32 m_NextBit = 0;
-
-    bool done = false;
+    u8* cur_byte;
+    std::size_t num_bits;
+    std::size_t bits_written = 0;
+    std::size_t next_bit = 0;
 };
 
 template <typename IntType>
@@ -195,9 +191,13 @@ struct IntegerEncodedValue {
         u32 trit_value;
     };
 };
+using IntegerEncodedVector = boost::container::static_vector<
+    IntegerEncodedValue, 64,
+    boost::container::static_vector_options<
+        boost::container::inplace_alignment<alignof(IntegerEncodedValue)>,
+        boost::container::throw_on_overflow<false>>::type>;
 
-static void DecodeTritBlock(InputBitStream& bits, std::vector<IntegerEncodedValue>& result,
-                            u32 nBitsPerValue) {
+static void DecodeTritBlock(InputBitStream& bits, IntegerEncodedVector& result, u32 nBitsPerValue) {
     // Implement the algorithm in section C.2.12
     u32 m[5];
     u32 t[5];
@@ -255,7 +255,7 @@ static void DecodeTritBlock(InputBitStream& bits, std::vector<IntegerEncodedValu
     }
 }
 
-static void DecodeQus32Block(InputBitStream& bits, std::vector<IntegerEncodedValue>& result,
+static void DecodeQus32Block(InputBitStream& bits, IntegerEncodedVector& result,
                              u32 nBitsPerValue) {
     // Implement the algorithm in section C.2.12
     u32 m[3];
@@ -343,8 +343,8 @@ static constexpr std::array EncodingsValues = MakeEncodedValues();
 // Fills result with the values that are encoded in the given
 // bitstream. We must know beforehand what the maximum possible
 // value is, and how many values we're decoding.
-static void DecodeIntegerSequence(std::vector<IntegerEncodedValue>& result, InputBitStream& bits,
-                                  u32 maxRange, u32 nValues) {
+static void DecodeIntegerSequence(IntegerEncodedVector& result, InputBitStream& bits, u32 maxRange,
+                                  u32 nValues) {
     // Determine encoding parameters
     IntegerEncodedValue val = EncodingsValues[maxRange];
 
@@ -634,12 +634,14 @@ static void FillError(u32* outBuf, u32 blockWidth, u32 blockHeight) {
 // Replicates low numBits such that [(toBit - 1):(toBit - 1 - fromBit)]
 // is the same as [(numBits - 1):0] and repeats all the way down.
 template <typename IntType>
-static IntType Replicate(IntType val, u32 numBits, u32 toBit) {
-    if (numBits == 0)
+static constexpr IntType Replicate(IntType val, u32 numBits, u32 toBit) {
+    if (numBits == 0) {
         return 0;
-    if (toBit == 0)
+    }
+    if (toBit == 0) {
         return 0;
-    IntType v = val & static_cast<IntType>((1 << numBits) - 1);
+    }
+    const IntType v = val & static_cast<IntType>((1 << numBits) - 1);
     IntType res = v;
     u32 reslen = numBits;
     while (reslen < toBit) {
@@ -656,6 +658,89 @@ static IntType Replicate(IntType val, u32 numBits, u32 toBit) {
     return res;
 }
 
+static constexpr std::size_t NumReplicateEntries(u32 num_bits) {
+    return std::size_t(1) << num_bits;
+}
+
+template <typename IntType, u32 num_bits, u32 to_bit>
+static constexpr auto MakeReplicateTable() {
+    std::array<IntType, NumReplicateEntries(num_bits)> table{};
+    for (IntType value = 0; value < static_cast<IntType>(std::size(table)); ++value) {
+        table[value] = Replicate(value, num_bits, to_bit);
+    }
+    return table;
+}
+
+static constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable<u32, 8, 16>();
+static constexpr u32 ReplicateByteTo16(std::size_t value) {
+    return REPLICATE_BYTE_TO_16_TABLE[value];
+}
+
+static constexpr auto REPLICATE_BIT_TO_7_TABLE = MakeReplicateTable<u32, 1, 7>();
+static constexpr u32 ReplicateBitTo7(std::size_t value) {
+    return REPLICATE_BIT_TO_7_TABLE[value];
+}
+
+static constexpr auto REPLICATE_BIT_TO_9_TABLE = MakeReplicateTable<u32, 1, 9>();
+static constexpr u32 ReplicateBitTo9(std::size_t value) {
+    return REPLICATE_BIT_TO_9_TABLE[value];
+}
+
+static constexpr auto REPLICATE_1_BIT_TO_8_TABLE = MakeReplicateTable<u32, 1, 8>();
+static constexpr auto REPLICATE_2_BIT_TO_8_TABLE = MakeReplicateTable<u32, 2, 8>();
+static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable<u32, 3, 8>();
+static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable<u32, 4, 8>();
+static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable<u32, 5, 8>();
+static constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable<u32, 6, 8>();
+static constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable<u32, 7, 8>();
+static constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable<u32, 8, 8>();
+/// Use a precomputed table for the most common bit widths. If num_bits is outside that range,
+/// fall back to the runtime implementation. The index is masked exactly as Replicate() does.
+static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) {
+    switch (num_bits) {
+    case 1:
+        return REPLICATE_1_BIT_TO_8_TABLE[value & 0x01];
+    case 2:
+        return REPLICATE_2_BIT_TO_8_TABLE[value & 0x03];
+    case 3:
+        return REPLICATE_3_BIT_TO_8_TABLE[value & 0x07];
+    case 4:
+        return REPLICATE_4_BIT_TO_8_TABLE[value & 0x0F];
+    case 5:
+        return REPLICATE_5_BIT_TO_8_TABLE[value & 0x1F];
+    case 6:
+        return REPLICATE_6_BIT_TO_8_TABLE[value & 0x3F];
+    case 7:
+        return REPLICATE_7_BIT_TO_8_TABLE[value & 0x7F];
+    case 8:
+        return REPLICATE_8_BIT_TO_8_TABLE[value & 0xFF];
+    default:
+        return Replicate(value, num_bits, 8);
+    }
+}
+
+static constexpr auto REPLICATE_1_BIT_TO_6_TABLE = MakeReplicateTable<u32, 1, 6>();
+static constexpr auto REPLICATE_2_BIT_TO_6_TABLE = MakeReplicateTable<u32, 2, 6>();
+static constexpr auto REPLICATE_3_BIT_TO_6_TABLE = MakeReplicateTable<u32, 3, 6>();
+static constexpr auto REPLICATE_4_BIT_TO_6_TABLE = MakeReplicateTable<u32, 4, 6>();
+static constexpr auto REPLICATE_5_BIT_TO_6_TABLE = MakeReplicateTable<u32, 5, 6>();
+static constexpr u32 FastReplicateTo6(u32 value, u32 num_bits) {
+    switch (num_bits) { // Indices are masked to num_bits bits, matching what Replicate() does
+    case 1:
+        return REPLICATE_1_BIT_TO_6_TABLE[value & 0x01];
+    case 2:
+        return REPLICATE_2_BIT_TO_6_TABLE[value & 0x03];
+    case 3:
+        return REPLICATE_3_BIT_TO_6_TABLE[value & 0x07];
+    case 4:
+        return REPLICATE_4_BIT_TO_6_TABLE[value & 0x0F];
+    case 5:
+        return REPLICATE_5_BIT_TO_6_TABLE[value & 0x1F];
+    default:
+        return Replicate(value, num_bits, 6);
+    }
+}
+
 class Pixel {
 protected:
     using ChannelType = s16;
@@ -674,10 +759,10 @@ public:
     // significant bits when going from larger to smaller bit depth
     // or by repeating the most significant bits when going from
     // smaller to larger bit depths.
-    void ChangeBitDepth(const u8 (&depth)[4]) {
+    void ChangeBitDepth() {
         for (u32 i = 0; i < 4; i++) {
-            Component(i) = ChangeBitDepth(Component(i), m_BitDepth[i], depth[i]);
-            m_BitDepth[i] = depth[i];
+            Component(i) = ChangeBitDepth(Component(i), m_BitDepth[i]);
+            m_BitDepth[i] = 8;
         }
     }
 
@@ -689,28 +774,23 @@ public:
 
     // Changes the bit depth of a single component. See the comment
     // above for how we do this.
-    static ChannelType ChangeBitDepth(Pixel::ChannelType val, u8 oldDepth, u8 newDepth) {
-        assert(newDepth <= 8);
+    static ChannelType ChangeBitDepth(Pixel::ChannelType val, u8 oldDepth) {
         assert(oldDepth <= 8);
 
-        if (oldDepth == newDepth) {
+        if (oldDepth == 8) {
             // Do nothing
             return val;
-        } else if (oldDepth == 0 && newDepth != 0) {
-            return static_cast<ChannelType>((1 << newDepth) - 1);
-        } else if (newDepth > oldDepth) {
-            return Replicate(val, oldDepth, newDepth);
+        } else if (oldDepth == 0) {
+            return static_cast<ChannelType>((1 << 8) - 1);
+        } else if (8 > oldDepth) {
+            return static_cast<ChannelType>(FastReplicateTo8(static_cast<u32>(val), oldDepth));
         } else {
             // oldDepth > newDepth
-            if (newDepth == 0) {
-                return 0xFF;
-            } else {
-                u8 bitsWasted = static_cast<u8>(oldDepth - newDepth);
-                u16 v = static_cast<u16>(val);
-                v = static_cast<u16>((v + (1 << (bitsWasted - 1))) >> bitsWasted);
-                v = ::std::min<u16>(::std::max<u16>(0, v), static_cast<u16>((1 << newDepth) - 1));
-                return static_cast<u8>(v);
-            }
+            const u8 bitsWasted = static_cast<u8>(oldDepth - 8);
+            u16 v = static_cast<u16>(val);
+            v = static_cast<u16>((v + (1 << (bitsWasted - 1))) >> bitsWasted);
+            v = ::std::min<u16>(::std::max<u16>(0, v), static_cast<u16>((1 << 8) - 1));
+            return static_cast<u8>(v);
         }
 
         assert(false && "We shouldn't get here.");
@@ -760,8 +840,7 @@ public:
     // up in the most-significant byte.
     u32 Pack() const {
         Pixel eightBit(*this);
-        const u8 eightBitDepth[4] = {8, 8, 8, 8};
-        eightBit.ChangeBitDepth(eightBitDepth);
+        eightBit.ChangeBitDepth();
 
         u32 r = 0;
         r |= eightBit.A();
@@ -816,8 +895,7 @@ static void DecodeColorValues(u32* out, u8* data, const u32* modes, const u32 nP
     }
 
     // We now have enough to decode our integer sequence.
-    std::vector<IntegerEncodedValue> decodedColorValues;
-    decodedColorValues.reserve(32);
+    IntegerEncodedVector decodedColorValues;
 
     InputBitStream colorStream(data);
     DecodeIntegerSequence(decodedColorValues, colorStream, range, nValues);
@@ -839,12 +917,12 @@ static void DecodeColorValues(u32* out, u8* data, const u32* modes, const u32 nP
 
         u32 A = 0, B = 0, C = 0, D = 0;
         // A is just the lsb replicated 9 times.
-        A = Replicate(bitval & 1, 1, 9);
+        A = ReplicateBitTo9(bitval & 1);
 
         switch (val.encoding) {
         // Replicate bits
         case IntegerEncoding::JustBits:
-            out[outIdx++] = Replicate(bitval, bitlen, 8);
+            out[outIdx++] = FastReplicateTo8(bitval, bitlen);
             break;
 
         // Use algorithm in C.2.13
@@ -962,13 +1040,13 @@ static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) {
     u32 bitval = val.bit_value;
     u32 bitlen = val.num_bits;
 
-    u32 A = Replicate(bitval & 1, 1, 7);
+    u32 A = ReplicateBitTo7(bitval & 1);
     u32 B = 0, C = 0, D = 0;
 
     u32 result = 0;
     switch (val.encoding) {
     case IntegerEncoding::JustBits:
-        result = Replicate(bitval, bitlen, 6);
+        result = FastReplicateTo6(bitval, bitlen);
         break;
 
     case IntegerEncoding::Trit: {
@@ -1047,7 +1125,7 @@ static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) {
     return result;
 }
 
-static void UnquantizeTexelWeights(u32 out[2][144], const std::vector<IntegerEncodedValue>& weights,
+static void UnquantizeTexelWeights(u32 out[2][144], const IntegerEncodedVector& weights,
                                    const TexelWeightParams& params, const u32 blockWidth,
                                    const u32 blockHeight) {
     u32 weightIdx = 0;
@@ -1545,8 +1623,7 @@ static void DecompressBlock(const u8 inBuf[16], const u32 blockWidth, const u32
         static_cast<u8>((1 << (weightParams.GetPackedBitSize() % 8)) - 1);
     memset(texelWeightData + clearByteStart, 0, 16 - clearByteStart);
 
-    std::vector<IntegerEncodedValue> texelWeightValues;
-    texelWeightValues.reserve(64);
+    IntegerEncodedVector texelWeightValues;
 
     InputBitStream weightStream(texelWeightData);
 
@@ -1568,9 +1645,9 @@ static void DecompressBlock(const u8 inBuf[16], const u32 blockWidth, const u32
             Pixel p;
             for (u32 c = 0; c < 4; c++) {
                 u32 C0 = endpos32s[partition][0].Component(c);
-                C0 = Replicate(C0, 8, 16);
+                C0 = ReplicateByteTo16(C0);
                 u32 C1 = endpos32s[partition][1].Component(c);
-                C1 = Replicate(C1, 8, 16);
+                C1 = ReplicateByteTo16(C1);
 
                 u32 plane = 0;
                 if (weightParams.m_bDualPlane && (((planeIdx + 1) & 3) == c)) {
diff --git a/src/video_core/textures/texture.h b/src/video_core/textures/texture.h
index 59b8a5e66..eba05aced 100644
--- a/src/video_core/textures/texture.h
+++ b/src/video_core/textures/texture.h
@@ -131,6 +131,20 @@ enum class SwizzleSource : u32 {
     OneFloat = 7,
 };
 
+enum class MsaaMode : u32 {
+    Msaa1x1 = 0,
+    Msaa2x1 = 1,
+    Msaa2x2 = 2,
+    Msaa4x2 = 3,
+    Msaa4x2_D3D = 4,
+    Msaa2x1_D3D = 5,
+    Msaa4x4 = 6,
+    Msaa2x2_VC4 = 8,
+    Msaa2x2_VC12 = 9,
+    Msaa4x2_VC8 = 10,
+    Msaa4x2_VC24 = 11,
+};
+
 union TextureHandle {
     TextureHandle(u32 raw) : raw{raw} {}
 
@@ -197,6 +211,7 @@ struct TICEntry {
     union {
         BitField<0, 4, u32> res_min_mip_level;
         BitField<4, 4, u32> res_max_mip_level;
+        BitField<8, 4, MsaaMode> msaa_mode;
         BitField<12, 12, u32> min_lod_clamp;
     };
 
diff --git a/src/yuzu/game_list.cpp b/src/yuzu/game_list.cpp
index a2b88c787..dccbabcbf 100644
--- a/src/yuzu/game_list.cpp
+++ b/src/yuzu/game_list.cpp
@@ -315,7 +315,7 @@ GameList::GameList(FileSys::VirtualFilesystem vfs, FileSys::ManualContentProvide
         item_model->setHeaderData(COLUMN_FILE_TYPE - 1, Qt::Horizontal, tr("File type"));
         item_model->setHeaderData(COLUMN_SIZE - 1, Qt::Horizontal, tr("Size"));
     }
-    item_model->setSortRole(GameListItemPath::TitleRole);
+    item_model->setSortRole(GameListItemPath::SortRole);
 
     connect(main_window, &GMainWindow::UpdateThemedIcons, this, &GameList::onUpdateThemedIcons);
     connect(tree_view, &QTreeView::activated, this, &GameList::ValidateEntry);
@@ -441,6 +441,8 @@ void GameList::DonePopulating(QStringList watch_list) {
     if (children_total > 0) {
         search_field->setFocus();
     }
+    item_model->sort(tree_view->header()->sortIndicatorSection(),
+                     tree_view->header()->sortIndicatorOrder());
 }
 
 void GameList::PopupContextMenu(const QPoint& menu_location) {
@@ -666,8 +668,6 @@ void GameList::LoadInterfaceLayout() {
         // so make it as large as possible as default.
         header->resizeSection(COLUMN_NAME, header->width());
     }
-
-    item_model->sort(header->sortIndicatorSection(), header->sortIndicatorOrder());
 }
 
 const QStringList GameList::supported_file_extensions = {
diff --git a/src/yuzu/game_list_p.h b/src/yuzu/game_list_p.h
index 7cde72d1b..3e6d5a7cd 100644
--- a/src/yuzu/game_list_p.h
+++ b/src/yuzu/game_list_p.h
@@ -65,10 +65,10 @@ public:
  */
 class GameListItemPath : public GameListItem {
 public:
-    static const int TitleRole = SortRole;
-    static const int FullPathRole = SortRole + 1;
-    static const int ProgramIdRole = SortRole + 2;
-    static const int FileTypeRole = SortRole + 3;
+    static const int TitleRole = SortRole + 1;
+    static const int FullPathRole = SortRole + 2;
+    static const int ProgramIdRole = SortRole + 3;
+    static const int FileTypeRole = SortRole + 4;
 
     GameListItemPath() = default;
     GameListItemPath(const QString& game_path, const std::vector<u8>& picture_data,
@@ -95,7 +95,7 @@ public:
     }
 
     QVariant data(int role) const override {
-        if (role == Qt::DisplayRole) {
+        if (role == Qt::DisplayRole || role == SortRole) {
             std::string filename;
             Common::SplitPath(data(FullPathRole).toString().toStdString(), nullptr, &filename,
                               nullptr);
@@ -110,6 +110,9 @@ public:
             const auto& row1 = row_data.at(UISettings::values.row_1_text_id);
             const int row2_id = UISettings::values.row_2_text_id;
 
+            if (role == SortRole)
+                return row1.toLower();
+
             if (row2_id == 4) // None
                 return row1;
 
@@ -123,6 +126,13 @@ public:
 
         return GameListItem::data(role);
     }
+
+    /**
+     * Override to prevent automatic sorting.
+     */
+    bool operator<(const QStandardItem& other) const override {
+        return false;
+    }
 };
 
 class GameListItemCompat : public GameListItem {
@@ -289,6 +299,10 @@ public:
     int type() const override {
         return static_cast<int>(GameListItemType::AddDir);
     }
+
+    bool operator<(const QStandardItem& other) const override {
+        return false;
+    }
 };
 
 class GameList;