7 files changed, 192 insertions, 38 deletions
diff --git a/src/core/hle/service/gsp_gpu.cpp b/src/core/hle/service/gsp_gpu.cpp
index e93c1b436..3c41e656c 100644
--- a/src/core/hle/service/gsp_gpu.cpp
+++ b/src/core/hle/service/gsp_gpu.cpp
@@ -418,7 +418,7 @@ static void ExecuteCommand(const Command& command, u32 thread_id) {
 
     case CommandId::SET_DISPLAY_TRANSFER:
     {
-        auto& params = command.image_copy;
+        auto& params = command.display_transfer;
         WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_address)),
                 Memory::VirtualToPhysicalAddress(params.in_buffer_address) >> 3);
         WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_address)),
@@ -433,17 +433,22 @@ static void ExecuteCommand(const Command& command, u32 thread_id) {
     // TODO: Check if texture copies are implemented correctly..
     case CommandId::SET_TEXTURE_COPY:
     {
-        auto& params = command.image_copy;
-        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_address)),
+        auto& params = command.texture_copy;
+        WriteGPURegister((u32)GPU_REG_INDEX(display_transfer_config.input_address),
                 Memory::VirtualToPhysicalAddress(params.in_buffer_address) >> 3);
-        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_address)),
+        WriteGPURegister((u32)GPU_REG_INDEX(display_transfer_config.output_address),
                 Memory::VirtualToPhysicalAddress(params.out_buffer_address) >> 3);
-        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_size)), params.in_buffer_size);
-        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_size)), params.out_buffer_size);
-        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.flags)), params.flags);
-
-        // TODO: Should this register be set to 1 or should instead its value be OR-ed with 1?
-        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.trigger)), 1);
+        WriteGPURegister((u32)GPU_REG_INDEX(display_transfer_config.texture_copy.size),
+                params.size);
+        WriteGPURegister((u32)GPU_REG_INDEX(display_transfer_config.texture_copy.input_size),
+                params.in_width_gap);
+        WriteGPURegister((u32)GPU_REG_INDEX(display_transfer_config.texture_copy.output_size),
+                params.out_width_gap);
+        WriteGPURegister((u32)GPU_REG_INDEX(display_transfer_config.flags),
+                params.flags);
+
+        // NOTE: Actual GSP ORs 1 with current register instead of overwriting. Doesn't seem to matter.
+        WriteGPURegister((u32)GPU_REG_INDEX(display_transfer_config.trigger), 1);
         break;
     }
 
diff --git a/src/core/hle/service/gsp_gpu.h b/src/core/hle/service/gsp_gpu.h
index c89d0a467..8bcb30ad1 100644
--- a/src/core/hle/service/gsp_gpu.h
+++ b/src/core/hle/service/gsp_gpu.h
@@ -127,7 +127,16 @@ struct Command {
             u32 in_buffer_size;
             u32 out_buffer_size;
             u32 flags;
-        } image_copy;
+        } display_transfer;
+
+        struct { // Parameters of a SET_TEXTURE_COPY command, as consumed by ExecuteCommand
+            u32 in_buffer_address;  // virtual address; translated with VirtualToPhysicalAddress before the register write
+            u32 out_buffer_address; // virtual address; translated with VirtualToPhysicalAddress before the register write
+            u32 size;               // forwarded unchanged to display_transfer_config.texture_copy.size
+            u32 in_width_gap;       // forwarded unchanged to display_transfer_config.texture_copy.input_size
+            u32 out_width_gap;      // forwarded unchanged to display_transfer_config.texture_copy.output_size
+            u32 flags;              // forwarded unchanged to display_transfer_config.flags
+        } texture_copy;
 
         u8 raw_data[0x1C];
     };
diff --git a/src/core/hw/gpu.cpp b/src/core/hw/gpu.cpp
index 3ccbc03b2..68ae38289 100644
--- a/src/core/hw/gpu.cpp
+++ b/src/core/hw/gpu.cpp
@@ -3,6 +3,7 @@
 // Refer to the license.txt file included.
 
 #include <cstring>
+#include <numeric>
 #include <type_traits>
 
 #include "common/color.h"
@@ -158,14 +159,59 @@ inline void Write(u32 addr, const T data) {
             u8* src_pointer = Memory::GetPhysicalPointer(config.GetPhysicalInputAddress());
             u8* dst_pointer = Memory::GetPhysicalPointer(config.GetPhysicalOutputAddress());
 
+            if (config.is_texture_copy) {
+                u32 input_width = config.texture_copy.input_width * 16;
+                u32 input_gap = config.texture_copy.input_gap * 16;
+                u32 output_width = config.texture_copy.output_width * 16;
+                u32 output_gap = config.texture_copy.output_gap * 16;
+
+                size_t contiguous_input_size = config.texture_copy.size / input_width * (input_width + input_gap);
+                VideoCore::g_renderer->hw_rasterizer->NotifyPreRead(config.GetPhysicalInputAddress(), contiguous_input_size);
+
+                u32 remaining_size = config.texture_copy.size;
+                u32 remaining_input = input_width;
+                u32 remaining_output = output_width;
+                while (remaining_size > 0) {
+                    u32 copy_size = std::min({ remaining_input, remaining_output, remaining_size });
+
+                    std::memcpy(dst_pointer, src_pointer, copy_size);
+                    src_pointer += copy_size;
+                    dst_pointer += copy_size;
+
+                    remaining_input -= copy_size;
+                    remaining_output -= copy_size;
+                    remaining_size -= copy_size;
+
+                    if (remaining_input == 0) {
+                        remaining_input = input_width;
+                        src_pointer += input_gap;
+                    }
+                    if (remaining_output == 0) {
+                        remaining_output = output_width;
+                        dst_pointer += output_gap;
+                    }
+                }
+
+                LOG_TRACE(HW_GPU, "TextureCopy: 0x%X bytes from 0x%08X(%u+%u)-> 0x%08X(%u+%u), flags 0x%08X",
+                    config.texture_copy.size,
+                    config.GetPhysicalInputAddress(), input_width, input_gap,
+                    config.GetPhysicalOutputAddress(), output_width, output_gap,
+                    config.flags);
+
+                size_t contiguous_output_size = config.texture_copy.size / output_width * (output_width + output_gap);
+                VideoCore::g_renderer->hw_rasterizer->NotifyFlush(config.GetPhysicalOutputAddress(), contiguous_output_size);
+
+                GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::PPF);
+                break;
+            }
+
             if (config.scaling > config.ScaleXY) {
                 LOG_CRITICAL(HW_GPU, "Unimplemented display transfer scaling mode %u", config.scaling.Value());
                 UNIMPLEMENTED();
                 break;
             }
 
-            if (config.output_tiled &&
-                    (config.scaling == config.ScaleXY || config.scaling == config.ScaleX)) {
+            if (config.input_linear && config.scaling != config.NoScale) {
                 LOG_CRITICAL(HW_GPU, "Scaling is only implemented on tiled input");
                 UNIMPLEMENTED();
                 break;
@@ -182,23 +228,6 @@ inline void Write(u32 addr, const T data) {
 
             VideoCore::g_renderer->hw_rasterizer->NotifyPreRead(config.GetPhysicalInputAddress(), input_size);
 
-            if (config.raw_copy) {
-                // Raw copies do not perform color conversion nor tiled->linear / linear->tiled conversions
-                // TODO(Subv): Verify if raw copies perform scaling
-                memcpy(dst_pointer, src_pointer, output_size);
-
-                LOG_TRACE(HW_GPU, "DisplayTriggerTransfer: 0x%08x bytes from 0x%08x(%ux%u)-> 0x%08x(%ux%u), output format: %x, flags 0x%08X, Raw copy",
-                    output_size,
-                    config.GetPhysicalInputAddress(), config.input_width.Value(), config.input_height.Value(),
-                    config.GetPhysicalOutputAddress(), config.output_width.Value(), config.output_height.Value(),
-                    config.output_format.Value(), config.flags);
-
-                GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::PPF);
-
-                VideoCore::g_renderer->hw_rasterizer->NotifyFlush(config.GetPhysicalOutputAddress(), output_size);
-                break;
-            }
-
             for (u32 y = 0; y < output_height; ++y) {
                 for (u32 x = 0; x < output_width; ++x) {
                     Math::Vec4<u8> src_color;
@@ -220,7 +249,7 @@ inline void Write(u32 addr, const T data) {
                     u32 src_offset;
                     u32 dst_offset;
 
-                    if (config.output_tiled) {
+                    if (config.input_linear) {
                         if (!config.dont_swizzle) {
                             // Interpret the input as linear and the output as tiled
                             u32 coarse_y = y & ~7;
diff --git a/src/core/hw/gpu.h b/src/core/hw/gpu.h
index daad506fe..2e3a9f779 100644
--- a/src/core/hw/gpu.h
+++ b/src/core/hw/gpu.h
@@ -201,12 +201,14 @@ struct Regs {
             u32 flags;
 
             BitField< 0, 1, u32> flip_vertically;  // flips input data vertically
-            BitField< 1, 1, u32> output_tiled;     // Converts from linear to tiled format
-            BitField< 3, 1, u32> raw_copy;         // Copies the data without performing any processing
+            BitField< 1, 1, u32> input_linear;     // Converts from linear to tiled format
+            BitField< 2, 1, u32> crop_input_lines;
+            BitField< 3, 1, u32> is_texture_copy;  // Copies the data without performing any processing and respecting texture copy fields
             BitField< 5, 1, u32> dont_swizzle;
             BitField< 8, 3, PixelFormat> input_format;
             BitField<12, 3, PixelFormat> output_format;
-
+            /// Uses some kind of 32x32 block swizzling mode, instead of the usual 8x8 one.
+            BitField<16, 1, u32> block_32; // TODO(yuriks): unimplemented
             BitField<24, 2, ScalingMode> scaling; // Determines the scaling mode of the transfer
         };
 
@@ -214,10 +216,30 @@ struct Regs {
 
         // it seems that writing to this field triggers the display transfer
         u32 trigger;
+
+        INSERT_PADDING_WORDS(0x1);
+
+        struct { // Fields read by the is_texture_copy branch of the transfer trigger handler
+            u32 size; // total number of bytes to copy
+
+            union { // two views of the same 32-bit word
+                u32 input_size; // raw word, written as a whole by the GSP SET_TEXTURE_COPY handler
+
+                BitField< 0, 16, u32> input_width; // bits 0-15; multiplied by 16 to get the bytes read before each gap
+                BitField<16, 16, u32> input_gap;   // bits 16-31; multiplied by 16 to get the bytes skipped in the source
+            };
+
+            union { // two views of the same 32-bit word
+                u32 output_size; // raw word, written as a whole by the GSP SET_TEXTURE_COPY handler
+
+                BitField< 0, 16, u32> output_width; // bits 0-15; multiplied by 16 to get the bytes written before each gap
+                BitField<16, 16, u32> output_gap;   // bits 16-31; multiplied by 16 to get the bytes skipped in the destination
+            };
+        } texture_copy;
     } display_transfer_config;
-    ASSERT_MEMBER_SIZE(display_transfer_config, 0x1c);
+    ASSERT_MEMBER_SIZE(display_transfer_config, 0x2c);
 
-    INSERT_PADDING_WORDS(0x331);
+    INSERT_PADDING_WORDS(0x32D);
 
     struct {
         // command list size (in bytes)
diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp
index e14de0768..646171a19 100644
--- a/src/video_core/shader/shader_interpreter.cpp
+++ b/src/video_core/shader/shader_interpreter.cpp
@@ -334,6 +334,42 @@ void RunInterpreter(UnitState<Debug>& state) {
                 Record<DebugDataRecord::CMP_RESULT>(state.debug, iteration, state.conditional_code);
                 break;
 
+            case OpCode::Id::EX2:
+            {
+                Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+                Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
+
+                // EX2 computes exp2 of the first source component only and writes it to every enabled dest component
+                float24 ex2_res = float24::FromFloat32(std::exp2(src1[0].ToFloat32()));
+                for (int i = 0; i < 4; ++i) {
+                    if (!swizzle.DestComponentEnabled(i))
+                        continue;
+
+                    dest[i] = ex2_res;
+                }
+
+                Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
+                break;
+            }
+
+            case OpCode::Id::LG2:
+            {
+                Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+                Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
+
+                // LG2 computes log2 of the first source component only and writes it to every enabled dest component
+                float24 lg2_res = float24::FromFloat32(std::log2(src1[0].ToFloat32()));
+                for (int i = 0; i < 4; ++i) {
+                    if (!swizzle.DestComponentEnabled(i))
+                        continue;
+
+                    dest[i] = lg2_res;
+                }
+
+                Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
+                break;
+            }
+
             default:
                 LOG_ERROR(HW_GPU, "Unhandled arithmetic instruction: 0x%02x (%s): 0x%08x",
                           (int)instr.opcode.Value().EffectiveOpCode(), instr.opcode.Value().GetInfo().name, instr.hex);
diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
index 836942c6b..e4b8295b3 100644
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -25,8 +25,8 @@ const JitFunction instr_table[64] = {
     &JitCompiler::Compile_DP4,      // dp4
     nullptr,                        // dph
     nullptr,                        // unknown
-    nullptr,                        // ex2
-    nullptr,                        // lg2
+    &JitCompiler::Compile_EX2,      // ex2
+    &JitCompiler::Compile_LG2,      // lg2
     nullptr,                        // unknown
     &JitCompiler::Compile_MUL,      // mul
     nullptr,                        // lge
@@ -280,6 +280,22 @@ void JitCompiler::Compile_UniformCondition(Instruction instr) {
     CMP(sizeof(bool) * 8, MDisp(UNIFORMS, offset), Imm8(0));
 }
 
+void JitCompiler::Compile_PushCallerSavedXMM() { // Emits code that spills the ONE and NEGBIT registers to the stack
+#ifndef _WIN32 // nothing is emitted on Windows; NOTE(review): presumably these registers are callee-saved there - confirm
+    SUB(64, R(RSP), Imm8(2 * 16)); // reserve two 16-byte slots below the current stack pointer
+    MOVUPS(MDisp(RSP, 16), ONE);   // slot at RSP+16 <- ONE
+    MOVUPS(MDisp(RSP, 0), NEGBIT); // slot at RSP+0  <- NEGBIT
+#endif
+}
+
+void JitCompiler::Compile_PopCallerSavedXMM() { // Emits code that reloads NEGBIT and ONE from the slots used by Compile_PushCallerSavedXMM
+#ifndef _WIN32 // nothing is emitted on Windows, matching Compile_PushCallerSavedXMM
+    MOVUPS(NEGBIT, MDisp(RSP, 0)); // NEGBIT <- slot at RSP+0
+    MOVUPS(ONE, MDisp(RSP, 16));   // ONE    <- slot at RSP+16
+    ADD(64, R(RSP), Imm8(2 * 16)); // release the two 16-byte slots
+#endif
+}
+
 void JitCompiler::Compile_ADD(Instruction instr) {
     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
@@ -331,6 +347,38 @@ void JitCompiler::Compile_DP4(Instruction instr) {
     Compile_DestEnable(instr, SRC1);
 }
 
+void JitCompiler::Compile_EX2(Instruction instr) { // Emits code for the EX2 instruction by calling out to exp2f
+    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+    MOVSS(XMM0, R(SRC1)); // copy the low lane of SRC1 into XMM0 ahead of the call
+
+    // The following will actually break the stack alignment
+    ABI_PushAllCallerSavedRegsAndAdjustStack();
+    Compile_PushCallerSavedXMM();
+    ABI_CallFunction(reinterpret_cast<const void*>(exp2f));
+    Compile_PopCallerSavedXMM(); // undo the saves in reverse order
+    ABI_PopAllCallerSavedRegsAndAdjustStack();
+
+    SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0)); // broadcast lane 0 of XMM0 (read back after the call) to all four lanes
+    MOVAPS(SRC1, R(XMM0));
+    Compile_DestEnable(instr, SRC1); // NOTE(review): presumably stores SRC1 to the enabled dest components - confirm
+}
+
+void JitCompiler::Compile_LG2(Instruction instr) { // Emits code for the LG2 instruction by calling out to log2f
+    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+    MOVSS(XMM0, R(SRC1)); // copy the low lane of SRC1 into XMM0 ahead of the call
+
+    // The following will actually break the stack alignment
+    ABI_PushAllCallerSavedRegsAndAdjustStack();
+    Compile_PushCallerSavedXMM();
+    ABI_CallFunction(reinterpret_cast<const void*>(log2f));
+    Compile_PopCallerSavedXMM(); // undo the saves in reverse order
+    ABI_PopAllCallerSavedRegsAndAdjustStack();
+
+    SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0)); // broadcast lane 0 of XMM0 (read back after the call) to all four lanes
+    MOVAPS(SRC1, R(XMM0));
+    Compile_DestEnable(instr, SRC1); // NOTE(review): presumably stores SRC1 to the enabled dest components - confirm
+}
+
 void JitCompiler::Compile_MUL(Instruction instr) {
     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h
index b88f2a0d2..a6ae7fbf1 100644
--- a/src/video_core/shader/shader_jit_x64.h
+++ b/src/video_core/shader/shader_jit_x64.h
@@ -37,6 +37,8 @@ public:
     void Compile_ADD(Instruction instr);
     void Compile_DP3(Instruction instr);
     void Compile_DP4(Instruction instr);
+    void Compile_EX2(Instruction instr);
+    void Compile_LG2(Instruction instr);
     void Compile_MUL(Instruction instr);
     void Compile_FLR(Instruction instr);
     void Compile_MAX(Instruction instr);
@@ -67,6 +69,9 @@ private:
     void Compile_EvaluateCondition(Instruction instr);
     void Compile_UniformCondition(Instruction instr);
 
+    void Compile_PushCallerSavedXMM();
+    void Compile_PopCallerSavedXMM();
+
     /// Pointer to the variable that stores the current Pica code offset. Used to handle nested code blocks.
     unsigned* offset_ptr = nullptr;