diff options
| -rw-r--r-- | src/core/hle/service/gsp_gpu.cpp | 25 | ||||
| -rw-r--r-- | src/core/hle/service/gsp_gpu.h | 11 | ||||
| -rw-r--r-- | src/core/hw/gpu.cpp | 69 | ||||
| -rw-r--r-- | src/core/hw/gpu.h | 32 | ||||
| -rw-r--r-- | src/video_core/shader/shader_interpreter.cpp | 36 | ||||
| -rw-r--r-- | src/video_core/shader/shader_jit_x64.cpp | 52 | ||||
| -rw-r--r-- | src/video_core/shader/shader_jit_x64.h | 5 | 
7 files changed, 192 insertions, 38 deletions
| diff --git a/src/core/hle/service/gsp_gpu.cpp b/src/core/hle/service/gsp_gpu.cpp index e93c1b436..3c41e656c 100644 --- a/src/core/hle/service/gsp_gpu.cpp +++ b/src/core/hle/service/gsp_gpu.cpp @@ -418,7 +418,7 @@ static void ExecuteCommand(const Command& command, u32 thread_id) {      case CommandId::SET_DISPLAY_TRANSFER:      { -        auto& params = command.image_copy; +        auto& params = command.display_transfer;          WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_address)),                  Memory::VirtualToPhysicalAddress(params.in_buffer_address) >> 3);          WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_address)), @@ -433,17 +433,22 @@ static void ExecuteCommand(const Command& command, u32 thread_id) {      // TODO: Check if texture copies are implemented correctly..      case CommandId::SET_TEXTURE_COPY:      { -        auto& params = command.image_copy; -        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_address)), +        auto& params = command.texture_copy; +        WriteGPURegister((u32)GPU_REG_INDEX(display_transfer_config.input_address),                  Memory::VirtualToPhysicalAddress(params.in_buffer_address) >> 3); -        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_address)), +        WriteGPURegister((u32)GPU_REG_INDEX(display_transfer_config.output_address),                  Memory::VirtualToPhysicalAddress(params.out_buffer_address) >> 3); -        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_size)), params.in_buffer_size); -        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_size)), params.out_buffer_size); -        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.flags)), params.flags); - -        // TODO: Should this register be set to 1 or should instead its value be OR-ed with 1? -        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.trigger)), 1); +        WriteGPURegister((u32)GPU_REG_INDEX(display_transfer_config.texture_copy.size), +                params.size); +        WriteGPURegister((u32)GPU_REG_INDEX(display_transfer_config.texture_copy.input_size), +                params.in_width_gap); +        WriteGPURegister((u32)GPU_REG_INDEX(display_transfer_config.texture_copy.output_size), +                params.out_width_gap); +        WriteGPURegister((u32)GPU_REG_INDEX(display_transfer_config.flags), +                params.flags); + +        // NOTE: Actual GSP ORs 1 with current register instead of overwriting. Doesn't seem to matter. +        WriteGPURegister((u32)GPU_REG_INDEX(display_transfer_config.trigger), 1);          break;      } diff --git a/src/core/hle/service/gsp_gpu.h b/src/core/hle/service/gsp_gpu.h index c89d0a467..8bcb30ad1 100644 --- a/src/core/hle/service/gsp_gpu.h +++ b/src/core/hle/service/gsp_gpu.h @@ -127,7 +127,16 @@ struct Command {              u32 in_buffer_size;              u32 out_buffer_size;              u32 flags; -        } image_copy; +        } display_transfer; + +        struct { +            u32 in_buffer_address; +            u32 out_buffer_address; +            u32 size; +            u32 in_width_gap; +            u32 out_width_gap; +            u32 flags; +        } texture_copy;          u8 raw_data[0x1C];      }; diff --git a/src/core/hw/gpu.cpp b/src/core/hw/gpu.cpp index 3ccbc03b2..68ae38289 100644 --- a/src/core/hw/gpu.cpp +++ b/src/core/hw/gpu.cpp @@ -3,6 +3,7 @@  // Refer to the license.txt file included.  #include <cstring> +#include <numeric>  #include <type_traits>  #include "common/color.h" @@ -158,14 +159,59 @@ inline void Write(u32 addr, const T data) {              u8* src_pointer = Memory::GetPhysicalPointer(config.GetPhysicalInputAddress());              u8* dst_pointer = Memory::GetPhysicalPointer(config.GetPhysicalOutputAddress()); +            if (config.is_texture_copy) { +                u32 input_width = config.texture_copy.input_width * 16; +                u32 input_gap = config.texture_copy.input_gap * 16; +                u32 output_width = config.texture_copy.output_width * 16; +                u32 output_gap = config.texture_copy.output_gap * 16; + +                size_t contiguous_input_size = config.texture_copy.size / input_width * (input_width + input_gap); +                VideoCore::g_renderer->hw_rasterizer->NotifyPreRead(config.GetPhysicalInputAddress(), contiguous_input_size); + +                u32 remaining_size = config.texture_copy.size; +                u32 remaining_input = input_width; +                u32 remaining_output = output_width; +                while (remaining_size > 0) { +                    u32 copy_size = std::min({ remaining_input, remaining_output, remaining_size }); + +                    std::memcpy(dst_pointer, src_pointer, copy_size); +                    src_pointer += copy_size; +                    dst_pointer += copy_size; + +                    remaining_input -= copy_size; +                    remaining_output -= copy_size; +                    remaining_size -= copy_size; + +                    if (remaining_input == 0) { +                        remaining_input = input_width; +                        src_pointer += input_gap; +                    } +                    if (remaining_output == 0) { +                        remaining_output = output_width; +                        dst_pointer += output_gap; +                    } +                } + +                LOG_TRACE(HW_GPU, "TextureCopy: 0x%X bytes from 0x%08X(%u+%u)-> 0x%08X(%u+%u), flags 0x%08X", +                    config.texture_copy.size, +                    config.GetPhysicalInputAddress(), input_width, input_gap, +                    config.GetPhysicalOutputAddress(), output_width, output_gap, +                    config.flags); + +                size_t contiguous_output_size = config.texture_copy.size / output_width * (output_width + output_gap); +                VideoCore::g_renderer->hw_rasterizer->NotifyFlush(config.GetPhysicalOutputAddress(), contiguous_output_size); + +                GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::PPF); +                break; +            } +              if (config.scaling > config.ScaleXY) {                  LOG_CRITICAL(HW_GPU, "Unimplemented display transfer scaling mode %u", config.scaling.Value());                  UNIMPLEMENTED();                  break;              } -            if (config.output_tiled && -                    (config.scaling == config.ScaleXY || config.scaling == config.ScaleX)) { +            if (config.input_linear && config.scaling != config.NoScale) {                  LOG_CRITICAL(HW_GPU, "Scaling is only implemented on tiled input");                  UNIMPLEMENTED();                  break; @@ -182,23 +228,6 @@ inline void Write(u32 addr, const T data) {              VideoCore::g_renderer->hw_rasterizer->NotifyPreRead(config.GetPhysicalInputAddress(), input_size); -            if (config.raw_copy) { -                // Raw copies do not perform color conversion nor tiled->linear / linear->tiled conversions -                // TODO(Subv): Verify if raw copies perform scaling -                memcpy(dst_pointer, src_pointer, output_size); - -                LOG_TRACE(HW_GPU, "DisplayTriggerTransfer: 0x%08x bytes from 0x%08x(%ux%u)-> 0x%08x(%ux%u), output format: %x, flags 0x%08X, Raw copy", -                    output_size, -                    config.GetPhysicalInputAddress(), config.input_width.Value(), config.input_height.Value(), -                    config.GetPhysicalOutputAddress(), config.output_width.Value(), config.output_height.Value(), -                    config.output_format.Value(), config.flags); - -                GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::PPF); - -                VideoCore::g_renderer->hw_rasterizer->NotifyFlush(config.GetPhysicalOutputAddress(), output_size); -                break; -            } -              for (u32 y = 0; y < output_height; ++y) {                  for (u32 x = 0; x < output_width; ++x) {                      Math::Vec4<u8> src_color; @@ -220,7 +249,7 @@ inline void Write(u32 addr, const T data) {                      u32 src_offset;                      u32 dst_offset; -                    if (config.output_tiled) { +                    if (config.input_linear) {                          if (!config.dont_swizzle) {                              // Interpret the input as linear and the output as tiled                              u32 coarse_y = y & ~7; diff --git a/src/core/hw/gpu.h b/src/core/hw/gpu.h index daad506fe..2e3a9f779 100644 --- a/src/core/hw/gpu.h +++ b/src/core/hw/gpu.h @@ -201,12 +201,14 @@ struct Regs {              u32 flags;              BitField< 0, 1, u32> flip_vertically;  // flips input data vertically -            BitField< 1, 1, u32> output_tiled;     // Converts from linear to tiled format -            BitField< 3, 1, u32> raw_copy;         // Copies the data without performing any processing +            BitField< 1, 1, u32> input_linear;     // Converts from linear to tiled format +            BitField< 2, 1, u32> crop_input_lines; +            BitField< 3, 1, u32> is_texture_copy;  // Copies the data without performing any processing and respecting texture copy fields              BitField< 5, 1, u32> dont_swizzle;              BitField< 8, 3, PixelFormat> input_format;              BitField<12, 3, PixelFormat> output_format; - +            /// Uses some kind of 32x32 block swizzling mode, instead of the usual 8x8 one. +            BitField<16, 1, u32> block_32; // TODO(yuriks): unimplemented              BitField<24, 2, ScalingMode> scaling; // Determines the scaling mode of the transfer          }; @@ -214,10 +216,30 @@ struct Regs {          // it seems that writing to this field triggers the display transfer          u32 trigger; + +        INSERT_PADDING_WORDS(0x1); + +        struct { +            u32 size; + +            union { +                u32 input_size; + +                BitField< 0, 16, u32> input_width; +                BitField<16, 16, u32> input_gap; +            }; + +            union { +                u32 output_size; + +                BitField< 0, 16, u32> output_width; +                BitField<16, 16, u32> output_gap; +            }; +        } texture_copy;      } display_transfer_config; -    ASSERT_MEMBER_SIZE(display_transfer_config, 0x1c); +    ASSERT_MEMBER_SIZE(display_transfer_config, 0x2c); -    INSERT_PADDING_WORDS(0x331); +    INSERT_PADDING_WORDS(0x32D);      struct {          // command list size (in bytes) diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp index e14de0768..646171a19 100644 --- a/src/video_core/shader/shader_interpreter.cpp +++ b/src/video_core/shader/shader_interpreter.cpp @@ -334,6 +334,42 @@ void RunInterpreter(UnitState<Debug>& state) {                  Record<DebugDataRecord::CMP_RESULT>(state.debug, iteration, state.conditional_code);                  break; +            case OpCode::Id::EX2: +            { +                Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); +                Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); + +                // EX2 only takes first component exp2 and writes it to all dest components +                float24 ex2_res = float24::FromFloat32(std::exp2(src1[0].ToFloat32())); +                for (int i = 0; i < 4; ++i) { +                    if (!swizzle.DestComponentEnabled(i)) +                        continue; + +                    dest[i] = ex2_res; +                } + +                Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); +                break; +            } + +            case OpCode::Id::LG2: +            { +                Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); +                Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); + +                // LG2 only takes the first component log2 and writes it to all dest components +                float24 lg2_res = float24::FromFloat32(std::log2(src1[0].ToFloat32())); +                for (int i = 0; i < 4; ++i) { +                    if (!swizzle.DestComponentEnabled(i)) +                        continue; + +                    dest[i] = lg2_res; +                } + +                Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); +                break; +            } +              default:                  LOG_ERROR(HW_GPU, "Unhandled arithmetic instruction: 0x%02x (%s): 0x%08x",                            (int)instr.opcode.Value().EffectiveOpCode(), instr.opcode.Value().GetInfo().name, instr.hex); diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp index 836942c6b..e4b8295b3 100644 --- a/src/video_core/shader/shader_jit_x64.cpp +++ b/src/video_core/shader/shader_jit_x64.cpp @@ -25,8 +25,8 @@ const JitFunction instr_table[64] = {      &JitCompiler::Compile_DP4,      // dp4      nullptr,                        // dph      nullptr,                        // unknown -    nullptr,                        // ex2 -    nullptr,                        // lg2 +    &JitCompiler::Compile_EX2,      // ex2 +    &JitCompiler::Compile_LG2,      // lg2      nullptr,                        // unknown      &JitCompiler::Compile_MUL,      // mul      nullptr,                        // lge @@ -280,6 +280,22 @@ void JitCompiler::Compile_UniformCondition(Instruction instr) {      CMP(sizeof(bool) * 8, MDisp(UNIFORMS, offset), Imm8(0));  } +void JitCompiler::Compile_PushCallerSavedXMM() { +#ifndef _WIN32 +    SUB(64, R(RSP), Imm8(2 * 16)); +    MOVUPS(MDisp(RSP, 16), ONE); +    MOVUPS(MDisp(RSP, 0), NEGBIT); +#endif +} + +void JitCompiler::Compile_PopCallerSavedXMM() { +#ifndef _WIN32 +    MOVUPS(NEGBIT, MDisp(RSP, 0)); +    MOVUPS(ONE, MDisp(RSP, 16)); +    ADD(64, R(RSP), Imm8(2 * 16)); +#endif +} +  void JitCompiler::Compile_ADD(Instruction instr) {      Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);      Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); @@ -331,6 +347,38 @@ void JitCompiler::Compile_DP4(Instruction instr) {      Compile_DestEnable(instr, SRC1);  } +void JitCompiler::Compile_EX2(Instruction instr) { +    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); +    MOVSS(XMM0, R(SRC1)); + +    // The following will actually break the stack alignment +    ABI_PushAllCallerSavedRegsAndAdjustStack(); +    Compile_PushCallerSavedXMM(); +    ABI_CallFunction(reinterpret_cast<const void*>(exp2f)); +    Compile_PopCallerSavedXMM(); +    ABI_PopAllCallerSavedRegsAndAdjustStack(); + +    SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0)); +    MOVAPS(SRC1, R(XMM0)); +    Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_LG2(Instruction instr) { +    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); +    MOVSS(XMM0, R(SRC1)); + +    // The following will actually break the stack alignment +    ABI_PushAllCallerSavedRegsAndAdjustStack(); +    Compile_PushCallerSavedXMM(); +    ABI_CallFunction(reinterpret_cast<const void*>(log2f)); +    Compile_PopCallerSavedXMM(); +    ABI_PopAllCallerSavedRegsAndAdjustStack(); + +    SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0)); +    MOVAPS(SRC1, R(XMM0)); +    Compile_DestEnable(instr, SRC1); +} +  void JitCompiler::Compile_MUL(Instruction instr) {      Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);      Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h index b88f2a0d2..a6ae7fbf1 100644 --- a/src/video_core/shader/shader_jit_x64.h +++ b/src/video_core/shader/shader_jit_x64.h @@ -37,6 +37,8 @@ public:      void Compile_ADD(Instruction instr);      void Compile_DP3(Instruction instr);      void Compile_DP4(Instruction instr); +    void Compile_EX2(Instruction instr); +    void Compile_LG2(Instruction instr);      void Compile_MUL(Instruction instr);      void Compile_FLR(Instruction instr);      void Compile_MAX(Instruction instr); @@ -67,6 +69,9 @@ private:      void Compile_EvaluateCondition(Instruction instr);      void Compile_UniformCondition(Instruction instr); +    void Compile_PushCallerSavedXMM(); +    void Compile_PopCallerSavedXMM(); +      /// Pointer to the variable that stores the current Pica code offset. Used to handle nested code blocks.      unsigned* offset_ptr = nullptr; | 
