| author    | bunnei <bunneidev@gmail.com>             | 2015-08-15 18:26:12 -0400 |
|-----------|------------------------------------------|---------------------------|
| committer | bunnei <bunneidev@gmail.com>             | 2015-08-15 18:26:12 -0400 |
| commit    | d852c4ecc71f800b552978362947a5eb5a524694 |                           |
| tree      | 9ea9a86da4027126914e69b12c24d2849fdb2c2d | /src/video_core/shader    |
| parent    | cebf245504b75469f19d3cc0a6f2f66aefa66947 |                           |
| parent    | db97090cad236eeeb0909eb1d35cbece15e1f0a5 |                           |
Merge pull request #1002 from bunnei/shader-jit
Vertex Shader JIT for X86-64
Diffstat (limited to 'src/video_core/shader')
| -rw-r--r-- | src/video_core/shader/shader.cpp             | 145 |
| -rw-r--r-- | src/video_core/shader/shader.h               | 169 |
| -rw-r--r-- | src/video_core/shader/shader_interpreter.cpp | 508 |
| -rw-r--r-- | src/video_core/shader/shader_interpreter.h   |  19 |
| -rw-r--r-- | src/video_core/shader/shader_jit_x64.cpp     | 675 |
| -rw-r--r-- | src/video_core/shader/shader_jit_x64.h       |  79 |
6 files changed, 1595 insertions, 0 deletions
```diff
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp
new file mode 100644
index 000000000..6a27a8015
--- /dev/null
+++ b/src/video_core/shader/shader.cpp
@@ -0,0 +1,145 @@
+// Copyright 2015 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <memory>
+#include <unordered_map>
+
+#include "common/hash.h"
+#include "common/make_unique.h"
+#include "common/profiler.h"
+
+#include "video_core/debug_utils/debug_utils.h"
+#include "video_core/pica.h"
+#include "video_core/video_core.h"
+
+#include "shader.h"
+#include "shader_interpreter.h"
+
+#ifdef ARCHITECTURE_x86_64
+#include "shader_jit_x64.h"
+#endif // ARCHITECTURE_x86_64
+
+namespace Pica {
+
+namespace Shader {
+
+#ifdef ARCHITECTURE_x86_64
+static std::unordered_map<u64, CompiledShader*> shader_map;
+static JitCompiler jit;
+static CompiledShader* jit_shader;
+#endif // ARCHITECTURE_x86_64
+
+void Setup(UnitState& state) {
+#ifdef ARCHITECTURE_x86_64
+    if (VideoCore::g_shader_jit_enabled) {
+        u64 cache_key = (Common::ComputeHash64(&g_state.vs.program_code, sizeof(g_state.vs.program_code)) ^
+                         Common::ComputeHash64(&g_state.vs.swizzle_data, sizeof(g_state.vs.swizzle_data)) ^
+                         g_state.regs.vs.main_offset);
+
+        auto iter = shader_map.find(cache_key);
+        if (iter != shader_map.end()) {
+            jit_shader = iter->second;
+        } else {
+            jit_shader = jit.Compile();
+            shader_map.emplace(cache_key, jit_shader);
+        }
+    }
+#endif // ARCHITECTURE_x86_64
+}
+
+void Shutdown() {
+#ifdef ARCHITECTURE_x86_64
+    shader_map.clear();
+#endif // ARCHITECTURE_x86_64
+}
+
+static Common::Profiling::TimingCategory shader_category("Vertex Shader");
+
+OutputVertex Run(UnitState& state, const InputVertex& input, int num_attributes) {
+    auto& config = g_state.regs.vs;
+    auto& setup = g_state.vs;
+
+    Common::Profiling::ScopeTimer timer(shader_category);
+
+    state.program_counter = config.main_offset;
+    state.debug.max_offset = 0;
+    state.debug.max_opdesc_id = 0;
+
+    // Setup input register table
+    const auto& attribute_register_map = config.input_register_map;
+
+    if (num_attributes > 0) state.registers.input[attribute_register_map.attribute0_register] = input.attr[0];
+    if (num_attributes > 1) state.registers.input[attribute_register_map.attribute1_register] = input.attr[1];
+    if (num_attributes > 2) state.registers.input[attribute_register_map.attribute2_register] = input.attr[2];
+    if (num_attributes > 3) state.registers.input[attribute_register_map.attribute3_register] = input.attr[3];
+    if (num_attributes > 4) state.registers.input[attribute_register_map.attribute4_register] = input.attr[4];
+    if (num_attributes > 5) state.registers.input[attribute_register_map.attribute5_register] = input.attr[5];
+    if (num_attributes > 6) state.registers.input[attribute_register_map.attribute6_register] = input.attr[6];
+    if (num_attributes > 7) state.registers.input[attribute_register_map.attribute7_register] = input.attr[7];
+    if (num_attributes > 8) state.registers.input[attribute_register_map.attribute8_register] = input.attr[8];
+    if (num_attributes > 9) state.registers.input[attribute_register_map.attribute9_register] = input.attr[9];
+    if (num_attributes > 10) state.registers.input[attribute_register_map.attribute10_register] = input.attr[10];
+    if (num_attributes > 11) state.registers.input[attribute_register_map.attribute11_register] = input.attr[11];
+    if (num_attributes > 12) state.registers.input[attribute_register_map.attribute12_register] = input.attr[12];
+    if (num_attributes > 13) state.registers.input[attribute_register_map.attribute13_register] = input.attr[13];
+    if (num_attributes > 14) state.registers.input[attribute_register_map.attribute14_register] = input.attr[14];
+    if (num_attributes > 15) state.registers.input[attribute_register_map.attribute15_register] = input.attr[15];
+
+    state.conditional_code[0] = false;
+    state.conditional_code[1] = false;
+
+#ifdef ARCHITECTURE_x86_64
+    if (VideoCore::g_shader_jit_enabled)
+        jit_shader(&state.registers);
+    else
+        RunInterpreter(state);
+#else
+    RunInterpreter(state);
+#endif // ARCHITECTURE_x86_64
+
+#if PICA_DUMP_SHADERS
+    DebugUtils::DumpShader(setup.program_code.data(), state.debug.max_offset, setup.swizzle_data.data(),
+                           state.debug.max_opdesc_id, config.main_offset,
+                           g_state.regs.vs_output_attributes); // TODO: Don't hardcode VS here
+#endif
+
+    // Setup output data
+    OutputVertex ret;
+    // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to
+    // figure out what those circumstances are and enable the remaining outputs then.
+    for (int i = 0; i < 7; ++i) {
+        const auto& output_register_map = g_state.regs.vs_output_attributes[i]; // TODO: Don't hardcode VS here
+
+        u32 semantics[4] = {
+            output_register_map.map_x, output_register_map.map_y,
+            output_register_map.map_z, output_register_map.map_w
+        };
+
+        for (int comp = 0; comp < 4; ++comp) {
+            float24* out = ((float24*)&ret) + semantics[comp];
+            if (semantics[comp] != Regs::VSOutputAttributes::INVALID) {
+                *out = state.registers.output[i][comp];
+            } else {
+                // Zero output so that attributes which aren't output won't have denormals in them,
+                // which would slow us down later.
+                memset(out, 0, sizeof(*out));
+            }
+        }
+    }
+
+    // The hardware takes the absolute value and saturates vertex colors like this, *before* doing interpolation
+    for (int i = 0; i < 4; ++i) {
+        ret.color[i] = float24::FromFloat32(
+            std::fmin(std::fabs(ret.color[i].ToFloat32()), 1.0f));
+    }
+
+    LOG_TRACE(Render_Software, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)",
+              ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), ret.pos.w.ToFloat32(),
+              ret.color.x.ToFloat32(), ret.color.y.ToFloat32(), ret.color.z.ToFloat32(), ret.color.w.ToFloat32(),
+              ret.tc0.u().ToFloat32(), ret.tc0.v().ToFloat32());
+
+    return ret;
+}
+
+} // namespace Shader
+
+} // namespace Pica
```
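For readers skimming the diff: `Setup()` above is the whole caching story. Compiled shaders are memoized under a 64-bit key so that repeated draws with the same program reuse the same generated code, and the entry point is folded in so two shaders sharing code but starting at different `main_offset` values get distinct entries. Below is a minimal, self-contained sketch of that lookup pattern; `FakeProgram`, `Hash64`, and `DummyShader` are hypothetical stand-ins for `g_state.vs`, `Common::ComputeHash64`, and `jit.Compile()`, not Citra APIs.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <unordered_map>

// A "compiled shader" is just an opaque function pointer here.
using CompiledFn = void (*)(void* registers);

struct FakeProgram {
    uint32_t code[8];
    uint32_t swizzle[4];
    uint32_t entry_point;
};

// Hypothetical stand-in for Common::ComputeHash64 (FNV-1a over raw bytes).
static uint64_t Hash64(const void* data, size_t len) {
    const unsigned char* p = static_cast<const unsigned char*>(data);
    uint64_t h = 0xcbf29ce484222325ull;
    for (size_t i = 0; i < len; ++i) {
        h ^= p[i];
        h *= 0x100000001b3ull;
    }
    return h;
}

static void DummyShader(void*) {}

int main() {
    std::unordered_map<uint64_t, CompiledFn> shader_map;
    FakeProgram prog = {};
    prog.entry_point = 0x10;

    // Same recipe as Setup(): hash the program code and swizzle data
    // separately, then XOR in the entry point.
    uint64_t cache_key = Hash64(prog.code, sizeof(prog.code)) ^
                         Hash64(prog.swizzle, sizeof(prog.swizzle)) ^
                         prog.entry_point;

    auto iter = shader_map.find(cache_key);
    CompiledFn shader;
    if (iter != shader_map.end()) {
        shader = iter->second;  // cache hit: reuse the previous compilation
    } else {
        shader = DummyShader;   // cache miss: this is where jit.Compile() runs
        shader_map.emplace(cache_key, shader);
    }
    std::printf("cache key: %016llx\n", static_cast<unsigned long long>(cache_key));
}
```

A hash collision here would silently reuse the wrong shader, so the scheme trades a tiny correctness risk for never re-hashing on the hot path; the XOR combination is cheap but the three inputs keep the key sensitive to code, swizzle data, and entry point alike.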
```diff
diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h
new file mode 100644
index 000000000..2007a2844
--- /dev/null
+++ b/src/video_core/shader/shader.h
@@ -0,0 +1,169 @@
+// Copyright 2015 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <boost/container/static_vector.hpp>
+#include <nihstro/shader_binary.h>
+
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+#include "common/vector_math.h"
+
+#include "video_core/pica.h"
+
+using nihstro::RegisterType;
+using nihstro::SourceRegister;
+using nihstro::DestRegister;
+
+namespace Pica {
+
+namespace Shader {
+
+struct InputVertex {
+    Math::Vec4<float24> attr[16];
+};
+
+struct OutputVertex {
+    OutputVertex() = default;
+
+    // VS output attributes
+    Math::Vec4<float24> pos;
+    Math::Vec4<float24> dummy; // quaternions (not implemented, yet)
+    Math::Vec4<float24> color;
+    Math::Vec2<float24> tc0;
+    Math::Vec2<float24> tc1;
+    float24 pad[6];
+    Math::Vec2<float24> tc2;
+
+    // Padding for optimal alignment
+    float24 pad2[4];
+
+    // Attributes used to store intermediate results
+
+    // position after perspective divide
+    Math::Vec3<float24> screenpos;
+    float24 pad3;
+
+    // Linear interpolation
+    // factor: 0=this, 1=vtx
+    void Lerp(float24 factor, const OutputVertex& vtx) {
+        pos = pos * factor + vtx.pos * (float24::FromFloat32(1) - factor);
+
+        // TODO: Should perform perspective correct interpolation here...
+        tc0 = tc0 * factor + vtx.tc0 * (float24::FromFloat32(1) - factor);
+        tc1 = tc1 * factor + vtx.tc1 * (float24::FromFloat32(1) - factor);
+        tc2 = tc2 * factor + vtx.tc2 * (float24::FromFloat32(1) - factor);
+
+        screenpos = screenpos * factor + vtx.screenpos * (float24::FromFloat32(1) - factor);
+
+        color = color * factor + vtx.color * (float24::FromFloat32(1) - factor);
+    }
+
+    // Linear interpolation
+    // factor: 0=v0, 1=v1
+    static OutputVertex Lerp(float24 factor, const OutputVertex& v0, const OutputVertex& v1) {
+        OutputVertex ret = v0;
+        ret.Lerp(factor, v1);
+        return ret;
+    }
+};
+static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD");
+static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size");
+
+/**
+ * This structure contains the state information that needs to be unique for a shader unit. The 3DS
+ * has four shader units that process shaders in parallel. At present, Citra only implements a
+ * single shader unit that processes all shaders serially. Putting the state information in a struct
+ * here will make it easier for us to parallelize the shader processing later.
+ */
+struct UnitState {
+    struct Registers {
+        // The registers are accessed by the shader JIT using SSE instructions, and are therefore
+        // required to be 16-byte aligned.
+        Math::Vec4<float24> MEMORY_ALIGNED16(input[16]);
+        Math::Vec4<float24> MEMORY_ALIGNED16(output[16]);
+        Math::Vec4<float24> MEMORY_ALIGNED16(temporary[16]);
+    } registers;
+    static_assert(std::is_pod<Registers>::value, "Structure is not POD");
+
+    u32 program_counter;
+    bool conditional_code[2];
+
+    // Two address registers and one loop counter
+    // TODO: How many bits do these actually have?
+    s32 address_registers[3];
+
+    enum {
+        INVALID_ADDRESS = 0xFFFFFFFF
+    };
+
+    struct CallStackElement {
+        u32 final_address;  // Address upon which we jump to return_address
+        u32 return_address; // Where to jump when leaving scope
+        u8 repeat_counter;  // How often to repeat until this call stack element is removed
+        u8 loop_increment;  // Which value to add to the loop counter after an iteration
+                            // TODO: Should this be a signed value? Does it even matter?
+        u32 loop_address;   // The address where we'll return to after each loop iteration
+    };
+
+    // TODO: Is there a maximal size for this?
+    boost::container::static_vector<CallStackElement, 16> call_stack;
+
+    struct {
+        u32 max_offset;    // maximum program counter ever reached
+        u32 max_opdesc_id; // maximum swizzle pattern index ever used
+    } debug;
+
+    static int InputOffset(const SourceRegister& reg) {
+        switch (reg.GetRegisterType()) {
+        case RegisterType::Input:
+            return (int)offsetof(UnitState::Registers, input) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
+
+        case RegisterType::Temporary:
+            return (int)offsetof(UnitState::Registers, temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
+
+        default:
+            UNREACHABLE();
+            return 0;
+        }
+    }
+
+    static int OutputOffset(const DestRegister& reg) {
+        switch (reg.GetRegisterType()) {
+        case RegisterType::Output:
+            return (int)offsetof(UnitState::Registers, output) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
+
+        case RegisterType::Temporary:
+            return (int)offsetof(UnitState::Registers, temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
+
+        default:
+            UNREACHABLE();
+            return 0;
+        }
+    }
+};
+
+/**
+ * Performs any shader unit setup that only needs to happen once per shader (as opposed to once per
+ * vertex, which would happen within the `Run` function).
+ * @param state Shader unit state, must be setup per shader and per shader unit
+ */
+void Setup(UnitState& state);
+
+/// Performs any cleanup when the emulator is shutdown
+void Shutdown();
+
+/**
+ * Runs the currently setup shader
+ * @param state Shader unit state, must be setup per shader and per shader unit
+ * @param input Input vertex into the shader
+ * @param num_attributes The number of vertex shader attributes
+ * @return The output vertex, after having been processed by the vertex shader
+ */
+OutputVertex Run(UnitState& state, const InputVertex& input, int num_attributes);
+
+} // namespace Shader
+
+} // namespace Pica
```
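The `InputOffset`/`OutputOffset` helpers exist so the JIT can fold a Pica register reference into a single byte displacement off the `Registers` base pointer (the `MDisp(REGISTERS, ...)` addressing seen later in shader_jit_x64.cpp). Here is a sketch of that idea against a simplified stand-in struct; the real code uses `Math::Vec4<float24>` and `MEMORY_ALIGNED16`, but the offset arithmetic is the same.

```cpp
#include <cstddef>
#include <cstdio>

struct Vec4 { float x, y, z, w; };

// Simplified mirror of UnitState::Registers: three banks of 16 four-component
// vectors, each bank 16-byte aligned so MOVAPS can load from it directly.
struct Registers {
    alignas(16) Vec4 input[16];
    alignas(16) Vec4 output[16];
    alignas(16) Vec4 temporary[16];
};

// Same idea as UnitState::InputOffset(): turn a register index into a byte
// displacement that emitted code can bake into a memory operand.
static int InputOffset(int index) {
    return static_cast<int>(offsetof(Registers, input) + index * sizeof(Vec4));
}

static int TemporaryOffset(int index) {
    return static_cast<int>(offsetof(Registers, temporary) + index * sizeof(Vec4));
}

int main() {
    // v5 lives 5 * 16 = 80 bytes into the input bank; r2 is 32 bytes into the
    // temporary bank, which itself starts after the input and output banks.
    std::printf("input[5]     -> +%d bytes\n", InputOffset(5));
    std::printf("temporary[2] -> +%d bytes\n", TemporaryOffset(2));
}
```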
```diff
diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp
new file mode 100644
index 000000000..c8489f920
--- /dev/null
+++ b/src/video_core/shader/shader_interpreter.cpp
@@ -0,0 +1,508 @@
+// Copyright 2014 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <common/file_util.h>
+
+#include <nihstro/shader_bytecode.h>
+
+#include "video_core/pica.h"
+
+#include "shader.h"
+#include "shader_interpreter.h"
+
+using nihstro::OpCode;
+using nihstro::Instruction;
+using nihstro::RegisterType;
+using nihstro::SourceRegister;
+using nihstro::SwizzlePattern;
+
+namespace Pica {
+
+namespace Shader {
+
+void RunInterpreter(UnitState& state) {
+    const auto& uniforms = g_state.vs.uniforms;
+    const auto& swizzle_data = g_state.vs.swizzle_data;
+    const auto& program_code = g_state.vs.program_code;
+
+    // Placeholder for invalid inputs
+    static float24 dummy_vec4_float24[4];
+
+    while (true) {
+        if (!state.call_stack.empty()) {
+            auto& top = state.call_stack.back();
+            if (state.program_counter == top.final_address) {
+                state.address_registers[2] += top.loop_increment;
+
+                if (top.repeat_counter-- == 0) {
+                    state.program_counter = top.return_address;
+                    state.call_stack.pop_back();
+                } else {
+                    state.program_counter = top.loop_address;
+                }
+
+                // TODO: Is "trying again" accurate to hardware?
+                continue;
+            }
+        }
+
+        bool exit_loop = false;
+        const Instruction instr = { program_code[state.program_counter] };
+        const SwizzlePattern swizzle = { swizzle_data[instr.common.operand_desc_id] };
+
+        static auto call = [](UnitState& state, u32 offset, u32 num_instructions,
+                              u32 return_offset, u8 repeat_count, u8 loop_increment) {
+            state.program_counter = offset - 1; // -1 to make sure when incrementing the PC we end up at the correct offset
+            ASSERT(state.call_stack.size() < state.call_stack.capacity());
+            state.call_stack.push_back({ offset + num_instructions, return_offset, repeat_count, loop_increment, offset });
+        };
+        state.debug.max_offset = std::max<u32>(state.debug.max_offset, 1 + state.program_counter);
+
+        auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const float24* {
+            switch (source_reg.GetRegisterType()) {
+            case RegisterType::Input:
+                return &state.registers.input[source_reg.GetIndex()].x;
+
+            case RegisterType::Temporary:
+                return &state.registers.temporary[source_reg.GetIndex()].x;
+
+            case RegisterType::FloatUniform:
+                return &uniforms.f[source_reg.GetIndex()].x;
+
+            default:
+                return dummy_vec4_float24;
+            }
+        };
+
+        switch (instr.opcode.Value().GetInfo().type) {
+        case OpCode::Type::Arithmetic:
+        {
+            const bool is_inverted = (0 != (instr.opcode.Value().GetInfo().subtype & OpCode::Info::SrcInversed));
+
+            const int address_offset = (instr.common.address_register_index == 0)
+                                       ? 0 : state.address_registers[instr.common.address_register_index - 1];
+
+            const float24* src1_ = LookupSourceRegister(instr.common.GetSrc1(is_inverted) + (!is_inverted * address_offset));
+            const float24* src2_ = LookupSourceRegister(instr.common.GetSrc2(is_inverted) + ( is_inverted * address_offset));
+
+            const bool negate_src1 = ((bool)swizzle.negate_src1 != false);
+            const bool negate_src2 = ((bool)swizzle.negate_src2 != false);
+
+            float24 src1[4] = {
+                src1_[(int)swizzle.GetSelectorSrc1(0)],
+                src1_[(int)swizzle.GetSelectorSrc1(1)],
+                src1_[(int)swizzle.GetSelectorSrc1(2)],
+                src1_[(int)swizzle.GetSelectorSrc1(3)],
+            };
+            if (negate_src1) {
+                src1[0] = src1[0] * float24::FromFloat32(-1);
+                src1[1] = src1[1] * float24::FromFloat32(-1);
+                src1[2] = src1[2] * float24::FromFloat32(-1);
+                src1[3] = src1[3] * float24::FromFloat32(-1);
+            }
+            float24 src2[4] = {
+                src2_[(int)swizzle.GetSelectorSrc2(0)],
+                src2_[(int)swizzle.GetSelectorSrc2(1)],
+                src2_[(int)swizzle.GetSelectorSrc2(2)],
+                src2_[(int)swizzle.GetSelectorSrc2(3)],
+            };
+            if (negate_src2) {
+                src2[0] = src2[0] * float24::FromFloat32(-1);
+                src2[1] = src2[1] * float24::FromFloat32(-1);
+                src2[2] = src2[2] * float24::FromFloat32(-1);
+                src2[3] = src2[3] * float24::FromFloat32(-1);
+            }
+
+            float24* dest = (instr.common.dest.Value() < 0x10) ? &state.registers.output[instr.common.dest.Value().GetIndex()][0]
+                          : (instr.common.dest.Value() < 0x20) ? &state.registers.temporary[instr.common.dest.Value().GetIndex()][0]
+                          : dummy_vec4_float24;
+
+            state.debug.max_opdesc_id = std::max<u32>(state.debug.max_opdesc_id, 1 + instr.common.operand_desc_id);
+
+            switch (instr.opcode.Value().EffectiveOpCode()) {
+            case OpCode::Id::ADD:
+            {
+                for (int i = 0; i < 4; ++i) {
+                    if (!swizzle.DestComponentEnabled(i))
+                        continue;
+
+                    dest[i] = src1[i] + src2[i];
+                }
+
+                break;
+            }
+
+            case OpCode::Id::MUL:
+            {
+                for (int i = 0; i < 4; ++i) {
+                    if (!swizzle.DestComponentEnabled(i))
+                        continue;
+
+                    dest[i] = src1[i] * src2[i];
+                }
+
+                break;
+            }
+
+            case OpCode::Id::FLR:
+                for (int i = 0; i < 4; ++i) {
+                    if (!swizzle.DestComponentEnabled(i))
+                        continue;
+
+                    dest[i] = float24::FromFloat32(std::floor(src1[i].ToFloat32()));
+                }
+                break;
+
+            case OpCode::Id::MAX:
+                for (int i = 0; i < 4; ++i) {
+                    if (!swizzle.DestComponentEnabled(i))
+                        continue;
+
+                    dest[i] = std::max(src1[i], src2[i]);
+                }
+                break;
+
+            case OpCode::Id::MIN:
+                for (int i = 0; i < 4; ++i) {
+                    if (!swizzle.DestComponentEnabled(i))
+                        continue;
+
+                    dest[i] = std::min(src1[i], src2[i]);
+                }
+                break;
+
+            case OpCode::Id::DP3:
+            case OpCode::Id::DP4:
+            {
+                float24 dot = float24::FromFloat32(0.f);
+                int num_components = (instr.opcode.Value() == OpCode::Id::DP3) ? 3 : 4;
+                for (int i = 0; i < num_components; ++i)
+                    dot = dot + src1[i] * src2[i];
+
+                for (int i = 0; i < 4; ++i) {
+                    if (!swizzle.DestComponentEnabled(i))
+                        continue;
+
+                    dest[i] = dot;
+                }
+                break;
+            }
+
+            // Reciprocal
+            case OpCode::Id::RCP:
+            {
+                for (int i = 0; i < 4; ++i) {
+                    if (!swizzle.DestComponentEnabled(i))
+                        continue;
+
+                    // TODO: Be stable against division by zero!
+                    // TODO: I think this might be wrong... we should only use one component here
+                    dest[i] = float24::FromFloat32(1.0f / src1[i].ToFloat32());
+                }
+
+                break;
+            }
+
+            // Reciprocal Square Root
+            case OpCode::Id::RSQ:
+            {
+                for (int i = 0; i < 4; ++i) {
+                    if (!swizzle.DestComponentEnabled(i))
+                        continue;
+
+                    // TODO: Be stable against division by zero!
+                    // TODO: I think this might be wrong... we should only use one component here
+                    dest[i] = float24::FromFloat32(1.0f / sqrt(src1[i].ToFloat32()));
+                }
+
+                break;
+            }
+
+            case OpCode::Id::MOVA:
+            {
+                for (int i = 0; i < 2; ++i) {
+                    if (!swizzle.DestComponentEnabled(i))
+                        continue;
+
+                    // TODO: Figure out how the rounding is done on hardware
+                    state.address_registers[i] = static_cast<s32>(src1[i].ToFloat32());
+                }
+
+                break;
+            }
+
+            case OpCode::Id::MOV:
+            {
+                for (int i = 0; i < 4; ++i) {
+                    if (!swizzle.DestComponentEnabled(i))
+                        continue;
+
+                    dest[i] = src1[i];
+                }
+                break;
+            }
+
+            case OpCode::Id::SLT:
+            case OpCode::Id::SLTI:
+                for (int i = 0; i < 4; ++i) {
+                    if (!swizzle.DestComponentEnabled(i))
+                        continue;
+
+                    dest[i] = (src1[i] < src2[i]) ? float24::FromFloat32(1.0f) : float24::FromFloat32(0.0f);
+                }
+                break;
+
+            case OpCode::Id::CMP:
+                for (int i = 0; i < 2; ++i) {
+                    // TODO: Can you restrict to one compare via dest masking?
+
+                    auto compare_op = instr.common.compare_op;
+                    auto op = (i == 0) ? compare_op.x.Value() : compare_op.y.Value();
+
+                    switch (op) {
+                    case compare_op.Equal:
+                        state.conditional_code[i] = (src1[i] == src2[i]);
+                        break;
+
+                    case compare_op.NotEqual:
+                        state.conditional_code[i] = (src1[i] != src2[i]);
+                        break;
+
+                    case compare_op.LessThan:
+                        state.conditional_code[i] = (src1[i] < src2[i]);
+                        break;
+
+                    case compare_op.LessEqual:
+                        state.conditional_code[i] = (src1[i] <= src2[i]);
+                        break;
+
+                    case compare_op.GreaterThan:
+                        state.conditional_code[i] = (src1[i] > src2[i]);
+                        break;
+
+                    case compare_op.GreaterEqual:
+                        state.conditional_code[i] = (src1[i] >= src2[i]);
+                        break;
+
+                    default:
+                        LOG_ERROR(HW_GPU, "Unknown compare mode %x", static_cast<int>(op));
+                        break;
+                    }
+                }
+                break;
+
+            default:
+                LOG_ERROR(HW_GPU, "Unhandled arithmetic instruction: 0x%02x (%s): 0x%08x",
+                          (int)instr.opcode.Value().EffectiveOpCode(), instr.opcode.Value().GetInfo().name, instr.hex);
+                DEBUG_ASSERT(false);
+                break;
+            }
+
+            break;
+        }
+
+        case OpCode::Type::MultiplyAdd:
+        {
+            if ((instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD) ||
+                (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI)) {
+                const SwizzlePattern& swizzle = *(SwizzlePattern*)&swizzle_data[instr.mad.operand_desc_id];
+
+                bool is_inverted = (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI);
+
+                const float24* src1_ = LookupSourceRegister(instr.mad.GetSrc1(is_inverted));
+                const float24* src2_ = LookupSourceRegister(instr.mad.GetSrc2(is_inverted));
+                const float24* src3_ = LookupSourceRegister(instr.mad.GetSrc3(is_inverted));
+
+                const bool negate_src1 = ((bool)swizzle.negate_src1 != false);
+                const bool negate_src2 = ((bool)swizzle.negate_src2 != false);
+                const bool negate_src3 = ((bool)swizzle.negate_src3 != false);
+
+                float24 src1[4] = {
+                    src1_[(int)swizzle.GetSelectorSrc1(0)],
+                    src1_[(int)swizzle.GetSelectorSrc1(1)],
+                    src1_[(int)swizzle.GetSelectorSrc1(2)],
+                    src1_[(int)swizzle.GetSelectorSrc1(3)],
+                };
+                if (negate_src1) {
+                    src1[0] = src1[0] * float24::FromFloat32(-1);
+                    src1[1] = src1[1] * float24::FromFloat32(-1);
+                    src1[2] = src1[2] * float24::FromFloat32(-1);
+                    src1[3] = src1[3] * float24::FromFloat32(-1);
+                }
+                float24 src2[4] = {
+                    src2_[(int)swizzle.GetSelectorSrc2(0)],
+                    src2_[(int)swizzle.GetSelectorSrc2(1)],
+                    src2_[(int)swizzle.GetSelectorSrc2(2)],
+                    src2_[(int)swizzle.GetSelectorSrc2(3)],
+                };
+                if (negate_src2) {
+                    src2[0] = src2[0] * float24::FromFloat32(-1);
+                    src2[1] = src2[1] * float24::FromFloat32(-1);
+                    src2[2] = src2[2] * float24::FromFloat32(-1);
+                    src2[3] = src2[3] * float24::FromFloat32(-1);
+                }
+                float24 src3[4] = {
+                    src3_[(int)swizzle.GetSelectorSrc3(0)],
+                    src3_[(int)swizzle.GetSelectorSrc3(1)],
+                    src3_[(int)swizzle.GetSelectorSrc3(2)],
+                    src3_[(int)swizzle.GetSelectorSrc3(3)],
+                };
+                if (negate_src3) {
+                    src3[0] = src3[0] * float24::FromFloat32(-1);
+                    src3[1] = src3[1] * float24::FromFloat32(-1);
+                    src3[2] = src3[2] * float24::FromFloat32(-1);
+                    src3[3] = src3[3] * float24::FromFloat32(-1);
+                }
+
+                float24* dest = (instr.mad.dest.Value() < 0x10) ? &state.registers.output[instr.mad.dest.Value().GetIndex()][0]
+                              : (instr.mad.dest.Value() < 0x20) ? &state.registers.temporary[instr.mad.dest.Value().GetIndex()][0]
+                              : dummy_vec4_float24;
+
+                for (int i = 0; i < 4; ++i) {
+                    if (!swizzle.DestComponentEnabled(i))
+                        continue;
+
+                    dest[i] = src1[i] * src2[i] + src3[i];
+                }
+            } else {
+                LOG_ERROR(HW_GPU, "Unhandled multiply-add instruction: 0x%02x (%s): 0x%08x",
+                          (int)instr.opcode.Value().EffectiveOpCode(), instr.opcode.Value().GetInfo().name, instr.hex);
+            }
+            break;
+        }
+
+        default:
+        {
+            static auto evaluate_condition = [](const UnitState& state, bool refx, bool refy, Instruction::FlowControlType flow_control) {
+                bool results[2] = { refx == state.conditional_code[0],
+                                    refy == state.conditional_code[1] };
+
+                switch (flow_control.op) {
+                case flow_control.Or:
+                    return results[0] || results[1];
+
+                case flow_control.And:
+                    return results[0] && results[1];
+
+                case flow_control.JustX:
+                    return results[0];
+
+                case flow_control.JustY:
+                    return results[1];
+                }
+            };
+
+            // Handle each instruction on its own
+            switch (instr.opcode.Value()) {
+            case OpCode::Id::END:
+                exit_loop = true;
+                break;
+
+            case OpCode::Id::JMPC:
+                if (evaluate_condition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) {
+                    state.program_counter = instr.flow_control.dest_offset - 1;
+                }
+                break;
+
+            case OpCode::Id::JMPU:
+                if (uniforms.b[instr.flow_control.bool_uniform_id]) {
+                    state.program_counter = instr.flow_control.dest_offset - 1;
+                }
+                break;
+
+            case OpCode::Id::CALL:
+                call(state,
+                     instr.flow_control.dest_offset,
+                     instr.flow_control.num_instructions,
+                     state.program_counter + 1, 0, 0);
+                break;
+
+            case OpCode::Id::CALLU:
+                if (uniforms.b[instr.flow_control.bool_uniform_id]) {
+                    call(state,
+                         instr.flow_control.dest_offset,
+                         instr.flow_control.num_instructions,
+                         state.program_counter + 1, 0, 0);
+                }
+                break;
+
+            case OpCode::Id::CALLC:
+                if (evaluate_condition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) {
+                    call(state,
+                         instr.flow_control.dest_offset,
+                         instr.flow_control.num_instructions,
+                         state.program_counter + 1, 0, 0);
+                }
+                break;
+
+            case OpCode::Id::NOP:
+                break;
+
+            case OpCode::Id::IFU:
+                if (uniforms.b[instr.flow_control.bool_uniform_id]) {
+                    call(state,
+                         state.program_counter + 1,
+                         instr.flow_control.dest_offset - state.program_counter - 1,
+                         instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
+                } else {
+                    call(state,
+                         instr.flow_control.dest_offset,
+                         instr.flow_control.num_instructions,
+                         instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
+                }
+
+                break;
+
+            case OpCode::Id::IFC:
+            {
+                // TODO: Do we need to consider swizzlers here?
+
+                if (evaluate_condition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) {
+                    call(state,
+                         state.program_counter + 1,
+                         instr.flow_control.dest_offset - state.program_counter - 1,
+                         instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
+                } else {
+                    call(state,
+                         instr.flow_control.dest_offset,
+                         instr.flow_control.num_instructions,
+                         instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
+                }
+
+                break;
+            }
+
+            case OpCode::Id::LOOP:
+            {
+                state.address_registers[2] = uniforms.i[instr.flow_control.int_uniform_id].y;
+
+                call(state,
+                     state.program_counter + 1,
+                     instr.flow_control.dest_offset - state.program_counter + 1,
+                     instr.flow_control.dest_offset + 1,
+                     uniforms.i[instr.flow_control.int_uniform_id].x,
+                     uniforms.i[instr.flow_control.int_uniform_id].z);
+                break;
+            }
+
+            default:
+                LOG_ERROR(HW_GPU, "Unhandled instruction: 0x%02x (%s): 0x%08x",
+                          (int)instr.opcode.Value().EffectiveOpCode(), instr.opcode.Value().GetInfo().name, instr.hex);
+                break;
+            }
+
+            break;
+        }
+        }
+
+        ++state.program_counter;
+
+        if (exit_loop)
+            break;
+    }
+}
+
+} // namespace Shader
+
+} // namespace Pica
```
```diff
diff --git a/src/video_core/shader/shader_interpreter.h b/src/video_core/shader/shader_interpreter.h
new file mode 100644
index 000000000..ad6e58e39
--- /dev/null
+++ b/src/video_core/shader/shader_interpreter.h
@@ -0,0 +1,19 @@
+// Copyright 2014 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "video_core/pica.h"
+
+#include "shader.h"
+
+namespace Pica {
+
+namespace Shader {
+
+void RunInterpreter(UnitState& state);
+
+} // namespace Shader
+
+} // namespace Pica
```
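The subtlest part of the interpreter is how `LOOP` drives the call stack: the integer uniform's x component holds the iteration count minus one, y seeds the loop counter aL (`address_registers[2]`), and z is added to aL every time the program counter reaches `final_address`. Here is a plain C++ sketch of those semantics outside the interpreter's machinery; the uniform values are made up for illustration.

```cpp
#include <cstdio>

int main() {
    // One Pica integer uniform as consumed by LOOP above:
    // x = iteration count - 1, y = initial aL, z = per-iteration aL increment.
    struct { unsigned char x, y, z; } i = { 2, 4, 3 };

    int aL = i.y;                              // state.address_registers[2]
    for (int iter = 0; iter <= i.x; ++iter) {  // repeat_counter: runs x + 1 times
        std::printf("iteration %d: aL = %d\n", iter, aL);
        // ...loop body would use aL for relative register addressing here...
        aL += i.z;  // applied when the PC reaches final_address, even on the last pass
    }
}
```

This prints aL = 4, 7, 10 over three iterations, matching the `repeat_counter--`/`loop_increment` handling at the top of `RunInterpreter`'s main loop.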
```diff
diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
new file mode 100644
index 000000000..ce47774d5
--- /dev/null
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -0,0 +1,675 @@
+// Copyright 2015 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <smmintrin.h>
+
+#include "common/x64/abi.h"
+#include "common/x64/cpu_detect.h"
+#include "common/x64/emitter.h"
+
+#include "shader.h"
+#include "shader_jit_x64.h"
+
+namespace Pica {
+
+namespace Shader {
+
+using namespace Gen;
+
+typedef void (JitCompiler::*JitFunction)(Instruction instr);
+
+const JitFunction instr_table[64] = {
+    &JitCompiler::Compile_ADD,    // add
+    &JitCompiler::Compile_DP3,    // dp3
+    &JitCompiler::Compile_DP4,    // dp4
+    nullptr,                      // dph
+    nullptr,                      // unknown
+    nullptr,                      // ex2
+    nullptr,                      // lg2
+    nullptr,                      // unknown
+    &JitCompiler::Compile_MUL,    // mul
+    nullptr,                      // lge
+    nullptr,                      // slt
+    &JitCompiler::Compile_FLR,    // flr
+    &JitCompiler::Compile_MAX,    // max
+    &JitCompiler::Compile_MIN,    // min
+    &JitCompiler::Compile_RCP,    // rcp
+    &JitCompiler::Compile_RSQ,    // rsq
+    nullptr,                      // unknown
+    nullptr,                      // unknown
+    &JitCompiler::Compile_MOVA,   // mova
+    &JitCompiler::Compile_MOV,    // mov
+    nullptr,                      // unknown
+    nullptr,                      // unknown
+    nullptr,                      // unknown
+    nullptr,                      // unknown
+    nullptr,                      // dphi
+    nullptr,                      // unknown
+    nullptr,                      // sgei
+    &JitCompiler::Compile_SLTI,   // slti
+    nullptr,                      // unknown
+    nullptr,                      // unknown
+    nullptr,                      // unknown
+    nullptr,                      // unknown
+    nullptr,                      // unknown
+    &JitCompiler::Compile_NOP,    // nop
+    &JitCompiler::Compile_END,    // end
+    nullptr,                      // break
+    &JitCompiler::Compile_CALL,   // call
+    &JitCompiler::Compile_CALLC,  // callc
+    &JitCompiler::Compile_CALLU,  // callu
+    &JitCompiler::Compile_IF,     // ifu
+    &JitCompiler::Compile_IF,     // ifc
+    &JitCompiler::Compile_LOOP,   // loop
+    nullptr,                      // emit
+    nullptr,                      // sete
+    &JitCompiler::Compile_JMP,    // jmpc
+    &JitCompiler::Compile_JMP,    // jmpu
+    &JitCompiler::Compile_CMP,    // cmp
+    &JitCompiler::Compile_CMP,    // cmp
+    &JitCompiler::Compile_MAD,    // madi
+    &JitCompiler::Compile_MAD,    // madi
+    &JitCompiler::Compile_MAD,    // madi
+    &JitCompiler::Compile_MAD,    // madi
+    &JitCompiler::Compile_MAD,    // madi
+    &JitCompiler::Compile_MAD,    // madi
+    &JitCompiler::Compile_MAD,    // madi
+    &JitCompiler::Compile_MAD,    // madi
+    &JitCompiler::Compile_MAD,    // mad
+    &JitCompiler::Compile_MAD,    // mad
+    &JitCompiler::Compile_MAD,    // mad
+    &JitCompiler::Compile_MAD,    // mad
+    &JitCompiler::Compile_MAD,    // mad
+    &JitCompiler::Compile_MAD,    // mad
+    &JitCompiler::Compile_MAD,    // mad
+    &JitCompiler::Compile_MAD,    // mad
+};
+
+// The following is used to alias some commonly used registers. Generally, RAX-RDX and XMM0-XMM3 can
+// be used as scratch registers within a compiler function. The other registers have designated
+// purposes, as documented below:
+
+/// Pointer to the uniform memory
+static const X64Reg UNIFORMS = R9;
+/// The two 32-bit VS address offset registers set by the MOVA instruction
+static const X64Reg ADDROFFS_REG_0 = R10;
+static const X64Reg ADDROFFS_REG_1 = R11;
+/// VS loop count register
+static const X64Reg LOOPCOUNT_REG = R12;
+/// Current VS loop iteration number (we could probably use LOOPCOUNT_REG, but this is quicker)
+static const X64Reg LOOPCOUNT = RSI;
+/// Number to increment LOOPCOUNT_REG by on each loop iteration
+static const X64Reg LOOPINC = RDI;
+/// Result of the previous CMP instruction for the X-component comparison
+static const X64Reg COND0 = R13;
+/// Result of the previous CMP instruction for the Y-component comparison
+static const X64Reg COND1 = R14;
+/// Pointer to the UnitState instance for the current VS unit
+static const X64Reg REGISTERS = R15;
+/// SIMD scratch register
+static const X64Reg SCRATCH = XMM0;
+/// Loaded with the first swizzled source register, otherwise can be used as a scratch register
+static const X64Reg SRC1 = XMM1;
+/// Loaded with the second swizzled source register, otherwise can be used as a scratch register
+static const X64Reg SRC2 = XMM2;
+/// Loaded with the third swizzled source register, otherwise can be used as a scratch register
+static const X64Reg SRC3 = XMM3;
+/// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one
+static const X64Reg ONE = XMM14;
+/// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR
+static const X64Reg NEGBIT = XMM15;
+
+/// Raw constant for the source register selector that indicates no swizzling is performed
+static const u8 NO_SRC_REG_SWIZZLE = 0x1b;
+/// Raw constant for the destination register enable mask that indicates all components are enabled
+static const u8 NO_DEST_REG_MASK = 0xf;
+
+/**
+ * Loads and swizzles a source register into the specified XMM register.
+ * @param instr VS instruction, used for determining how to load the source register
+ * @param src_num Number indicating which source register to load (1 = src1, 2 = src2, 3 = src3)
+ * @param src_reg SourceRegister object corresponding to the source register to load
+ * @param dest Destination XMM register to store the loaded, swizzled source register
+ */
+void JitCompiler::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, X64Reg dest) {
+    X64Reg src_ptr;
+    int src_offset;
+
+    if (src_reg.GetRegisterType() == RegisterType::FloatUniform) {
+        src_ptr = UNIFORMS;
+        src_offset = src_reg.GetIndex() * sizeof(float24) * 4;
+    } else {
+        src_ptr = REGISTERS;
+        src_offset = UnitState::InputOffset(src_reg);
+    }
+
+    unsigned operand_desc_id;
+    if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD ||
+        instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) {
+        // The MAD and MADI instructions do not use the address offset registers, so loading the
+        // source is a bit simpler here
+
+        operand_desc_id = instr.mad.operand_desc_id;
+
+        // Load the source
+        MOVAPS(dest, MDisp(src_ptr, src_offset));
+    } else {
+        operand_desc_id = instr.common.operand_desc_id;
+
+        const bool is_inverted = (0 != (instr.opcode.Value().GetInfo().subtype & OpCode::Info::SrcInversed));
+        unsigned offset_src = is_inverted ? 2 : 1;
+
+        if (src_num == offset_src && instr.common.address_register_index != 0) {
+            switch (instr.common.address_register_index) {
+            case 1: // address offset 1
+                MOVAPS(dest, MComplex(src_ptr, ADDROFFS_REG_0, 1, src_offset));
+                break;
+            case 2: // address offset 2
+                MOVAPS(dest, MComplex(src_ptr, ADDROFFS_REG_1, 1, src_offset));
+                break;
+            case 3: // address offset 3
+                MOVAPS(dest, MComplex(src_ptr, LOOPCOUNT_REG, 1, src_offset));
+                break;
+            default:
+                UNREACHABLE();
+                break;
+            }
+        } else {
+            // Load the source
+            MOVAPS(dest, MDisp(src_ptr, src_offset));
+        }
+    }
+
+    SwizzlePattern swiz = { g_state.vs.swizzle_data[operand_desc_id] };
+
+    // Generate instructions for source register swizzling as needed
+    u8 sel = swiz.GetRawSelector(src_num);
+    if (sel != NO_SRC_REG_SWIZZLE) {
+        // Selector component order needs to be reversed for the SHUFPS instruction
+        sel = ((sel & 0xc0) >> 6) | ((sel & 3) << 6) | ((sel & 0xc) << 2) | ((sel & 0x30) >> 2);
+
+        // Shuffle inputs for swizzle
+        SHUFPS(dest, R(dest), sel);
+    }
+
+    // If the source register should be negated, flip the negative bit using XOR
+    const bool negate[] = { swiz.negate_src1, swiz.negate_src2, swiz.negate_src3 };
+    if (negate[src_num - 1]) {
+        XORPS(dest, R(NEGBIT));
+    }
+}
+
+void JitCompiler::Compile_DestEnable(Instruction instr, X64Reg src) {
+    DestRegister dest;
+    unsigned operand_desc_id;
+    if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD ||
+        instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) {
+        operand_desc_id = instr.mad.operand_desc_id;
+        dest = instr.mad.dest.Value();
+    } else {
+        operand_desc_id = instr.common.operand_desc_id;
+        dest = instr.common.dest.Value();
+    }
+
+    SwizzlePattern swiz = { g_state.vs.swizzle_data[operand_desc_id] };
+
+    // If all components are enabled, write the result to the destination register
+    if (swiz.dest_mask == NO_DEST_REG_MASK) {
+        // Store dest back to memory
+        MOVAPS(MDisp(REGISTERS, UnitState::OutputOffset(dest)), src);
+
+    } else {
+        // Not all components are enabled, so mask the result when storing to the destination register...
+        MOVAPS(SCRATCH, MDisp(REGISTERS, UnitState::OutputOffset(dest)));
+
+        if (Common::GetCPUCaps().sse4_1) {
+            u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1);
+            BLENDPS(SCRATCH, R(src), mask);
+        } else {
+            MOVAPS(XMM4, R(src));
+            UNPCKHPS(XMM4, R(SCRATCH)); // Unpack Z/W components of source and destination
+            UNPCKLPS(SCRATCH, R(src));  // Unpack X/Y components of source and destination
+
+            // Compute selector to selectively copy source components to destination for SHUFPS instruction
+            u8 sel = ((swiz.DestComponentEnabled(0) ? 1 : 0) << 0) |
+                     ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) |
+                     ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) |
+                     ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6);
+            SHUFPS(SCRATCH, R(XMM4), sel);
+        }
+
+        // Store dest back to memory
+        MOVAPS(MDisp(REGISTERS, UnitState::OutputOffset(dest)), SCRATCH);
+    }
+}
+
+void JitCompiler::Compile_EvaluateCondition(Instruction instr) {
+    // Note: NXOR is used below to check for equality
+    switch (instr.flow_control.op) {
+    case Instruction::FlowControlType::Or:
+        MOV(32, R(RAX), R(COND0));
+        MOV(32, R(RBX), R(COND1));
+        XOR(32, R(RAX), Imm32(instr.flow_control.refx.Value() ^ 1));
+        XOR(32, R(RBX), Imm32(instr.flow_control.refy.Value() ^ 1));
+        OR(32, R(RAX), R(RBX));
+        break;
+
+    case Instruction::FlowControlType::And:
+        MOV(32, R(RAX), R(COND0));
+        MOV(32, R(RBX), R(COND1));
+        XOR(32, R(RAX), Imm32(instr.flow_control.refx.Value() ^ 1));
+        XOR(32, R(RBX), Imm32(instr.flow_control.refy.Value() ^ 1));
+        AND(32, R(RAX), R(RBX));
+        break;
+
+    case Instruction::FlowControlType::JustX:
+        MOV(32, R(RAX), R(COND0));
+        XOR(32, R(RAX), Imm32(instr.flow_control.refx.Value() ^ 1));
+        break;
+
+    case Instruction::FlowControlType::JustY:
+        MOV(32, R(RAX), R(COND1));
+        XOR(32, R(RAX), Imm32(instr.flow_control.refy.Value() ^ 1));
+        break;
+    }
+}
+
+void JitCompiler::Compile_UniformCondition(Instruction instr) {
+    int offset = offsetof(decltype(g_state.vs.uniforms), b) + (instr.flow_control.bool_uniform_id * sizeof(bool));
+    CMP(sizeof(bool) * 8, MDisp(UNIFORMS, offset), Imm8(0));
+}
+
+void JitCompiler::Compile_ADD(Instruction instr) {
+    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+    ADDPS(SRC1, R(SRC2));
+    Compile_DestEnable(instr, SRC1);
+}
+
+void JitCompiler::Compile_DP3(Instruction instr) {
+    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+
+    if (Common::GetCPUCaps().sse4_1) {
+        DPPS(SRC1, R(SRC2), 0x7f);
+    } else {
+        MULPS(SRC1, R(SRC2));
+
+        MOVAPS(SRC2, R(SRC1));
+        SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1));
+
+        MOVAPS(SRC3, R(SRC1));
+        SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2));
+
+        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0));
+        ADDPS(SRC1, R(SRC2));
+        ADDPS(SRC1, R(SRC3));
+    }
+
+    Compile_DestEnable(instr, SRC1);
+}
+
+void JitCompiler::Compile_DP4(Instruction instr) {
+    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+
+    if (Common::GetCPUCaps().sse4_1) {
+        DPPS(SRC1, R(SRC2), 0xff);
+    } else {
+        MULPS(SRC1, R(SRC2));
+
+        MOVAPS(SRC2, R(SRC1));
+        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
+        ADDPS(SRC1, R(SRC2));
+
+        MOVAPS(SRC2, R(SRC1));
+        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
+        ADDPS(SRC1, R(SRC2));
+    }
+
+    Compile_DestEnable(instr, SRC1);
+}
+
+void JitCompiler::Compile_MUL(Instruction instr) {
+    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+    MULPS(SRC1, R(SRC2));
+    Compile_DestEnable(instr, SRC1);
+}
+
+void JitCompiler::Compile_FLR(Instruction instr) {
+    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+
+    if (Common::GetCPUCaps().sse4_1) {
+        ROUNDFLOORPS(SRC1, R(SRC1));
+    } else {
+        // Note: CVTPS2DQ rounds to nearest under the default MXCSR mode rather than
+        // flooring, so this fallback is only an approximation of FLR
+        CVTPS2DQ(SRC1, R(SRC1));
+        CVTDQ2PS(SRC1, R(SRC1));
+    }
+
+    Compile_DestEnable(instr, SRC1);
+}
+
+void JitCompiler::Compile_MAX(Instruction instr) {
+    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+    MAXPS(SRC1, R(SRC2));
+    Compile_DestEnable(instr, SRC1);
+}
+
+void JitCompiler::Compile_MIN(Instruction instr) {
+    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+    MINPS(SRC1, R(SRC2));
+    Compile_DestEnable(instr, SRC1);
+}
+
+void JitCompiler::Compile_MOVA(Instruction instr) {
+    SwizzlePattern swiz = { g_state.vs.swizzle_data[instr.common.operand_desc_id] };
+
+    if (!swiz.DestComponentEnabled(0) && !swiz.DestComponentEnabled(1)) {
+        return; // NoOp
+    }
+
+    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+
+    // Convert floats to integers (only care about X and Y components)
+    CVTPS2DQ(SRC1, R(SRC1));
+
+    // Get result
+    MOVQ_xmm(R(RAX), SRC1);
+
+    // Handle destination enable
+    if (swiz.DestComponentEnabled(0) && swiz.DestComponentEnabled(1)) {
+        // Move and sign-extend low 32 bits
+        MOVSX(64, 32, ADDROFFS_REG_0, R(RAX));
+
+        // Move and sign-extend high 32 bits
+        SHR(64, R(RAX), Imm8(32));
+        MOVSX(64, 32, ADDROFFS_REG_1, R(RAX));
+
+        // Multiply by 16 to be used as an offset later
+        SHL(64, R(ADDROFFS_REG_0), Imm8(4));
+        SHL(64, R(ADDROFFS_REG_1), Imm8(4));
+    } else {
+        if (swiz.DestComponentEnabled(0)) {
+            // Move and sign-extend low 32 bits
+            MOVSX(64, 32, ADDROFFS_REG_0, R(RAX));
+
+            // Multiply by 16 to be used as an offset later
+            SHL(64, R(ADDROFFS_REG_0), Imm8(4));
+        } else if (swiz.DestComponentEnabled(1)) {
+            // Move and sign-extend high 32 bits
+            SHR(64, R(RAX), Imm8(32));
+            MOVSX(64, 32, ADDROFFS_REG_1, R(RAX));
+
+            // Multiply by 16 to be used as an offset later
+            SHL(64, R(ADDROFFS_REG_1), Imm8(4));
+        }
+    }
+}
+
+void JitCompiler::Compile_MOV(Instruction instr) {
+    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+    Compile_DestEnable(instr, SRC1);
+}
+
+void JitCompiler::Compile_SLTI(Instruction instr) {
+    Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
+    Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2);
+
+    CMPSS(SRC1, R(SRC2), CMP_LT);
+    ANDPS(SRC1, R(ONE));
+
+    Compile_DestEnable(instr, SRC1);
+}
+
+void JitCompiler::Compile_RCP(Instruction instr) {
+    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+
+    // TODO(bunnei): RCPPS is a pretty rough approximation, this might cause problems if Pica
+    // performs this operation more accurately. This should be checked on hardware.
+    RCPPS(SRC1, R(SRC1));
+
+    Compile_DestEnable(instr, SRC1);
+}
+
+void JitCompiler::Compile_RSQ(Instruction instr) {
+    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+
+    // TODO(bunnei): RSQRTPS is a pretty rough approximation, this might cause problems if Pica
+    // performs this operation more accurately. This should be checked on hardware.
+    RSQRTPS(SRC1, R(SRC1));
+
+    Compile_DestEnable(instr, SRC1);
+}
+
+void JitCompiler::Compile_NOP(Instruction instr) {
+}
+
+void JitCompiler::Compile_END(Instruction instr) {
+    ABI_PopAllCalleeSavedRegsAndAdjustStack();
+    RET();
+}
+
+void JitCompiler::Compile_CALL(Instruction instr) {
+    unsigned offset = instr.flow_control.dest_offset;
+    while (offset < (instr.flow_control.dest_offset + instr.flow_control.num_instructions)) {
+        Compile_NextInstr(&offset);
+    }
+}
+
+void JitCompiler::Compile_CALLC(Instruction instr) {
+    Compile_EvaluateCondition(instr);
+    FixupBranch b = J_CC(CC_Z, true);
+    Compile_CALL(instr);
+    SetJumpTarget(b);
+}
+
+void JitCompiler::Compile_CALLU(Instruction instr) {
+    Compile_UniformCondition(instr);
+    FixupBranch b = J_CC(CC_Z, true);
+    Compile_CALL(instr);
+    SetJumpTarget(b);
+}
+
+void JitCompiler::Compile_CMP(Instruction instr) {
+    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
+    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
+
+    static const u8 cmp[] = { CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_NLE, CMP_NLT };
+
+    if (instr.common.compare_op.x == instr.common.compare_op.y) {
+        // Compare X-component and Y-component together
+        CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.x]);
+
+        MOVQ_xmm(R(COND0), SRC1);
+        MOV(64, R(COND1), R(COND0));
+    } else {
+        // Compare X-component
+        MOVAPS(SCRATCH, R(SRC1));
+        CMPSS(SCRATCH, R(SRC2), cmp[instr.common.compare_op.x]);
+
+        // Compare Y-component
+        CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.y]);
+
+        MOVQ_xmm(R(COND0), SCRATCH);
+        MOVQ_xmm(R(COND1), SRC1);
+    }
+
+    SHR(32, R(COND0), Imm8(31));
+    SHR(64, R(COND1), Imm8(63));
+}
+
+void JitCompiler::Compile_MAD(Instruction instr) {
+    Compile_SwizzleSrc(instr, 1, instr.mad.src1, SRC1);
+
+    if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) {
+        Compile_SwizzleSrc(instr, 2, instr.mad.src2i, SRC2);
+        Compile_SwizzleSrc(instr, 3, instr.mad.src3i, SRC3);
+    } else {
+        Compile_SwizzleSrc(instr, 2, instr.mad.src2, SRC2);
+        Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3);
+    }
+
+    if (Common::GetCPUCaps().fma) {
+        VFMADD213PS(SRC1, SRC2, R(SRC3));
+    } else {
+        MULPS(SRC1, R(SRC2));
+        ADDPS(SRC1, R(SRC3));
+    }
+
+    Compile_DestEnable(instr, SRC1);
+}
+
+void JitCompiler::Compile_IF(Instruction instr) {
+    ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards if-statements not supported");
+
+    // Evaluate the "IF" condition
+    if (instr.opcode.Value() == OpCode::Id::IFU) {
+        Compile_UniformCondition(instr);
+    } else if (instr.opcode.Value() == OpCode::Id::IFC) {
+        Compile_EvaluateCondition(instr);
+    }
+    FixupBranch b = J_CC(CC_Z, true);
+
+    // Compile the code that corresponds to the condition evaluating as true
+    Compile_Block(instr.flow_control.dest_offset - 1);
+
+    // If there isn't an "ELSE" condition, we are done here
+    if (instr.flow_control.num_instructions == 0) {
+        SetJumpTarget(b);
+        return;
+    }
+
+    FixupBranch b2 = J(true);
+
+    SetJumpTarget(b);
+
+    // This code corresponds to the "ELSE" condition
+    // Compile the code that corresponds to the condition evaluating as false
+    Compile_Block(instr.flow_control.dest_offset + instr.flow_control.num_instructions - 1);
+
+    SetJumpTarget(b2);
+}
+
+void JitCompiler::Compile_LOOP(Instruction instr) {
+    ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards loops not supported");
+    ASSERT_MSG(!looping, "Nested loops not supported");
+
+    looping = true;
+
+    int offset = offsetof(decltype(g_state.vs.uniforms), i) + (instr.flow_control.int_uniform_id * sizeof(Math::Vec4<u8>));
+    MOV(32, R(LOOPCOUNT), MDisp(UNIFORMS, offset));
+    MOV(32, R(LOOPCOUNT_REG), R(LOOPCOUNT));
+    SHR(32, R(LOOPCOUNT_REG), Imm8(8));
+    AND(32, R(LOOPCOUNT_REG), Imm32(0xff)); // Y-component is the start
+    MOV(32, R(LOOPINC), R(LOOPCOUNT));
+    SHR(32, R(LOOPINC), Imm8(16));
+    MOVZX(32, 8, LOOPINC, R(LOOPINC));      // Z-component is the incrementer
+    MOVZX(32, 8, LOOPCOUNT, R(LOOPCOUNT));  // X-component is iteration count
+    ADD(32, R(LOOPCOUNT), Imm8(1));         // Iteration count is X-component + 1
+
+    auto loop_start = GetCodePtr();
+
+    Compile_Block(instr.flow_control.dest_offset);
+
+    ADD(32, R(LOOPCOUNT_REG), R(LOOPINC));  // Increment LOOPCOUNT_REG by Z-component
+    SUB(32, R(LOOPCOUNT), Imm8(1));         // Decrement loop count by 1
+    J_CC(CC_NZ, loop_start);                // Loop if not zero
+
+    looping = false;
+}
+
+void JitCompiler::Compile_JMP(Instruction instr) {
+    ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards jumps not supported");
+
+    if (instr.opcode.Value() == OpCode::Id::JMPC)
+        Compile_EvaluateCondition(instr);
+    else if (instr.opcode.Value() == OpCode::Id::JMPU)
+        Compile_UniformCondition(instr);
+    else
+        UNREACHABLE();
+
+    FixupBranch b = J_CC(CC_NZ, true);
+
+    Compile_Block(instr.flow_control.dest_offset);
+
+    SetJumpTarget(b);
+}
+
+void JitCompiler::Compile_Block(unsigned stop) {
+    // Save current offset pointer
+    unsigned* prev_offset_ptr = offset_ptr;
+    unsigned offset = *prev_offset_ptr;
+
+    while (offset <= stop)
+        Compile_NextInstr(&offset);
+
+    // Restore current offset pointer
+    offset_ptr = prev_offset_ptr;
+    *offset_ptr = offset;
+}
+
+void JitCompiler::Compile_NextInstr(unsigned* offset) {
+    offset_ptr = offset;
+
+    Instruction instr = *(Instruction*)&g_state.vs.program_code[(*offset_ptr)++];
+    OpCode::Id opcode = instr.opcode.Value();
+    auto instr_func = instr_table[static_cast<unsigned>(opcode)];
+
+    if (instr_func) {
+        // JIT the instruction!
+        ((*this).*instr_func)(instr);
+    } else {
+        // Unhandled instruction
+        LOG_CRITICAL(HW_GPU, "Unhandled instruction: 0x%02x (0x%08x)", instr.opcode.Value(), instr.hex);
+    }
+}
+
+CompiledShader* JitCompiler::Compile() {
+    const u8* start = GetCodePtr();
+    const auto& code = g_state.vs.program_code;
+    unsigned offset = g_state.regs.vs.main_offset;
+
+    ABI_PushAllCalleeSavedRegsAndAdjustStack();
+
+    MOV(PTRBITS, R(REGISTERS), R(ABI_PARAM1));
+    MOV(PTRBITS, R(UNIFORMS), ImmPtr(&g_state.vs.uniforms));
+
+    // Zero address/loop registers
+    XOR(64, R(ADDROFFS_REG_0), R(ADDROFFS_REG_0));
+    XOR(64, R(ADDROFFS_REG_1), R(ADDROFFS_REG_1));
+    XOR(64, R(LOOPCOUNT_REG), R(LOOPCOUNT_REG));
+
+    // Used to set a register to one
+    static const __m128 one = { 1.f, 1.f, 1.f, 1.f };
+    MOV(PTRBITS, R(RAX), ImmPtr(&one));
+    MOVAPS(ONE, MDisp(RAX, 0));
+
+    // Used to negate registers
+    static const __m128 neg = { -0.f, -0.f, -0.f, -0.f };
+    MOV(PTRBITS, R(RAX), ImmPtr(&neg));
+    MOVAPS(NEGBIT, MDisp(RAX, 0));
+
+    looping = false;
+
+    while (offset < g_state.vs.program_code.size()) {
+        Compile_NextInstr(&offset);
+    }
+
+    return (CompiledShader*)start;
+}
+
+JitCompiler::JitCompiler() {
+    AllocCodeSpace(1024 * 1024 * 4);
+}
+
+void JitCompiler::Clear() {
+    ClearCodeSpace();
+}
+
+} // namespace Shader
+
+} // namespace Pica
```
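One detail worth pulling out of `Compile_SwizzleSrc` is the selector reversal: Pica packs the x selector in the high bits of the swizzle byte, while `SHUFPS` expects it in the low bits. The bit shuffle below mirrors the line in the JIT; the `main` checks are illustrative only. Note that the identity pattern `0x1b` maps to `SHUFPS`'s identity immediate `0xe4`, which is why the JIT can skip the shuffle entirely when it sees `NO_SRC_REG_SWIZZLE`.

```cpp
#include <cassert>
#include <cstdio>

// A swizzle selector packs four 2-bit component choices. Pica orders them
// x,y,z,w from the high bits down; SHUFPS wants the opposite order, so the
// JIT reverses the four fields before emitting the shuffle.
static unsigned char ReverseSelector(unsigned char sel) {
    return ((sel & 0xc0) >> 6) | ((sel & 3) << 6) | ((sel & 0xc) << 2) | ((sel & 0x30) >> 2);
}

int main() {
    // NO_SRC_REG_SWIZZLE (0x1b) is the identity pattern x,y,z,w. After
    // reversal it becomes SHUFPS's identity immediate 0xe4 (fields w,z,y,x).
    assert(ReverseSelector(0x1b) == 0xe4);

    // Reversing the field order twice gives the original byte back.
    for (unsigned s = 0; s < 256; ++s)
        assert(ReverseSelector(ReverseSelector(static_cast<unsigned char>(s))) == s);

    std::printf("0x1b -> 0x%02x\n", ReverseSelector(0x1b));
}
```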
```diff
diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h
new file mode 100644
index 000000000..b88f2a0d2
--- /dev/null
+++ b/src/video_core/shader/shader_jit_x64.h
@@ -0,0 +1,79 @@
+// Copyright 2015 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <nihstro/shader_bytecode.h>
+
+#include "common/x64/emitter.h"
+
+#include "video_core/pica.h"
+
+#include "shader.h"
+
+using nihstro::Instruction;
+using nihstro::OpCode;
+using nihstro::SwizzlePattern;
+
+namespace Pica {
+
+namespace Shader {
+
+using CompiledShader = void(void* registers);
+
+/**
+ * This class implements the shader JIT compiler. It recompiles a Pica shader program into x86_64
+ * code that can be executed on the host machine directly.
+ */
+class JitCompiler : public Gen::XCodeBlock {
+public:
+    JitCompiler();
+
+    CompiledShader* Compile();
+
+    void Clear();
+
+    void Compile_ADD(Instruction instr);
+    void Compile_DP3(Instruction instr);
+    void Compile_DP4(Instruction instr);
+    void Compile_MUL(Instruction instr);
+    void Compile_FLR(Instruction instr);
+    void Compile_MAX(Instruction instr);
+    void Compile_MIN(Instruction instr);
+    void Compile_RCP(Instruction instr);
+    void Compile_RSQ(Instruction instr);
+    void Compile_MOVA(Instruction instr);
+    void Compile_MOV(Instruction instr);
+    void Compile_SLTI(Instruction instr);
+    void Compile_NOP(Instruction instr);
+    void Compile_END(Instruction instr);
+    void Compile_CALL(Instruction instr);
+    void Compile_CALLC(Instruction instr);
+    void Compile_CALLU(Instruction instr);
+    void Compile_IF(Instruction instr);
+    void Compile_LOOP(Instruction instr);
+    void Compile_JMP(Instruction instr);
+    void Compile_CMP(Instruction instr);
+    void Compile_MAD(Instruction instr);
+
+private:
+    void Compile_Block(unsigned stop);
+    void Compile_NextInstr(unsigned* offset);
+
+    void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, Gen::X64Reg dest);
+    void Compile_DestEnable(Instruction instr, Gen::X64Reg dest);
+
+    void Compile_EvaluateCondition(Instruction instr);
+    void Compile_UniformCondition(Instruction instr);
+
+    /// Pointer to the variable that stores the current Pica code offset. Used to handle nested code blocks.
+    unsigned* offset_ptr = nullptr;
+
+    /// Set to true if currently in a loop, used to check for the existence of nested loops
+    bool looping = false;
+};
+
+} // namespace Shader
+
+} // namespace Pica
```
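Finally, note that `CompiledShader` is a function *type*, not a pointer type, so `Compile()` can return its emitted code buffer as a plain `CompiledShader*` and `shader.cpp` can call it directly as `jit_shader(&state.registers)`. A runnable sketch of that calling convention, with a hand-written function standing in for the JIT-emitted code:

```cpp
#include <cstdio>

// Mirrors `using CompiledShader = void(void* registers)` from the header:
// a function type whose pointer is an ordinary function pointer.
using CompiledShader = void(void* registers);

struct Registers { float input[4]; float output[4]; };

// Hypothetical stand-in for a JIT-emitted routine: copies input to output.
static void FakeCompiledShader(void* registers) {
    auto* regs = static_cast<Registers*>(registers);
    for (int i = 0; i < 4; ++i)
        regs->output[i] = regs->input[i];
}

int main() {
    CompiledShader* shader = FakeCompiledShader; // as returned by Compile()
    Registers regs = {{1.f, 2.f, 3.f, 4.f}, {}};
    shader(&regs);                               // same call shape as jit_shader(&state.registers)
    std::printf("output[2] = %g\n", regs.output[2]);
}
```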
