| author | bunnei <bunneidev@gmail.com> | 2023-06-12 12:46:54 -0700 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2023-06-12 12:46:54 -0700 |
| commit | ad8f122ab106a474bee26ca2f638cfe51256eb6e | |
| tree | 689bbdec246df545970abaea91d196b7a94d6bb7 | |
| parent | 333f792e10d244b8568d0dbb3abfea242a016698 | |
| parent | 2f1e87dd83b4ce4e6ea12f985b6da49829d821e4 | |
Merge pull request #10693 from liamwhite/f64-to-f32
shader_recompiler: translate f64 to f32 when unsupported on host
| Mode | File | Lines |
|---|---|---|
| -rw-r--r-- | src/shader_recompiler/CMakeLists.txt | 1 |
| -rw-r--r-- | src/shader_recompiler/frontend/maxwell/translate_program.cpp | 3 |
| -rw-r--r-- | src/shader_recompiler/host_translate_info.h | 1 |
| -rw-r--r-- | src/shader_recompiler/ir_opt/lower_fp64_to_fp32.cpp | 185 |
| -rw-r--r-- | src/shader_recompiler/ir_opt/passes.h | 1 |
| -rw-r--r-- | src/video_core/renderer_opengl/gl_shader_cache.cpp | 1 |
| -rw-r--r-- | src/video_core/renderer_vulkan/vk_pipeline_cache.cpp | 1 |
| -rw-r--r-- | src/video_core/vulkan_common/vulkan_device.h | 5 |

8 files changed, 198 insertions, 0 deletions
diff --git a/src/shader_recompiler/CMakeLists.txt b/src/shader_recompiler/CMakeLists.txt
index 525b2363c..03f69c191 100644
--- a/src/shader_recompiler/CMakeLists.txt
+++ b/src/shader_recompiler/CMakeLists.txt
@@ -223,6 +223,7 @@ add_library(shader_recompiler STATIC
     ir_opt/identity_removal_pass.cpp
     ir_opt/layer_pass.cpp
     ir_opt/lower_fp16_to_fp32.cpp
+    ir_opt/lower_fp64_to_fp32.cpp
     ir_opt/lower_int64_to_int32.cpp
     ir_opt/passes.h
     ir_opt/position_pass.cpp
diff --git a/src/shader_recompiler/frontend/maxwell/translate_program.cpp b/src/shader_recompiler/frontend/maxwell/translate_program.cpp
index 17a6d4888..00d00e9f5 100644
--- a/src/shader_recompiler/frontend/maxwell/translate_program.cpp
+++ b/src/shader_recompiler/frontend/maxwell/translate_program.cpp
@@ -280,6 +280,9 @@ IR::Program TranslateProgram(ObjectPool<IR::Inst>& inst_pool, ObjectPool<IR::Blo
     RemoveUnreachableBlocks(program);
 
     // Replace instructions before the SSA rewrite
+    if (!host_info.support_float64) {
+        Optimization::LowerFp64ToFp32(program);
+    }
     if (!host_info.support_float16) {
         Optimization::LowerFp16ToFp32(program);
     }
diff --git a/src/shader_recompiler/host_translate_info.h b/src/shader_recompiler/host_translate_info.h
index 2aaa6c5ea..4c6322904 100644
--- a/src/shader_recompiler/host_translate_info.h
+++ b/src/shader_recompiler/host_translate_info.h
@@ -10,6 +10,7 @@ namespace Shader {
 
 /// Misc information about the host
 struct HostTranslateInfo {
+    bool support_float64{};      ///< True when the device supports 64-bit floats
     bool support_float16{};      ///< True when the device supports 16-bit floats
     bool support_int64{};        ///< True when the device supports 64-bit integers
     bool needs_demote_reorder{}; ///< True when the device needs DemoteToHelperInvocation reordered
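The new pass below never needs a native 64-bit float on the host: f64 values reach the IR as pairs of 32-bit words through PackDouble2x32/UnpackDouble2x32, and the pass rewrites those packs into pure integer bit-field operations. As a host-side illustration of the bit math PackedF64ToF32 emits (split off the sign, re-bias the 11-bit exponent from 1023 to 127, truncate the 52-bit mantissa to 23 bits), here is a minimal C++ sketch; the free function and the test in main() are hypothetical, not part of this commit:

```cpp
#include <bit>
#include <cstdint>
#include <cstdio>

// Hypothetical reference for the IR emitted by PackedF64ToF32: takes the two
// 32-bit halves of an IEEE-754 double and produces the equivalent float.
float PackedF64ToF32(uint32_t lo, uint32_t hi) {
    const uint32_t sign = hi >> 31;            // 1 sign bit
    const uint32_t exp = (hi >> 20) & 0x7ff;   // 11-bit exponent, bias 1023
    const uint32_t mantissa_hi = hi & 0xfffff; // upper 20 mantissa bits
    const uint32_t mantissa_lo = lo >> 29;     // keep only the top 3 of the low 32
    const uint32_t mantissa = (mantissa_hi << 3) | mantissa_lo; // truncated 23-bit mantissa
    uint32_t exp32 = exp == 0 ? 0 : exp + 127 - 1023; // re-bias; zero/subnormal stays zero
    if (exp == 0x7ff) {                               // inf/NaN keeps an all-ones exponent
        exp32 = 0xff;
    }
    // Like the emitted IR, this truncates instead of rounding and does not
    // clamp finite values that fall outside the f32 exponent range.
    return std::bit_cast<float>((sign << 31) | (exp32 << 23) | mantissa);
}

int main() {
    const uint64_t bits = std::bit_cast<uint64_t>(1.5);
    std::printf("%f\n", PackedF64ToF32(static_cast<uint32_t>(bits),
                                       static_cast<uint32_t>(bits >> 32))); // 1.500000
}
```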
diff --git a/src/shader_recompiler/ir_opt/lower_fp64_to_fp32.cpp b/src/shader_recompiler/ir_opt/lower_fp64_to_fp32.cpp
new file mode 100644
index 000000000..5db7a38ad
--- /dev/null
+++ b/src/shader_recompiler/ir_opt/lower_fp64_to_fp32.cpp
@@ -0,0 +1,185 @@
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "shader_recompiler/frontend/ir/ir_emitter.h"
+#include "shader_recompiler/frontend/ir/opcodes.h"
+#include "shader_recompiler/frontend/ir/value.h"
+#include "shader_recompiler/ir_opt/passes.h"
+
+namespace Shader::Optimization {
+namespace {
+
+constexpr s32 F64ToF32Exp = +127 - 1023;
+constexpr s32 F32ToF64Exp = +1023 - 127;
+
+IR::F32 PackedF64ToF32(IR::IREmitter& ir, const IR::Value& packed) {
+    const IR::U32 lo{ir.CompositeExtract(packed, 0)};
+    const IR::U32 hi{ir.CompositeExtract(packed, 1)};
+    const IR::U32 sign{ir.BitFieldExtract(hi, ir.Imm32(31), ir.Imm32(1))};
+    const IR::U32 exp{ir.BitFieldExtract(hi, ir.Imm32(20), ir.Imm32(11))};
+    const IR::U32 mantissa_hi{ir.BitFieldExtract(hi, ir.Imm32(0), ir.Imm32(20))};
+    const IR::U32 mantissa_lo{ir.BitFieldExtract(lo, ir.Imm32(29), ir.Imm32(3))};
+    const IR::U32 mantissa{
+        ir.BitwiseOr(ir.ShiftLeftLogical(mantissa_hi, ir.Imm32(3)), mantissa_lo)};
+    const IR::U32 exp_if_subnorm{
+        ir.Select(ir.IEqual(exp, ir.Imm32(0)), ir.Imm32(0), ir.IAdd(exp, ir.Imm32(F64ToF32Exp)))};
+    const IR::U32 exp_if_infnan{
+        ir.Select(ir.IEqual(exp, ir.Imm32(0x7ff)), ir.Imm32(0xff), exp_if_subnorm)};
+    const IR::U32 result{
+        ir.BitwiseOr(ir.ShiftLeftLogical(sign, ir.Imm32(31)),
+                     ir.BitwiseOr(ir.ShiftLeftLogical(exp_if_infnan, ir.Imm32(23)), mantissa))};
+    return ir.BitCast<IR::F32>(result);
+}
+
+IR::Value F32ToPackedF64(IR::IREmitter& ir, const IR::Value& raw) {
+    const IR::U32 value{ir.BitCast<IR::U32>(IR::F32(raw))};
+    const IR::U32 sign{ir.BitFieldExtract(value, ir.Imm32(31), ir.Imm32(1))};
+    const IR::U32 exp{ir.BitFieldExtract(value, ir.Imm32(23), ir.Imm32(8))};
+    const IR::U32 mantissa{ir.BitFieldExtract(value, ir.Imm32(0), ir.Imm32(23))};
+    const IR::U32 mantissa_hi{ir.BitFieldExtract(mantissa, ir.Imm32(3), ir.Imm32(20))};
+    const IR::U32 mantissa_lo{ir.BitFieldExtract(mantissa, ir.Imm32(0), ir.Imm32(3))};
+    const IR::U32 exp_if_subnorm{
+        ir.Select(ir.IEqual(exp, ir.Imm32(0)), ir.Imm32(0), ir.IAdd(exp, ir.Imm32(F32ToF64Exp)))};
+    const IR::U32 exp_if_infnan{
+        ir.Select(ir.IEqual(exp, ir.Imm32(0xff)), ir.Imm32(0x7ff), exp_if_subnorm)};
+    const IR::U32 lo{ir.ShiftLeftLogical(mantissa_lo, ir.Imm32(29))};
+    const IR::U32 hi{
+        ir.BitwiseOr(ir.ShiftLeftLogical(sign, ir.Imm32(31)),
+                     ir.BitwiseOr(ir.ShiftLeftLogical(exp_if_infnan, ir.Imm32(20)), mantissa_hi))};
+    return ir.CompositeConstruct(lo, hi);
+}
+
+IR::Opcode Replace(IR::Opcode op) {
+    switch (op) {
+    case IR::Opcode::FPAbs64:
+        return IR::Opcode::FPAbs32;
+    case IR::Opcode::FPAdd64:
+        return IR::Opcode::FPAdd32;
+    case IR::Opcode::FPCeil64:
+        return IR::Opcode::FPCeil32;
+    case IR::Opcode::FPFloor64:
+        return IR::Opcode::FPFloor32;
+    case IR::Opcode::FPFma64:
+        return IR::Opcode::FPFma32;
+    case IR::Opcode::FPMul64:
+        return IR::Opcode::FPMul32;
+    case IR::Opcode::FPNeg64:
+        return IR::Opcode::FPNeg32;
+    case IR::Opcode::FPRoundEven64:
+        return IR::Opcode::FPRoundEven32;
+    case IR::Opcode::FPSaturate64:
+        return IR::Opcode::FPSaturate32;
+    case IR::Opcode::FPClamp64:
+        return IR::Opcode::FPClamp32;
+    case IR::Opcode::FPTrunc64:
+        return IR::Opcode::FPTrunc32;
+    case IR::Opcode::CompositeConstructF64x2:
+        return IR::Opcode::CompositeConstructF32x2;
+    case IR::Opcode::CompositeConstructF64x3:
+        return IR::Opcode::CompositeConstructF32x3;
+    case IR::Opcode::CompositeConstructF64x4:
+        return IR::Opcode::CompositeConstructF32x4;
+    case IR::Opcode::CompositeExtractF64x2:
+        return IR::Opcode::CompositeExtractF32x2;
+    case IR::Opcode::CompositeExtractF64x3:
+        return IR::Opcode::CompositeExtractF32x3;
+    case IR::Opcode::CompositeExtractF64x4:
+        return IR::Opcode::CompositeExtractF32x4;
+    case IR::Opcode::CompositeInsertF64x2:
+        return IR::Opcode::CompositeInsertF32x2;
+    case IR::Opcode::CompositeInsertF64x3:
+        return IR::Opcode::CompositeInsertF32x3;
+    case IR::Opcode::CompositeInsertF64x4:
+        return IR::Opcode::CompositeInsertF32x4;
+    case IR::Opcode::FPOrdEqual64:
+        return IR::Opcode::FPOrdEqual32;
+    case IR::Opcode::FPUnordEqual64:
+        return IR::Opcode::FPUnordEqual32;
+    case IR::Opcode::FPOrdNotEqual64:
+        return IR::Opcode::FPOrdNotEqual32;
+    case IR::Opcode::FPUnordNotEqual64:
+        return IR::Opcode::FPUnordNotEqual32;
+    case IR::Opcode::FPOrdLessThan64:
+        return IR::Opcode::FPOrdLessThan32;
+    case IR::Opcode::FPUnordLessThan64:
+        return IR::Opcode::FPUnordLessThan32;
+    case IR::Opcode::FPOrdGreaterThan64:
+        return IR::Opcode::FPOrdGreaterThan32;
+    case IR::Opcode::FPUnordGreaterThan64:
+        return IR::Opcode::FPUnordGreaterThan32;
+    case IR::Opcode::FPOrdLessThanEqual64:
+        return IR::Opcode::FPOrdLessThanEqual32;
+    case IR::Opcode::FPUnordLessThanEqual64:
+        return IR::Opcode::FPUnordLessThanEqual32;
+    case IR::Opcode::FPOrdGreaterThanEqual64:
+        return IR::Opcode::FPOrdGreaterThanEqual32;
+    case IR::Opcode::FPUnordGreaterThanEqual64:
+        return IR::Opcode::FPUnordGreaterThanEqual32;
+    case IR::Opcode::FPIsNan64:
+        return IR::Opcode::FPIsNan32;
+    case IR::Opcode::ConvertS16F64:
+        return IR::Opcode::ConvertS16F32;
+    case IR::Opcode::ConvertS32F64:
+        return IR::Opcode::ConvertS32F32;
+    case IR::Opcode::ConvertS64F64:
+        return IR::Opcode::ConvertS64F32;
+    case IR::Opcode::ConvertU16F64:
+        return IR::Opcode::ConvertU16F32;
+    case IR::Opcode::ConvertU32F64:
+        return IR::Opcode::ConvertU32F32;
+    case IR::Opcode::ConvertU64F64:
+        return IR::Opcode::ConvertU64F32;
+    case IR::Opcode::ConvertF32F64:
+        return IR::Opcode::Identity;
+    case IR::Opcode::ConvertF64F32:
+        return IR::Opcode::Identity;
+    case IR::Opcode::ConvertF64S8:
+        return IR::Opcode::ConvertF32S8;
+    case IR::Opcode::ConvertF64S16:
+        return IR::Opcode::ConvertF32S16;
+    case IR::Opcode::ConvertF64S32:
+        return IR::Opcode::ConvertF32S32;
+    case IR::Opcode::ConvertF64S64:
+        return IR::Opcode::ConvertF32S64;
+    case IR::Opcode::ConvertF64U8:
+        return IR::Opcode::ConvertF32U8;
+    case IR::Opcode::ConvertF64U16:
+        return IR::Opcode::ConvertF32U16;
+    case IR::Opcode::ConvertF64U32:
+        return IR::Opcode::ConvertF32U32;
+    case IR::Opcode::ConvertF64U64:
+        return IR::Opcode::ConvertF32U64;
+    default:
+        return op;
+    }
+}
+
+void Lower(IR::Block& block, IR::Inst& inst) {
+    switch (inst.GetOpcode()) {
+    case IR::Opcode::PackDouble2x32: {
+        IR::IREmitter ir(block, IR::Block::InstructionList::s_iterator_to(inst));
+        inst.ReplaceUsesWith(PackedF64ToF32(ir, inst.Arg(0)));
+        break;
+    }
+    case IR::Opcode::UnpackDouble2x32: {
+        IR::IREmitter ir(block, IR::Block::InstructionList::s_iterator_to(inst));
+        inst.ReplaceUsesWith(F32ToPackedF64(ir, inst.Arg(0)));
+        break;
+    }
+    default:
+        inst.ReplaceOpcode(Replace(inst.GetOpcode()));
+        break;
+    }
+}
+
+} // Anonymous namespace
+
+void LowerFp64ToFp32(IR::Program& program) {
+    for (IR::Block* const block : program.blocks) {
+        for (IR::Inst& inst : block->Instructions()) {
+            Lower(*block, inst);
+        }
+    }
+}
+
+} // namespace Shader::Optimization
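Going the other way is exact for all normal values, since a normal f32 always fits inside f64's wider exponent and mantissa; after lowering, both sides of ConvertF32F64 and ConvertF64F32 are f32 anyway, which is why those cases collapse to Identity. A hypothetical host-side mirror of F32ToPackedF64, again only a sketch and not part of the commit:

```cpp
#include <bit>
#include <cassert>
#include <cstdint>

// Stand-in for the u32x2 composite the IR version constructs.
struct PackedF64 {
    uint32_t lo;
    uint32_t hi;
};

// Hypothetical reference for F32ToPackedF64: widen a float's sign, exponent,
// and mantissa fields into the double bit layout. Exact for normal values.
PackedF64 F32ToPackedF64(float value) {
    const uint32_t bits = std::bit_cast<uint32_t>(value);
    const uint32_t sign = bits >> 31;
    const uint32_t exp = (bits >> 23) & 0xff;
    const uint32_t mantissa = bits & 0x7fffff;
    uint32_t exp64 = exp == 0 ? 0 : exp + 1023 - 127; // re-bias toward f64
    if (exp == 0xff) {                                // inf/NaN
        exp64 = 0x7ff;
    }
    return {
        .lo = (mantissa & 0x7) << 29,                         // low 3 mantissa bits
        .hi = (sign << 31) | (exp64 << 20) | (mantissa >> 3), // sign, exponent, top 20 bits
    };
}

int main() {
    const PackedF64 packed = F32ToPackedF64(0.375f);
    const uint64_t bits = (static_cast<uint64_t>(packed.hi) << 32) | packed.lo;
    assert(std::bit_cast<double>(bits) == 0.375); // round-trips exactly
}
```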
diff --git a/src/shader_recompiler/ir_opt/passes.h b/src/shader_recompiler/ir_opt/passes.h
index 1f8f2ba95..53606b78d 100644
--- a/src/shader_recompiler/ir_opt/passes.h
+++ b/src/shader_recompiler/ir_opt/passes.h
@@ -17,6 +17,7 @@ void ConstantPropagationPass(Environment& env, IR::Program& program);
 void DeadCodeEliminationPass(IR::Program& program);
 void GlobalMemoryToStorageBufferPass(IR::Program& program);
 void IdentityRemovalPass(IR::Program& program);
+void LowerFp64ToFp32(IR::Program& program);
 void LowerFp16ToFp32(IR::Program& program);
 void LowerInt64ToInt32(IR::Program& program);
 void RescalingPass(IR::Program& program);
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 6ecda2984..dd8caa556 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -232,6 +232,7 @@ ShaderCache::ShaderCache(RasterizerOpenGL& rasterizer_, Core::Frontend::EmuWindo
           .gl_max_compute_smem_size = device.GetMaxComputeSharedMemorySize(),
       },
       host_info{
+          .support_float64 = true,
           .support_float16 = false,
           .support_int64 = device.HasShaderInt64(),
           .needs_demote_reorder = device.IsAmd(),
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index 9482e91b0..5734f51e5 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -350,6 +350,7 @@ PipelineCache::PipelineCache(RasterizerVulkan& rasterizer_, const Device& device
         .has_broken_spirv_subgroup_mask_vector_extract_dynamic =
             driver_id == VK_DRIVER_ID_QUALCOMM_PROPRIETARY};
     host_info = Shader::HostTranslateInfo{
+        .support_float64 = device.IsFloat64Supported(),
         .support_float16 = device.IsFloat16Supported(),
         .support_int64 = device.IsShaderInt64Supported(),
         .needs_demote_reorder =
diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h
index d62a103a1..0c53e35a6 100644
--- a/src/video_core/vulkan_common/vulkan_device.h
+++ b/src/video_core/vulkan_common/vulkan_device.h
@@ -300,6 +300,11 @@ public:
         return GetDriverID() != VK_DRIVER_ID_QUALCOMM_PROPRIETARY;
     }
 
+    /// Returns true if the device supports float64 natively.
+    bool IsFloat64Supported() const {
+        return features.features.shaderFloat64;
+    }
+
     /// Returns true if the device supports float16 natively.
     bool IsFloat16Supported() const {
         return features.shader_float16_int8.shaderFloat16;
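shaderFloat64 is an optional core feature bit in VkPhysicalDeviceFeatures, so a device may legitimately report it as false; that is the case this pull request handles. A minimal sketch of the underlying query, assuming physical_device is a valid VkPhysicalDevice (illustrative only, not yuzu's device-enumeration code):

```cpp
#include <vulkan/vulkan.h>

// Query the core feature set and read the optional shaderFloat64 bit,
// the same bit IsFloat64Supported() returns from the cached features.
bool HasShaderFloat64(VkPhysicalDevice physical_device) {
    VkPhysicalDeviceFeatures features{};
    vkGetPhysicalDeviceFeatures(physical_device, &features);
    return features.shaderFloat64 == VK_TRUE;
}
```

Since the OpenGL backend hardcodes support_float64 = true, only Vulkan devices that report shaderFloat64 as false ever run the new LowerFp64ToFp32 pass.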
