Diffstat (limited to 'src')
23 files changed, 199 insertions(+), 91 deletions(-)
diff --git a/src/common/x64/native_clock.cpp b/src/common/x64/native_clock.cpp
index 7a3f21dcf..7fd9d22f8 100644
--- a/src/common/x64/native_clock.cpp
+++ b/src/common/x64/native_clock.cpp
@@ -10,25 +10,49 @@
 #include "common/uint128.h"
 #include "common/x64/native_clock.h"
 
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
 namespace Common {
 
+#ifdef _MSC_VER
+__forceinline static u64 FencedRDTSC() {
+    _mm_lfence();
+    _ReadWriteBarrier();
+    const u64 result = __rdtsc();
+    _mm_lfence();
+    _ReadWriteBarrier();
+    return result;
+}
+#else
+static u64 FencedRDTSC() {
+    u64 result;
+    asm volatile("lfence\n\t"
+                 "rdtsc\n\t"
+                 "shl $32, %%rdx\n\t"
+                 "or %%rdx, %0\n\t"
+                 "lfence"
+                 : "=a"(result)
+                 :
+                 : "rdx", "memory", "cc");
+    return result;
+}
+#endif
+
 u64 EstimateRDTSCFrequency() {
     // Discard the first result measuring the rdtsc.
-    _mm_mfence();
-    __rdtsc();
+    FencedRDTSC();
     std::this_thread::sleep_for(std::chrono::milliseconds{1});
-    _mm_mfence();
-    __rdtsc();
+    FencedRDTSC();
 
     // Get the current time.
     const auto start_time = std::chrono::steady_clock::now();
-    _mm_mfence();
-    const u64 tsc_start = __rdtsc();
+    const u64 tsc_start = FencedRDTSC();
    // Wait for 200 milliseconds.
    std::this_thread::sleep_for(std::chrono::milliseconds{200});
    const auto end_time = std::chrono::steady_clock::now();
-    _mm_mfence();
-    const u64 tsc_end = __rdtsc();
+    const u64 tsc_end = FencedRDTSC();
    // Calculate differences.
    const u64 timer_diff = static_cast<u64>(
        std::chrono::duration_cast<std::chrono::nanoseconds>(end_time - start_time).count());
@@ -42,8 +66,7 @@ NativeClock::NativeClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequen
                          u64 rtsc_frequency_)
     : WallClock(emulated_cpu_frequency_, emulated_clock_frequency_, true), rtsc_frequency{
                                                                                rtsc_frequency_} {
-    _mm_mfence();
-    time_point.inner.last_measure = __rdtsc();
+    time_point.inner.last_measure = FencedRDTSC();
     time_point.inner.accumulated_ticks = 0U;
     ns_rtsc_factor = GetFixedPoint64Factor(NS_RATIO, rtsc_frequency);
     us_rtsc_factor = GetFixedPoint64Factor(US_RATIO, rtsc_frequency);
@@ -58,8 +81,7 @@ u64 NativeClock::GetRTSC() {
 
     current_time_point.pack = Common::AtomicLoad128(time_point.pack.data());
     do {
-        _mm_mfence();
-        const u64 current_measure = __rdtsc();
+        const u64 current_measure = FencedRDTSC();
         u64 diff = current_measure - current_time_point.inner.last_measure;
         diff = diff & ~static_cast<u64>(static_cast<s64>(diff) >> 63); // max(diff, 0)
         new_time_point.inner.last_measure = current_measure > current_time_point.inner.last_measure
@@ -80,8 +102,7 @@ void NativeClock::Pause(bool is_paused) {
         current_time_point.pack = Common::AtomicLoad128(time_point.pack.data());
         do {
             new_time_point.pack = current_time_point.pack;
-            _mm_mfence();
-            new_time_point.inner.last_measure = __rdtsc();
+            new_time_point.inner.last_measure = FencedRDTSC();
         } while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack,
                                                current_time_point.pack, current_time_point.pack));
     }
diff --git a/src/core/arm/arm_interface.h b/src/core/arm/arm_interface.h
index c60322442..dce2f4195 100644
--- a/src/core/arm/arm_interface.h
+++ b/src/core/arm/arm_interface.h
@@ -171,6 +171,9 @@ public:
     /// Prepare core for thread reschedule (if needed to correctly handle state)
     virtual void PrepareReschedule() = 0;
 
+    /// Signal an interrupt and ask the core to halt as soon as possible.
+    virtual void SignalInterrupt() = 0;
+
     struct BacktraceEntry {
         std::string module;
         u64 address;
diff --git a/src/core/arm/dynarmic/arm_dynarmic_32.cpp b/src/core/arm/dynarmic/arm_dynarmic_32.cpp
index 054572445..ab3210d84 100644
--- a/src/core/arm/dynarmic/arm_dynarmic_32.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_32.cpp
@@ -25,6 +25,9 @@ namespace Core {
 
 using namespace Common::Literals;
 
+constexpr Dynarmic::HaltReason break_loop = Dynarmic::HaltReason::UserDefined2;
+constexpr Dynarmic::HaltReason svc_call = Dynarmic::HaltReason::UserDefined3;
+
 class DynarmicCallbacks32 : public Dynarmic::A32::UserCallbacks {
 public:
     explicit DynarmicCallbacks32(ARM_Dynarmic_32& parent_)
@@ -84,15 +87,13 @@ public:
     }
 
     void CallSVC(u32 swi) override {
-        parent.svc_called = true;
         parent.svc_swi = swi;
-        parent.jit->HaltExecution();
+        parent.jit->HaltExecution(svc_call);
     }
 
     void AddTicks(u64 ticks) override {
-        if (parent.uses_wall_clock) {
-            return;
-        }
+        ASSERT_MSG(!parent.uses_wall_clock, "This should never happen - dynarmic ticking disabled");
+
         // Divide the number of ticks by the amount of CPU cores. TODO(Subv): This yields only a
         // rough approximation of the amount of executed ticks in the system, it may be thrown off
         // if not all cores are doing a similar amount of work. Instead of doing this, we should
@@ -108,12 +109,8 @@ public:
     }
 
     u64 GetTicksRemaining() override {
-        if (parent.uses_wall_clock) {
-            if (!parent.interrupt_handlers[parent.core_index].IsInterrupted()) {
-                return minimum_run_cycles;
-            }
-            return 0U;
-        }
+        ASSERT_MSG(!parent.uses_wall_clock, "This should never happen - dynarmic ticking disabled");
+
         return std::max<s64>(parent.system.CoreTiming().GetDowncount(), 0);
     }
 
@@ -148,6 +145,7 @@ std::shared_ptr<Dynarmic::A32::Jit> ARM_Dynarmic_32::MakeJit(Common::PageTable*
 
     // Timing
     config.wall_clock_cntpct = uses_wall_clock;
+    config.enable_cycle_counting = !uses_wall_clock;
 
     // Code cache size
     config.code_cache_size = 512_MiB;
@@ -230,13 +228,11 @@ std::shared_ptr<Dynarmic::A32::Jit> ARM_Dynarmic_32::MakeJit(Common::PageTable*
 
 void ARM_Dynarmic_32::Run() {
     while (true) {
-        jit->Run();
-        if (!svc_called) {
-            break;
+        const auto hr = jit->Run();
+        if (Has(hr, svc_call)) {
+            Kernel::Svc::Call(system, svc_swi);
         }
-        svc_called = false;
-        Kernel::Svc::Call(system, svc_swi);
-        if (shutdown) {
+        if (Has(hr, break_loop)) {
             break;
         }
     }
@@ -322,8 +318,11 @@ void ARM_Dynarmic_32::LoadContext(const ThreadContext32& ctx) {
 }
 
 void ARM_Dynarmic_32::PrepareReschedule() {
-    jit->HaltExecution();
-    shutdown = true;
+    jit->HaltExecution(break_loop);
+}
+
+void ARM_Dynarmic_32::SignalInterrupt() {
+    jit->HaltExecution(break_loop);
 }
 
 void ARM_Dynarmic_32::ClearInstructionCache() {
diff --git a/src/core/arm/dynarmic/arm_dynarmic_32.h b/src/core/arm/dynarmic/arm_dynarmic_32.h
index 5d47b600d..3f68a4ff1 100644
--- a/src/core/arm/dynarmic/arm_dynarmic_32.h
+++ b/src/core/arm/dynarmic/arm_dynarmic_32.h
@@ -57,6 +57,7 @@ public:
     void LoadContext(const ThreadContext64& ctx) override {}
 
     void PrepareReschedule() override;
+    void SignalInterrupt() override;
     void ClearExclusiveState() override;
 
     void ClearInstructionCache() override;
@@ -83,9 +84,6 @@ private:
 
     // SVC callback
     u32 svc_swi{};
-    bool svc_called{};
-
-    bool shutdown{};
 };
 
 } // namespace Core
diff --git a/src/core/arm/dynarmic/arm_dynarmic_64.cpp b/src/core/arm/dynarmic/arm_dynarmic_64.cpp
index 7ff8f9495..68822a1fc 100644
--- a/src/core/arm/dynarmic/arm_dynarmic_64.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_64.cpp
@@ -26,6 +26,9 @@ namespace Core {
 using Vector = Dynarmic::A64::Vector;
 using namespace Common::Literals;
 
+constexpr Dynarmic::HaltReason break_loop = Dynarmic::HaltReason::UserDefined2;
+constexpr Dynarmic::HaltReason svc_call = Dynarmic::HaltReason::UserDefined3;
+
 class DynarmicCallbacks64 : public Dynarmic::A64::UserCallbacks {
 public:
     explicit DynarmicCallbacks64(ARM_Dynarmic_64& parent_)
@@ -106,7 +109,7 @@ public:
             break;
         }
 
-        parent.jit->HaltExecution();
+        parent.jit->HaltExecution(Dynarmic::HaltReason::CacheInvalidation);
     }
 
     void ExceptionRaised(u64 pc, Dynarmic::A64::Exception exception) override {
@@ -126,15 +129,12 @@ public:
     }
 
     void CallSVC(u32 swi) override {
-        parent.svc_called = true;
         parent.svc_swi = swi;
-        parent.jit->HaltExecution();
+        parent.jit->HaltExecution(svc_call);
     }
 
     void AddTicks(u64 ticks) override {
-        if (parent.uses_wall_clock) {
-            return;
-        }
+        ASSERT_MSG(!parent.uses_wall_clock, "This should never happen - dynarmic ticking disabled");
 
         // Divide the number of ticks by the amount of CPU cores. TODO(Subv): This yields only a
         // rough approximation of the amount of executed ticks in the system, it may be thrown off
@@ -149,12 +149,8 @@ public:
     }
 
     u64 GetTicksRemaining() override {
-        if (parent.uses_wall_clock) {
-            if (!parent.interrupt_handlers[parent.core_index].IsInterrupted()) {
-                return minimum_run_cycles;
-            }
-            return 0U;
-        }
+        ASSERT_MSG(!parent.uses_wall_clock, "This should never happen - dynarmic ticking disabled");
+
         return std::max<s64>(parent.system.CoreTiming().GetDowncount(), 0);
     }
 
@@ -210,6 +206,7 @@ std::shared_ptr<Dynarmic::A64::Jit> ARM_Dynarmic_64::MakeJit(Common::PageTable*
 
     // Timing
     config.wall_clock_cntpct = uses_wall_clock;
+    config.enable_cycle_counting = !uses_wall_clock;
 
     // Code cache size
     config.code_cache_size = 512_MiB;
@@ -292,13 +289,11 @@ std::shared_ptr<Dynarmic::A64::Jit> ARM_Dynarmic_64::MakeJit(Common::PageTable*
 
 void ARM_Dynarmic_64::Run() {
     while (true) {
-        jit->Run();
-        if (!svc_called) {
-            break;
+        const auto hr = jit->Run();
+        if (Has(hr, svc_call)) {
+            Kernel::Svc::Call(system, svc_swi);
         }
-        svc_called = false;
-        Kernel::Svc::Call(system, svc_swi);
-        if (shutdown) {
+        if (Has(hr, break_loop)) {
             break;
         }
     }
@@ -389,8 +384,11 @@ void ARM_Dynarmic_64::LoadContext(const ThreadContext64& ctx) {
 }
 
 void ARM_Dynarmic_64::PrepareReschedule() {
-    jit->HaltExecution();
-    shutdown = true;
+    jit->HaltExecution(break_loop);
+}
+
+void ARM_Dynarmic_64::SignalInterrupt() {
+    jit->HaltExecution(break_loop);
 }
 
 void ARM_Dynarmic_64::ClearInstructionCache() {
diff --git a/src/core/arm/dynarmic/arm_dynarmic_64.h b/src/core/arm/dynarmic/arm_dynarmic_64.h
index 0c4e46c64..58bc7fbec 100644
--- a/src/core/arm/dynarmic/arm_dynarmic_64.h
+++ b/src/core/arm/dynarmic/arm_dynarmic_64.h
@@ -51,6 +51,7 @@ public:
     void LoadContext(const ThreadContext64& ctx) override;
 
     void PrepareReschedule() override;
+    void SignalInterrupt() override;
     void ClearExclusiveState() override;
 
     void ClearInstructionCache() override;
@@ -77,9 +78,6 @@ private:
 
     // SVC callback
     u32 svc_swi{};
-    bool svc_called{};
-
-    bool shutdown{};
 };
 
 } // namespace Core
diff --git a/src/core/hle/kernel/physical_core.cpp b/src/core/hle/kernel/physical_core.cpp
index 7477668e4..18a5f40f8 100644
--- a/src/core/hle/kernel/physical_core.cpp
+++ b/src/core/hle/kernel/physical_core.cpp
@@ -58,6 +58,7 @@ bool PhysicalCore::IsInterrupted() const {
 void PhysicalCore::Interrupt() {
     guard->lock();
     interrupts[core_index].SetInterrupt(true);
+    arm_interface->SignalInterrupt();
     guard->unlock();
 }
 
diff --git a/src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp b/src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp
index 0c1fbc7b1..282668b36 100644
--- a/src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp
+++ b/src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp
@@ -35,6 +35,15 @@ std::string_view OutputVertexIndex(EmitContext& ctx) {
     return ctx.stage == Stage::TessellationControl ? "[gl_InvocationID]" : "";
 }
 
+std::string ChooseCbuf(EmitContext& ctx, const IR::Value& binding, std::string_view index) {
+    if (binding.IsImmediate()) {
+        return fmt::format("{}_cbuf{}[{}]", ctx.stage_name, binding.U32(), index);
+    } else {
+        const auto binding_var{ctx.var_alloc.Consume(binding)};
+        return fmt::format("GetCbufIndirect({},{})", binding_var, index);
+    }
+}
+
 void GetCbuf(EmitContext& ctx, std::string_view ret, const IR::Value& binding,
              const IR::Value& offset, u32 num_bits, std::string_view cast = {},
              std::string_view bit_offset = {}) {
@@ -55,8 +64,8 @@ void GetCbuf(EmitContext& ctx, std::string_view ret, const IR::Value& binding,
     const auto swizzle{is_immediate ? fmt::format(".{}", OffsetSwizzle(offset.U32()))
                                     : fmt::format("[({}>>2)%4]", offset_var)};
 
-    const auto cbuf{fmt::format("{}_cbuf{}", ctx.stage_name, binding.U32())};
-    const auto cbuf_cast{fmt::format("{}({}[{}]{{}})", cast, cbuf, index)};
+    const auto cbuf{ChooseCbuf(ctx, binding, index)};
+    const auto cbuf_cast{fmt::format("{}({}{{}})", cast, cbuf)};
     const auto extraction{num_bits == 32 ? cbuf_cast
                                          : fmt::format("bitfieldExtract({},int({}),{})", cbuf_cast,
                                                        bit_offset, num_bits)};
@@ -140,9 +149,9 @@ void EmitGetCbufF32(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
 
 void EmitGetCbufU32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
                       const IR::Value& offset) {
-    const auto cbuf{fmt::format("{}_cbuf{}", ctx.stage_name, binding.U32())};
     const auto cast{ctx.profile.has_gl_cbuf_ftou_bug ? "" : "ftou"};
     if (offset.IsImmediate()) {
+        const auto cbuf{fmt::format("{}_cbuf{}", ctx.stage_name, binding.U32())};
         static constexpr u32 cbuf_size{0x10000};
         const u32 u32_offset{offset.U32()};
         const s32 signed_offset{static_cast<s32>(offset.U32())};
@@ -162,17 +171,17 @@ void EmitGetCbufU32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding
         return;
     }
     const auto offset_var{ctx.var_alloc.Consume(offset)};
+    const auto cbuf{ChooseCbuf(ctx, binding, fmt::format("{}>>4", offset_var))};
     if (!ctx.profile.has_gl_component_indexing_bug) {
-        ctx.AddU32x2("{}=uvec2({}({}[{}>>4][({}>>2)%4]),{}({}[({}+4)>>4][(({}+4)>>2)%4]));", inst,
-                     cast, cbuf, offset_var, offset_var, cast, cbuf, offset_var, offset_var);
+        ctx.AddU32x2("{}=uvec2({}({}[({}>>2)%4]),{}({}[(({}+4)>>2)%4]));", inst, cast, cbuf,
+                     offset_var, cast, cbuf, offset_var);
         return;
     }
     const auto ret{ctx.var_alloc.Define(inst, GlslVarType::U32x2)};
     const auto cbuf_offset{fmt::format("{}>>2", offset_var)};
     for (u32 swizzle = 0; swizzle < 4; ++swizzle) {
-        ctx.Add("if(({}&3)=={}){}=uvec2({}({}[{}>>4].{}),{}({}[({}+4)>>4].{}));", cbuf_offset,
-                swizzle, ret, cast, cbuf, offset_var, "xyzw"[swizzle], cast, cbuf, offset_var,
-                "xyzw"[(swizzle + 1) % 4]);
+        ctx.Add("if(({}&3)=={}){}=uvec2({}({}.{}),{}({}.{}));", cbuf_offset, swizzle, ret, cast,
+                cbuf, "xyzw"[swizzle], cast, cbuf, "xyzw"[(swizzle + 1) % 4]);
     }
 }
 
diff --git a/src/shader_recompiler/backend/glsl/glsl_emit_context.cpp b/src/shader_recompiler/backend/glsl/glsl_emit_context.cpp
index e816a93ec..17266f40d 100644
--- a/src/shader_recompiler/backend/glsl/glsl_emit_context.cpp
+++ b/src/shader_recompiler/backend/glsl/glsl_emit_context.cpp
@@ -359,6 +359,7 @@ EmitContext::EmitContext(IR::Program& program, Bindings& bindings, const Profile
         header += "layout(location=0) uniform vec4 scaling;";
     }
     DefineConstantBuffers(bindings);
+    DefineConstantBufferIndirect();
     DefineStorageBuffers(bindings);
     SetupImages(bindings);
     SetupTextures(bindings);
@@ -436,6 +437,24 @@ void EmitContext::DefineConstantBuffers(Bindings& bindings) {
     }
 }
 
+void EmitContext::DefineConstantBufferIndirect() {
+    if (!info.uses_cbuf_indirect) {
+        return;
+    }
+
+    header += profile.has_gl_cbuf_ftou_bug ? "uvec4 " : "vec4 ";
+    header += "GetCbufIndirect(uint binding, uint offset){"
+              "switch(binding){"
+              "default:";
+
+    for (const auto& desc : info.constant_buffer_descriptors) {
+        header +=
+            fmt::format("case {}:return {}_cbuf{}[offset];", desc.index, stage_name, desc.index);
+    }
+
+    header += "}}";
+}
+
 void EmitContext::DefineStorageBuffers(Bindings& bindings) {
     if (info.storage_buffers_descriptors.empty()) {
         return;
diff --git a/src/shader_recompiler/backend/glsl/glsl_emit_context.h b/src/shader_recompiler/backend/glsl/glsl_emit_context.h
index d9b639d29..2b13db6e6 100644
--- a/src/shader_recompiler/backend/glsl/glsl_emit_context.h
+++ b/src/shader_recompiler/backend/glsl/glsl_emit_context.h
@@ -162,6 +162,7 @@ public:
 private:
     void SetupExtensions();
     void DefineConstantBuffers(Bindings& bindings);
+    void DefineConstantBufferIndirect();
     void DefineStorageBuffers(Bindings& bindings);
     void DefineGenericOutput(size_t index, u32 invocations);
     void DefineHelperFunctions();
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
index 28f6a6184..9c83cd2e4 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
@@ -1043,15 +1043,15 @@ void EmitContext::DefineConstantBufferIndirectFunctions(const Info& info) {
         const Id merge_label{OpLabel()};
         const Id uniform_type{uniform_types.*member_ptr};
 
-        std::array<Id, Info::MAX_CBUFS> buf_labels;
-        std::array<Sirit::Literal, Info::MAX_CBUFS> buf_literals;
-        for (u32 i = 0; i < Info::MAX_CBUFS; i++) {
+        std::array<Id, Info::MAX_INDIRECT_CBUFS> buf_labels;
+        std::array<Sirit::Literal, Info::MAX_INDIRECT_CBUFS> buf_literals;
+        for (u32 i = 0; i < Info::MAX_INDIRECT_CBUFS; i++) {
             buf_labels[i] = OpLabel();
             buf_literals[i] = Sirit::Literal{i};
         }
         OpSelectionMerge(merge_label, spv::SelectionControlMask::MaskNone);
         OpSwitch(binding, buf_labels[0], buf_literals, buf_labels);
-        for (u32 i = 0; i < Info::MAX_CBUFS; i++) {
+        for (u32 i = 0; i < Info::MAX_INDIRECT_CBUFS; i++) {
             AddLabel(buf_labels[i]);
             const Id cbuf{cbufs[i].*member_ptr};
             const Id access_chain{OpAccessChain(uniform_type, cbuf, u32_zero_value, offset)};
@@ -1064,22 +1064,23 @@ void EmitContext::DefineConstantBufferIndirectFunctions(const Info& info) {
         return func;
     }};
     IR::Type types{info.used_indirect_cbuf_types};
-    if (True(types & IR::Type::U8)) {
+    bool supports_aliasing = profile.support_descriptor_aliasing;
+    if (supports_aliasing && True(types & IR::Type::U8)) {
         load_const_func_u8 = make_accessor(U8, &UniformDefinitions::U8);
     }
-    if (True(types & IR::Type::U16)) {
+    if (supports_aliasing && True(types & IR::Type::U16)) {
         load_const_func_u16 = make_accessor(U16, &UniformDefinitions::U16);
     }
-    if (True(types & IR::Type::F32)) {
+    if (supports_aliasing && True(types & IR::Type::F32)) {
         load_const_func_f32 = make_accessor(F32[1], &UniformDefinitions::F32);
     }
-    if (True(types & IR::Type::U32)) {
+    if (supports_aliasing && True(types & IR::Type::U32)) {
         load_const_func_u32 = make_accessor(U32[1], &UniformDefinitions::U32);
     }
-    if (True(types & IR::Type::U32x2)) {
+    if (supports_aliasing && True(types & IR::Type::U32x2)) {
         load_const_func_u32x2 = make_accessor(U32[2], &UniformDefinitions::U32x2);
     }
-    if (True(types & IR::Type::U32x4)) {
+    if (!supports_aliasing || True(types & IR::Type::U32x4)) {
         load_const_func_u32x4 = make_accessor(U32[4], &UniformDefinitions::U32x4);
     }
 }
diff --git a/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp b/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp
index 0b2c60842..16278faab 100644
--- a/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp
+++ b/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp
@@ -32,13 +32,8 @@ void AddConstantBufferDescriptor(Info& info, u32 index, u32 count) {
 void AddRegisterIndexedLdc(Info& info) {
     info.uses_cbuf_indirect = true;
 
-    // The shader can use any possible constant buffer
-    info.constant_buffer_mask = (1 << Info::MAX_CBUFS) - 1;
-
-    auto& cbufs{info.constant_buffer_descriptors};
-    cbufs.clear();
-    for (u32 i = 0; i < Info::MAX_CBUFS; i++) {
-        cbufs.push_back(ConstantBufferDescriptor{.index = i, .count = 1});
+    for (u32 i = 0; i < Info::MAX_INDIRECT_CBUFS; i++) {
+        AddConstantBufferDescriptor(info, i, 1);
 
         // The shader can use any possible access size
         info.constant_buffer_used_sizes[i] = 0x10'000;
diff --git a/src/shader_recompiler/shader_info.h b/src/shader_recompiler/shader_info.h
index 9d36bd9eb..a3a09c71c 100644
--- a/src/shader_recompiler/shader_info.h
+++ b/src/shader_recompiler/shader_info.h
@@ -105,6 +105,7 @@ struct ImageDescriptor {
 using ImageDescriptors = boost::container::small_vector<ImageDescriptor, 4>;
 
 struct Info {
+    static constexpr size_t MAX_INDIRECT_CBUFS{14};
     static constexpr size_t MAX_CBUFS{18};
     static constexpr size_t MAX_SSBOS{32};
 
diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt
index af05d47d1..190fc6aea 100644
--- a/src/video_core/host_shaders/CMakeLists.txt
+++ b/src/video_core/host_shaders/CMakeLists.txt
@@ -18,6 +18,7 @@ set(SHADER_FILES
     full_screen_triangle.vert
     fxaa.frag
     fxaa.vert
+    opengl_convert_s8d24.comp
    opengl_copy_bc4.comp
    opengl_present.frag
    opengl_present.vert
diff --git a/src/video_core/host_shaders/opengl_convert_s8d24.comp b/src/video_core/host_shaders/opengl_convert_s8d24.comp
new file mode 100644
index 000000000..83e1ab176
--- /dev/null
+++ b/src/video_core/host_shaders/opengl_convert_s8d24.comp
@@ -0,0 +1,18 @@
+// Copyright 2022 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#version 430 core
+
+layout(local_size_x = 16, local_size_y = 8) in;
+
+layout(binding = 0, rgba8ui) restrict uniform uimage2D destination;
+layout(location = 0) uniform uvec3 size;
+
+void main() {
+    if (any(greaterThanEqual(gl_GlobalInvocationID, size))) {
+        return;
+    }
+    uvec4 components = imageLoad(destination, ivec2(gl_GlobalInvocationID.xy));
+    imageStore(destination, ivec2(gl_GlobalInvocationID.xy), components.wxyz);
+}
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index e6f9ece8b..7ab7f0c0a 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -520,6 +520,8 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
     // ASSERT_MSG(image_view->size.width == config.width, "Framebuffer width is different");
     // ASSERT_MSG(image_view->size.height == config.height, "Framebuffer height is different");
 
+    screen_info.texture.width = image_view->size.width;
+    screen_info.texture.height = image_view->size.height;
     screen_info.display_texture = image_view->Handle(Shader::TextureType::Color2D);
     screen_info.display_srgb = VideoCore::Surface::IsPixelFormatSRGB(image_view->format);
     return true;
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index 8f9a65beb..d12076358 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -409,8 +409,8 @@ ImageBufferMap::~ImageBufferMap() {
 
 TextureCacheRuntime::TextureCacheRuntime(const Device& device_, ProgramManager& program_manager,
                                          StateTracker& state_tracker_)
-    : device{device_}, state_tracker{state_tracker_},
-      util_shaders(program_manager), resolution{Settings::values.resolution_info} {
+    : device{device_}, state_tracker{state_tracker_}, util_shaders(program_manager),
+      format_conversion_pass{util_shaders}, resolution{Settings::values.resolution_info} {
     static constexpr std::array TARGETS{GL_TEXTURE_1D_ARRAY, GL_TEXTURE_2D_ARRAY, GL_TEXTURE_3D};
     for (size_t i = 0; i < TARGETS.size(); ++i) {
         const GLenum target = TARGETS[i];
@@ -1325,6 +1325,9 @@ Framebuffer::Framebuffer(TextureCacheRuntime& runtime, std::span<ImageView*, NUM
 
 Framebuffer::~Framebuffer() = default;
 
+FormatConversionPass::FormatConversionPass(UtilShaders& util_shaders_)
+    : util_shaders{util_shaders_} {}
+
 void FormatConversionPass::ConvertImage(Image& dst_image, Image& src_image,
                                         std::span<const VideoCommon::ImageCopy> copies) {
     const GLenum dst_target = ImageTarget(dst_image.info);
@@ -1357,6 +1360,12 @@ void FormatConversionPass::ConvertImage(Image& dst_image, Image& src_image,
                             dst_origin.z, region.width, region.height, region.depth,
                             dst_image.GlFormat(), dst_image.GlType(), nullptr);
     }
+
+    // Swap component order of S8D24 to ABGR8 reinterprets
+    if (src_image.info.format == PixelFormat::D24_UNORM_S8_UINT &&
+        dst_image.info.format == PixelFormat::A8B8G8R8_UNORM) {
+        util_shaders.ConvertS8D24(dst_image, copies);
+    }
 }
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h
index 53088b66e..672fa8dde 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.h
+++ b/src/video_core/renderer_opengl/gl_texture_cache.h
@@ -55,13 +55,14 @@ struct FormatProperties {
 
 class FormatConversionPass {
 public:
-    FormatConversionPass() = default;
+    explicit FormatConversionPass(UtilShaders& util_shaders);
     ~FormatConversionPass() = default;
 
     void ConvertImage(Image& dst_image, Image& src_image,
                       std::span<const VideoCommon::ImageCopy> copies);
 
 private:
+    UtilShaders& util_shaders;
     OGLBuffer intermediate_pbo;
     size_t pbo_size{};
 };
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index f8f29013a..3a3c213bb 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -208,6 +208,8 @@ void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuf
     // Framebuffer orientation handling
     framebuffer_transform_flags = framebuffer.transform_flags;
     framebuffer_crop_rect = framebuffer.crop_rect;
+    framebuffer_width = framebuffer.width;
+    framebuffer_height = framebuffer.height;
 
     const VAddr framebuffer_addr{framebuffer.address + framebuffer.offset};
     screen_info.was_accelerated =
@@ -480,9 +482,12 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
     ASSERT_MSG(framebuffer_crop_rect.top == 0, "Unimplemented");
     ASSERT_MSG(framebuffer_crop_rect.left == 0, "Unimplemented");
 
+    f32 scale_u = static_cast<f32>(framebuffer_width) / static_cast<f32>(screen_info.texture.width);
+    f32 scale_v =
+        static_cast<f32>(framebuffer_height) / static_cast<f32>(screen_info.texture.height);
+
     // Scale the output by the crop width/height. This is commonly used with 1280x720 rendering
     // (e.g. handheld mode) on a 1920x1080 framebuffer.
-    f32 scale_u = 1.f, scale_v = 1.f;
     if (framebuffer_crop_rect.GetWidth() > 0) {
         scale_u = static_cast<f32>(framebuffer_crop_rect.GetWidth()) /
                   static_cast<f32>(screen_info.texture.width);
diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h
index aa206878b..ae9558a33 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.h
+++ b/src/video_core/renderer_opengl/renderer_opengl.h
@@ -137,6 +137,8 @@ private:
     /// Used for transforming the framebuffer orientation
     Service::android::BufferTransformFlags framebuffer_transform_flags{};
     Common::Rectangle<int> framebuffer_crop_rect;
+    u32 framebuffer_width;
+    u32 framebuffer_height;
 };
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp
index 897c380b3..04c482a09 100644
--- a/src/video_core/renderer_opengl/util_shaders.cpp
+++ b/src/video_core/renderer_opengl/util_shaders.cpp
@@ -13,6 +13,7 @@
 #include "video_core/host_shaders/astc_decoder_comp.h"
 #include "video_core/host_shaders/block_linear_unswizzle_2d_comp.h"
 #include "video_core/host_shaders/block_linear_unswizzle_3d_comp.h"
+#include "video_core/host_shaders/opengl_convert_s8d24_comp.h"
 #include "video_core/host_shaders/opengl_copy_bc4_comp.h"
 #include "video_core/host_shaders/pitch_unswizzle_comp.h"
 #include "video_core/renderer_opengl/gl_shader_manager.h"
@@ -50,7 +51,8 @@ UtilShaders::UtilShaders(ProgramManager& program_manager_)
       block_linear_unswizzle_2d_program(MakeProgram(BLOCK_LINEAR_UNSWIZZLE_2D_COMP)),
       block_linear_unswizzle_3d_program(MakeProgram(BLOCK_LINEAR_UNSWIZZLE_3D_COMP)),
       pitch_unswizzle_program(MakeProgram(PITCH_UNSWIZZLE_COMP)),
-      copy_bc4_program(MakeProgram(OPENGL_COPY_BC4_COMP)) {
+      copy_bc4_program(MakeProgram(OPENGL_COPY_BC4_COMP)),
+      convert_s8d24_program(MakeProgram(OPENGL_CONVERT_S8D24_COMP)) {
     const auto swizzle_table = Tegra::Texture::MakeSwizzleTable();
     swizzle_table_buffer.Create();
     glNamedBufferStorage(swizzle_table_buffer.handle, sizeof(swizzle_table), &swizzle_table, 0);
@@ -248,6 +250,26 @@ void UtilShaders::CopyBC4(Image& dst_image, Image& src_image, std::span<const Im
     program_manager.RestoreGuestCompute();
 }
 
+void UtilShaders::ConvertS8D24(Image& dst_image, std::span<const ImageCopy> copies) {
+    static constexpr GLuint BINDING_DESTINATION = 0;
+    static constexpr GLuint LOC_SIZE = 0;
+
+    program_manager.BindComputeProgram(convert_s8d24_program.handle);
+    for (const ImageCopy& copy : copies) {
+        ASSERT(copy.src_subresource.base_layer == 0);
+        ASSERT(copy.src_subresource.num_layers == 1);
+        ASSERT(copy.dst_subresource.base_layer == 0);
+        ASSERT(copy.dst_subresource.num_layers == 1);
+
+        glUniform3ui(LOC_SIZE, copy.extent.width, copy.extent.height, copy.extent.depth);
+        glBindImageTexture(BINDING_DESTINATION, dst_image.StorageHandle(),
+                           copy.dst_subresource.base_level, GL_TRUE, 0, GL_READ_WRITE, GL_RGBA8UI);
+        glDispatchCompute(Common::DivCeil(copy.extent.width, 16u),
+                          Common::DivCeil(copy.extent.height, 8u), copy.extent.depth);
+    }
+    program_manager.RestoreGuestCompute();
+}
+
 GLenum StoreFormat(u32 bytes_per_block) {
     switch (bytes_per_block) {
     case 1:
diff --git a/src/video_core/renderer_opengl/util_shaders.h b/src/video_core/renderer_opengl/util_shaders.h
index 5de95ea7a..5c132e67f 100644
--- a/src/video_core/renderer_opengl/util_shaders.h
+++ b/src/video_core/renderer_opengl/util_shaders.h
@@ -39,6 +39,8 @@ public:
     void CopyBC4(Image& dst_image, Image& src_image,
                  std::span<const VideoCommon::ImageCopy> copies);
 
+    void ConvertS8D24(Image& dst_image, std::span<const VideoCommon::ImageCopy> copies);
+
 private:
     ProgramManager& program_manager;
 
@@ -49,6 +51,7 @@ private:
     OGLProgram block_linear_unswizzle_3d_program;
     OGLProgram pitch_unswizzle_program;
     OGLProgram copy_bc4_program;
+    OGLProgram convert_s8d24_program;
 };
 
 GLenum StoreFormat(u32 bytes_per_block);
diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp
index d893c1952..b866e9103 100644
--- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp
+++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp
@@ -1406,8 +1406,9 @@ void VKBlitScreen::SetVertexData(BufferData& data, const Tegra::FramebufferConfi
     UNIMPLEMENTED_IF(framebuffer_crop_rect.top != 0);
     UNIMPLEMENTED_IF(framebuffer_crop_rect.left != 0);
 
-    f32 scale_u = 1.0f;
-    f32 scale_v = 1.0f;
+    f32 scale_u = static_cast<f32>(framebuffer.width) / static_cast<f32>(screen_info.width);
+    f32 scale_v = static_cast<f32>(framebuffer.height) / static_cast<f32>(screen_info.height);
+
     // Scale the output by the crop width/height. This is commonly used with 1280x720 rendering
     // (e.g. handheld mode) on a 1920x1080 framebuffer.
     if (!fsr) {