diff options
| -rw-r--r-- | src/video_core/renderer_opengl/gl_device.cpp | 7 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_device.h | 5 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_rasterizer.cpp | 96 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_rasterizer.h | 16 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_resource_manager.cpp | 9 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_resource_manager.h | 16 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_shader_cache.cpp | 101 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_shader_cache.h | 15 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_shader_manager.cpp | 106 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_shader_manager.h | 56 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/renderer_opengl.cpp | 16 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/renderer_opengl.h | 5 | 
12 files changed, 339 insertions, 109 deletions
| diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index d83dca25a..466a911db 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -13,6 +13,7 @@  #include "common/logging/log.h"  #include "common/scope_exit.h" +#include "core/settings.h"  #include "video_core/renderer_opengl/gl_device.h"  #include "video_core/renderer_opengl/gl_resource_manager.h" @@ -183,10 +184,16 @@ Device::Device() : base_bindings{BuildBaseBindings()} {      has_precise_bug = TestPreciseBug();      has_broken_compute = is_intel_proprietary;      has_fast_buffer_sub_data = is_nvidia; +    use_assembly_shaders = Settings::values.use_assembly_shaders && GLAD_GL_NV_gpu_program5 && +                           GLAD_GL_NV_compute_program5;      LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi);      LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug);      LOG_INFO(Render_OpenGL, "Renderer_PreciseBug: {}", has_precise_bug); + +    if (Settings::values.use_assembly_shaders && !use_assembly_shaders) { +        LOG_ERROR(Render_OpenGL, "Assembly shaders enabled but not supported"); +    }  }  Device::Device(std::nullptr_t) { diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index a55050cb5..e915dbd86 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h @@ -88,6 +88,10 @@ public:          return has_fast_buffer_sub_data;      } +    bool UseAssemblyShaders() const { +        return use_assembly_shaders; +    } +  private:      static bool TestVariableAoffi();      static bool TestPreciseBug(); @@ -107,6 +111,7 @@ private:      bool has_precise_bug{};      bool has_broken_compute{};      bool has_fast_buffer_sub_data{}; +    bool use_assembly_shaders{};  };  } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 69dcf952f..92ca22136 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -94,17 +94,30 @@ void oglEnable(GLenum cap, bool state) {  } // Anonymous namespace  RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, -                                   ScreenInfo& info, GLShader::ProgramManager& program_manager, -                                   StateTracker& state_tracker) -    : RasterizerAccelerated{system.Memory()}, texture_cache{system, *this, device, state_tracker}, +                                   const Device& device, ScreenInfo& info, +                                   ProgramManager& program_manager, StateTracker& state_tracker) +    : RasterizerAccelerated{system.Memory()}, device{device}, texture_cache{system, *this, device, +                                                                            state_tracker},        shader_cache{*this, system, emu_window, device}, query_cache{system, *this},        buffer_cache{*this, system, device, STREAM_BUFFER_SIZE},        fence_manager{system, *this, texture_cache, buffer_cache, query_cache}, system{system},        screen_info{info}, program_manager{program_manager}, state_tracker{state_tracker} {      CheckExtensions(); + +    if (device.UseAssemblyShaders()) { +        glCreateBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data()); +        for (const GLuint cbuf : staging_cbufs) { +            glNamedBufferStorage(cbuf, static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize), +                                 nullptr, 0); +        } +    }  } -RasterizerOpenGL::~RasterizerOpenGL() {} +RasterizerOpenGL::~RasterizerOpenGL() { +    if (device.UseAssemblyShaders()) { +        glDeleteBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data()); +    } +}  void RasterizerOpenGL::CheckExtensions() {      if (!GLAD_GL_ARB_texture_filter_anisotropic && !GLAD_GL_EXT_texture_filter_anisotropic) { @@ -230,6 +243,7 @@ GLintptr RasterizerOpenGL::SetupIndexBuffer() {  void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {      MICROPROFILE_SCOPE(OpenGL_Shader);      auto& gpu = system.GPU().Maxwell3D(); +    std::size_t num_ssbos = 0;      u32 clip_distances = 0;      for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) { @@ -261,6 +275,14 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {          Shader shader{shader_cache.GetStageProgram(program)}; +        if (device.UseAssemblyShaders()) { +            // Check for ARB limitation. We only have 16 SSBOs per context state. To workaround this +            // all stages share the same bindings. +            const std::size_t num_stage_ssbos = shader->GetEntries().global_memory_entries.size(); +            ASSERT_MSG(num_stage_ssbos == 0 || num_ssbos == 0, "SSBOs on more than one stage"); +            num_ssbos += num_stage_ssbos; +        } +          // Stage indices are 0 - 5          const std::size_t stage = index == 0 ? 0 : index - 1;          SetupDrawConstBuffers(stage, shader); @@ -526,6 +548,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {      SyncFramebufferSRGB();      buffer_cache.Acquire(); +    current_cbuf = 0;      std::size_t buffer_size = CalculateVertexArraysSize(); @@ -535,9 +558,9 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {      }      // Uniform space for the 5 shader stages -    buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) + -                  (sizeof(GLShader::MaxwellUniformData) + device.GetUniformBufferAlignment()) * -                      Maxwell::MaxShaderStage; +    buffer_size = +        Common::AlignUp<std::size_t>(buffer_size, 4) + +        (sizeof(MaxwellUniformData) + device.GetUniformBufferAlignment()) * Maxwell::MaxShaderStage;      // Add space for at least 18 constant buffers      buffer_size += Maxwell::MaxConstBuffers * @@ -558,12 +581,14 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {      }      // Setup emulation uniform buffer. -    GLShader::MaxwellUniformData ubo; -    ubo.SetFromRegs(gpu); -    const auto [buffer, offset] = -        buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment()); -    glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, buffer, offset, -                      static_cast<GLsizeiptr>(sizeof(ubo))); +    if (!device.UseAssemblyShaders()) { +        MaxwellUniformData ubo; +        ubo.SetFromRegs(gpu); +        const auto [buffer, offset] = +            buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment()); +        glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, buffer, offset, +                          static_cast<GLsizeiptr>(sizeof(ubo))); +    }      // Setup shaders and their used resources.      texture_cache.GuardSamplers(true); @@ -635,11 +660,11 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {      }      buffer_cache.Acquire(); +    current_cbuf = 0;      auto kernel = shader_cache.GetComputeKernel(code_addr);      SetupComputeTextures(kernel);      SetupComputeImages(kernel); -    program_manager.BindComputeShader(kernel->GetHandle());      const std::size_t buffer_size =          Tegra::Engines::KeplerCompute::NumConstBuffers * @@ -652,6 +677,7 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {      buffer_cache.Unmap();      const auto& launch_desc = system.GPU().KeplerCompute().launch_description; +    program_manager.BindCompute(kernel->GetHandle());      glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z);      ++num_queued_commands;  } @@ -812,14 +838,20 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,  }  void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader) { +    static constexpr std::array PARAMETER_LUT = { +        GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV, +        GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV, +        GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV}; +      MICROPROFILE_SCOPE(OpenGL_UBO);      const auto& stages = system.GPU().Maxwell3D().state.shader_stages;      const auto& shader_stage = stages[stage_index]; -    u32 binding = device.GetBaseBindings(stage_index).uniform_buffer; +    u32 binding = +        device.UseAssemblyShaders() ? 0 : device.GetBaseBindings(stage_index).uniform_buffer;      for (const auto& entry : shader->GetEntries().const_buffers) {          const auto& buffer = shader_stage.const_buffers[entry.GetIndex()]; -        SetupConstBuffer(binding++, buffer, entry); +        SetupConstBuffer(PARAMETER_LUT[stage_index], binding++, buffer, entry);      }  } @@ -835,16 +867,21 @@ void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) {          buffer.address = config.Address();          buffer.size = config.size;          buffer.enabled = mask[entry.GetIndex()]; -        SetupConstBuffer(binding++, buffer, entry); +        SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding++, buffer, entry);      }  } -void RasterizerOpenGL::SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, +void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding, +                                        const Tegra::Engines::ConstBufferInfo& buffer,                                          const ConstBufferEntry& entry) {      if (!buffer.enabled) {          // Set values to zero to unbind buffers -        glBindBufferRange(GL_UNIFORM_BUFFER, binding, buffer_cache.GetEmptyBuffer(sizeof(float)), 0, -                          sizeof(float)); +        if (device.UseAssemblyShaders()) { +            glBindBufferRangeNV(stage, entry.GetIndex(), 0, 0, 0); +        } else { +            glBindBufferRange(GL_UNIFORM_BUFFER, binding, +                              buffer_cache.GetEmptyBuffer(sizeof(float)), 0, sizeof(float)); +        }          return;      } @@ -853,9 +890,19 @@ void RasterizerOpenGL::SetupConstBuffer(u32 binding, const Tegra::Engines::Const      const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4));      const auto alignment = device.GetUniformBufferAlignment(); -    const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false, -                                                          device.HasFastBufferSubData()); -    glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size); +    auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false, +                                                    device.HasFastBufferSubData()); +    if (!device.UseAssemblyShaders()) { +        glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size); +        return; +    } +    if (offset != 0) { +        const GLuint staging_cbuf = staging_cbufs[current_cbuf++]; +        glCopyNamedBufferSubData(cbuf, staging_cbuf, offset, 0, size); +        cbuf = staging_cbuf; +        offset = 0; +    } +    glBindBufferRangeNV(stage, binding, cbuf, offset, size);  }  void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader) { @@ -863,7 +910,8 @@ void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shad      auto& memory_manager{gpu.MemoryManager()};      const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]}; -    u32 binding = device.GetBaseBindings(stage_index).shader_storage_buffer; +    u32 binding = +        device.UseAssemblyShaders() ? 0 : device.GetBaseBindings(stage_index).shader_storage_buffer;      for (const auto& entry : shader->GetEntries().global_memory_entries) {          const GPUVAddr addr{cbufs.const_buffers[entry.cbuf_index].address + entry.cbuf_offset};          const GPUVAddr gpu_addr{memory_manager.Read<u64>(addr)}; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index b94c65907..87f7fe159 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -56,8 +56,8 @@ struct DrawParameters;  class RasterizerOpenGL : public VideoCore::RasterizerAccelerated {  public:      explicit RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, -                              ScreenInfo& info, GLShader::ProgramManager& program_manager, -                              StateTracker& state_tracker); +                              const Device& device, ScreenInfo& info, +                              ProgramManager& program_manager, StateTracker& state_tracker);      ~RasterizerOpenGL() override;      void Draw(bool is_indexed, bool is_instanced) override; @@ -106,7 +106,7 @@ private:      void SetupComputeConstBuffers(const Shader& kernel);      /// Configures a constant buffer. -    void SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, +    void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,                            const ConstBufferEntry& entry);      /// Configures the current global memory entries to use for the draw command. @@ -224,7 +224,7 @@ private:      void SetupShaders(GLenum primitive_mode); -    const Device device; +    const Device& device;      TextureCacheOpenGL texture_cache;      ShaderCacheOpenGL shader_cache; @@ -236,7 +236,7 @@ private:      Core::System& system;      ScreenInfo& screen_info; -    GLShader::ProgramManager& program_manager; +    ProgramManager& program_manager;      StateTracker& state_tracker;      static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024; @@ -248,6 +248,12 @@ private:      std::bitset<Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers>          enabled_transform_feedback_buffers; +    static constexpr std::size_t NUM_CONSTANT_BUFFERS = +        Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers * +        Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram; +    std::array<GLuint, NUM_CONSTANT_BUFFERS> staging_cbufs{}; +    std::size_t current_cbuf = 0; +      /// Number of commands queued to the OpenGL driver. Reseted on flush.      std::size_t num_queued_commands = 0; diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp index 97803d480..a787e27d2 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.cpp +++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp @@ -125,6 +125,15 @@ void OGLProgram::Release() {      handle = 0;  } +void OGLAssemblyProgram::Release() { +    if (handle == 0) { +        return; +    } +    MICROPROFILE_SCOPE(OpenGL_ResourceDeletion); +    glDeleteProgramsARB(1, &handle); +    handle = 0; +} +  void OGLPipeline::Create() {      if (handle != 0)          return; diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h index de93f4212..f8b322227 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.h +++ b/src/video_core/renderer_opengl/gl_resource_manager.h @@ -167,6 +167,22 @@ public:      GLuint handle = 0;  }; +class OGLAssemblyProgram : private NonCopyable { +public: +    OGLAssemblyProgram() = default; + +    OGLAssemblyProgram(OGLAssemblyProgram&& o) noexcept : handle(std::exchange(o.handle, 0)) {} + +    ~OGLAssemblyProgram() { +        Release(); +    } + +    /// Deletes the internal OpenGL resource +    void Release(); + +    GLuint handle = 0; +}; +  class OGLPipeline : private NonCopyable {  public:      OGLPipeline() = default; diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 9759a7078..4cd0f36cf 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -97,6 +97,24 @@ constexpr ShaderType GetShaderType(Maxwell::ShaderProgram program_type) {      return {};  } +constexpr GLenum AssemblyEnum(ShaderType shader_type) { +    switch (shader_type) { +    case ShaderType::Vertex: +        return GL_VERTEX_PROGRAM_NV; +    case ShaderType::TesselationControl: +        return GL_TESS_CONTROL_PROGRAM_NV; +    case ShaderType::TesselationEval: +        return GL_TESS_EVALUATION_PROGRAM_NV; +    case ShaderType::Geometry: +        return GL_GEOMETRY_PROGRAM_NV; +    case ShaderType::Fragment: +        return GL_FRAGMENT_PROGRAM_NV; +    case ShaderType::Compute: +        return GL_COMPUTE_PROGRAM_NV; +    } +    return {}; +} +  std::string MakeShaderID(u64 unique_identifier, ShaderType shader_type) {      return fmt::format("{}{:016X}", GetShaderTypeName(shader_type), unique_identifier);  } @@ -120,18 +138,43 @@ std::shared_ptr<Registry> MakeRegistry(const ShaderDiskCacheEntry& entry) {      return registry;  } -std::shared_ptr<OGLProgram> BuildShader(const Device& device, ShaderType shader_type, -                                        u64 unique_identifier, const ShaderIR& ir, -                                        const Registry& registry, bool hint_retrievable = false) { +ProgramSharedPtr BuildShader(const Device& device, ShaderType shader_type, u64 unique_identifier, +                             const ShaderIR& ir, const Registry& registry, +                             bool hint_retrievable = false) {      const std::string shader_id = MakeShaderID(unique_identifier, shader_type);      LOG_INFO(Render_OpenGL, "{}", shader_id); -    const std::string glsl = DecompileShader(device, ir, registry, shader_type, shader_id); -    OGLShader shader; -    shader.Create(glsl.c_str(), GetGLShaderType(shader_type)); +    auto program = std::make_shared<ProgramHandle>(); + +    if (device.UseAssemblyShaders()) { +        const std::string arb = "Not implemented"; + +        GLuint& arb_prog = program->assembly_program.handle; + +// Commented out functions signal OpenGL errors but are compatible with apitrace. +// Use them only to capture and replay on apitrace. +#if 0 +        glGenProgramsNV(1, &arb_prog); +        glLoadProgramNV(AssemblyEnum(shader_type), arb_prog, static_cast<GLsizei>(arb.size()), +                        reinterpret_cast<const GLubyte*>(arb.data())); +#else +        glGenProgramsARB(1, &arb_prog); +        glNamedProgramStringEXT(arb_prog, AssemblyEnum(shader_type), GL_PROGRAM_FORMAT_ASCII_ARB, +                                static_cast<GLsizei>(arb.size()), arb.data()); +#endif +        const auto err = reinterpret_cast<const char*>(glGetString(GL_PROGRAM_ERROR_STRING_NV)); +        if (err && *err) { +            LOG_CRITICAL(Render_OpenGL, "{}", err); +            LOG_INFO(Render_OpenGL, "\n{}", arb); +        } +    } else { +        const std::string glsl = DecompileShader(device, ir, registry, shader_type, shader_id); +        OGLShader shader; +        shader.Create(glsl.c_str(), GetGLShaderType(shader_type)); + +        program->source_program.Create(true, hint_retrievable, shader.handle); +    } -    auto program = std::make_shared<OGLProgram>(); -    program->Create(true, hint_retrievable, shader.handle);      return program;  } @@ -153,15 +196,22 @@ std::unordered_set<GLenum> GetSupportedFormats() {  CachedShader::CachedShader(VAddr cpu_addr, std::size_t size_in_bytes,                             std::shared_ptr<VideoCommon::Shader::Registry> registry, -                           ShaderEntries entries, std::shared_ptr<OGLProgram> program) +                           ShaderEntries entries, ProgramSharedPtr program_)      : RasterizerCacheObject{cpu_addr}, registry{std::move(registry)}, entries{std::move(entries)}, -      size_in_bytes{size_in_bytes}, program{std::move(program)} {} +      size_in_bytes{size_in_bytes}, program{std::move(program_)} { +    // Assign either the assembly program or source program. We can't have both. +    handle = program->assembly_program.handle; +    if (handle == 0) { +        handle = program->source_program.handle; +    } +    ASSERT(handle != 0); +}  CachedShader::~CachedShader() = default;  GLuint CachedShader::GetHandle() const {      DEBUG_ASSERT(registry->IsConsistent()); -    return program->handle; +    return handle;  }  Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params, @@ -239,7 +289,11 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,          return;      } -    const std::vector gl_cache = disk_cache.LoadPrecompiled(); +    std::vector<ShaderDiskCachePrecompiled> gl_cache; +    if (!device.UseAssemblyShaders()) { +        // Only load precompiled cache when we are not using assembly shaders +        gl_cache = disk_cache.LoadPrecompiled(); +    }      const auto supported_formats = GetSupportedFormats();      // Track if precompiled cache was altered during loading to know if we have to @@ -278,7 +332,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,              auto registry = MakeRegistry(entry);              const ShaderIR ir(entry.code, main_offset, COMPILER_SETTINGS, *registry); -            std::shared_ptr<OGLProgram> program; +            ProgramSharedPtr program;              if (precompiled_entry) {                  // If the shader is precompiled, attempt to load it with                  program = GeneratePrecompiledProgram(entry, *precompiled_entry, supported_formats); @@ -332,6 +386,11 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,          return;      } +    if (device.UseAssemblyShaders()) { +        // Don't store precompiled binaries for assembly shaders. +        return; +    } +      // TODO(Rodrigo): Do state tracking for transferable shaders and do a dummy draw      // before precompiling them @@ -339,7 +398,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,          const u64 id = (*transferable)[i].unique_identifier;          const auto it = find_precompiled(id);          if (it == gl_cache.end()) { -            const GLuint program = runtime_cache.at(id).program->handle; +            const GLuint program = runtime_cache.at(id).program->source_program.handle;              disk_cache.SavePrecompiled(id, program);              precompiled_cache_altered = true;          } @@ -350,7 +409,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,      }  } -std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram( +ProgramSharedPtr ShaderCacheOpenGL::GeneratePrecompiledProgram(      const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry,      const std::unordered_set<GLenum>& supported_formats) {      if (supported_formats.find(precompiled_entry.binary_format) == supported_formats.end()) { @@ -358,15 +417,15 @@ std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram(          return {};      } -    auto program = std::make_shared<OGLProgram>(); -    program->handle = glCreateProgram(); -    glProgramParameteri(program->handle, GL_PROGRAM_SEPARABLE, GL_TRUE); -    glProgramBinary(program->handle, precompiled_entry.binary_format, -                    precompiled_entry.binary.data(), +    auto program = std::make_shared<ProgramHandle>(); +    GLuint& handle = program->source_program.handle; +    handle = glCreateProgram(); +    glProgramParameteri(handle, GL_PROGRAM_SEPARABLE, GL_TRUE); +    glProgramBinary(handle, precompiled_entry.binary_format, precompiled_entry.binary.data(),                      static_cast<GLsizei>(precompiled_entry.binary.size()));      GLint link_status; -    glGetProgramiv(program->handle, GL_LINK_STATUS, &link_status); +    glGetProgramiv(handle, GL_LINK_STATUS, &link_status);      if (link_status == GL_FALSE) {          LOG_INFO(Render_OpenGL, "Precompiled cache rejected by the driver, removing");          return {}; diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index 91690b470..b2ae8d7f9 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -43,8 +43,14 @@ struct UnspecializedShader;  using Shader = std::shared_ptr<CachedShader>;  using Maxwell = Tegra::Engines::Maxwell3D::Regs; +struct ProgramHandle { +    OGLProgram source_program; +    OGLAssemblyProgram assembly_program; +}; +using ProgramSharedPtr = std::shared_ptr<ProgramHandle>; +  struct PrecompiledShader { -    std::shared_ptr<OGLProgram> program; +    ProgramSharedPtr program;      std::shared_ptr<VideoCommon::Shader::Registry> registry;      ShaderEntries entries;  }; @@ -87,12 +93,13 @@ public:  private:      explicit CachedShader(VAddr cpu_addr, std::size_t size_in_bytes,                            std::shared_ptr<VideoCommon::Shader::Registry> registry, -                          ShaderEntries entries, std::shared_ptr<OGLProgram> program); +                          ShaderEntries entries, ProgramSharedPtr program);      std::shared_ptr<VideoCommon::Shader::Registry> registry;      ShaderEntries entries;      std::size_t size_in_bytes = 0; -    std::shared_ptr<OGLProgram> program; +    ProgramSharedPtr program; +    GLuint handle = 0;  };  class ShaderCacheOpenGL final : public RasterizerCache<Shader> { @@ -115,7 +122,7 @@ protected:      void FlushObjectInner(const Shader& object) override {}  private: -    std::shared_ptr<OGLProgram> GeneratePrecompiledProgram( +    ProgramSharedPtr GeneratePrecompiledProgram(          const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry,          const std::unordered_set<GLenum>& supported_formats); diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp index 9c7b0adbd..96605db84 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.cpp +++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp @@ -6,45 +6,105 @@  #include "common/common_types.h"  #include "video_core/engines/maxwell_3d.h" +#include "video_core/renderer_opengl/gl_device.h"  #include "video_core/renderer_opengl/gl_shader_manager.h" -namespace OpenGL::GLShader { +namespace OpenGL { -ProgramManager::ProgramManager() = default; +ProgramManager::ProgramManager(const Device& device) { +    use_assembly_programs = device.UseAssemblyShaders(); +    if (use_assembly_programs) { +        glEnable(GL_COMPUTE_PROGRAM_NV); +    } else { +        graphics_pipeline.Create(); +        glBindProgramPipeline(graphics_pipeline.handle); +    } +}  ProgramManager::~ProgramManager() = default; -void ProgramManager::Create() { -    graphics_pipeline.Create(); -    glBindProgramPipeline(graphics_pipeline.handle); +void ProgramManager::BindCompute(GLuint program) { +    if (use_assembly_programs) { +        glBindProgramARB(GL_COMPUTE_PROGRAM_NV, program); +    } else { +        is_graphics_bound = false; +        glUseProgram(program); +    }  }  void ProgramManager::BindGraphicsPipeline() { -    if (!is_graphics_bound) { -        is_graphics_bound = true; -        glUseProgram(0); +    if (use_assembly_programs) { +        UpdateAssemblyPrograms(); +    } else { +        UpdateSourcePrograms();      } +} -    // Avoid updating the pipeline when values have no changed -    if (old_state == current_state) { -        return; +void ProgramManager::BindHostPipeline(GLuint pipeline) { +    if (use_assembly_programs) { +        if (geometry_enabled) { +            geometry_enabled = false; +            old_state.geometry = 0; +            glDisable(GL_GEOMETRY_PROGRAM_NV); +        }      } +    glBindProgramPipeline(pipeline); +} -    // Workaround for AMD bug -    static constexpr GLenum all_used_stages{GL_VERTEX_SHADER_BIT | GL_GEOMETRY_SHADER_BIT | -                                            GL_FRAGMENT_SHADER_BIT}; -    const GLuint handle = graphics_pipeline.handle; -    glUseProgramStages(handle, all_used_stages, 0); -    glUseProgramStages(handle, GL_VERTEX_SHADER_BIT, current_state.vertex_shader); -    glUseProgramStages(handle, GL_GEOMETRY_SHADER_BIT, current_state.geometry_shader); -    glUseProgramStages(handle, GL_FRAGMENT_SHADER_BIT, current_state.fragment_shader); +void ProgramManager::RestoreGuestPipeline() { +    if (use_assembly_programs) { +        glBindProgramPipeline(0); +    } else { +        glBindProgramPipeline(graphics_pipeline.handle); +    } +} + +void ProgramManager::UpdateAssemblyPrograms() { +    const auto update_state = [](GLenum stage, bool& enabled, GLuint current, GLuint old) { +        if (current == old) { +            return; +        } +        if (current == 0) { +            if (enabled) { +                enabled = false; +                glDisable(stage); +            } +            return; +        } +        if (!enabled) { +            enabled = true; +            glEnable(stage); +        } +        glBindProgramARB(stage, current); +    }; + +    update_state(GL_VERTEX_PROGRAM_NV, vertex_enabled, current_state.vertex, old_state.vertex); +    update_state(GL_GEOMETRY_PROGRAM_NV, geometry_enabled, current_state.geometry, +                 old_state.geometry); +    update_state(GL_FRAGMENT_PROGRAM_NV, fragment_enabled, current_state.fragment, +                 old_state.fragment);      old_state = current_state;  } -void ProgramManager::BindComputeShader(GLuint program) { -    is_graphics_bound = false; -    glUseProgram(program); +void ProgramManager::UpdateSourcePrograms() { +    if (!is_graphics_bound) { +        is_graphics_bound = true; +        glUseProgram(0); +    } + +    const GLuint handle = graphics_pipeline.handle; +    const auto update_state = [handle](GLenum stage, GLuint current, GLuint old) { +        if (current == old) { +            return; +        } +        glUseProgramStages(handle, stage, current); +    }; +    update_state(GL_VERTEX_SHADER_BIT, current_state.vertex, old_state.vertex); +    update_state(GL_GEOMETRY_SHADER_BIT, current_state.geometry, old_state.geometry); +    update_state(GL_FRAGMENT_SHADER_BIT, current_state.fragment, old_state.fragment); + +    old_state = current_state;  }  void MaxwellUniformData::SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell) { @@ -54,4 +114,4 @@ void MaxwellUniformData::SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell) {      y_direction = regs.screen_y_control.y_negate == 0 ? 1.0f : -1.0f;  } -} // namespace OpenGL::GLShader +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h index d2e47f2a9..0f03b4f12 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.h +++ b/src/video_core/renderer_opengl/gl_shader_manager.h @@ -11,7 +11,9 @@  #include "video_core/renderer_opengl/gl_resource_manager.h"  #include "video_core/renderer_opengl/maxwell_to_gl.h" -namespace OpenGL::GLShader { +namespace OpenGL { + +class Device;  /// Uniform structure for the Uniform Buffer Object, all vectors must be 16-byte aligned  /// @note Always keep a vec4 at the end. The GL spec is not clear whether the alignment at @@ -28,50 +30,58 @@ static_assert(sizeof(MaxwellUniformData) < 16384,  class ProgramManager {  public: -    explicit ProgramManager(); +    explicit ProgramManager(const Device& device);      ~ProgramManager(); -    void Create(); +    /// Binds a compute program +    void BindCompute(GLuint program); -    /// Updates the graphics pipeline and binds it. +    /// Updates bound programs.      void BindGraphicsPipeline(); -    /// Binds a compute shader. -    void BindComputeShader(GLuint program); +    /// Binds an OpenGL pipeline object unsynchronized with the guest state. +    void BindHostPipeline(GLuint pipeline); + +    /// Rewinds BindHostPipeline state changes. +    void RestoreGuestPipeline();      void UseVertexShader(GLuint program) { -        current_state.vertex_shader = program; +        current_state.vertex = program;      }      void UseGeometryShader(GLuint program) { -        current_state.geometry_shader = program; +        current_state.geometry = program;      }      void UseFragmentShader(GLuint program) { -        current_state.fragment_shader = program; +        current_state.fragment = program;      }  private:      struct PipelineState { -        bool operator==(const PipelineState& rhs) const noexcept { -            return vertex_shader == rhs.vertex_shader && fragment_shader == rhs.fragment_shader && -                   geometry_shader == rhs.geometry_shader; -        } - -        bool operator!=(const PipelineState& rhs) const noexcept { -            return !operator==(rhs); -        } - -        GLuint vertex_shader = 0; -        GLuint fragment_shader = 0; -        GLuint geometry_shader = 0; +        GLuint vertex = 0; +        GLuint geometry = 0; +        GLuint fragment = 0;      }; +    /// Update NV_gpu_program5 programs. +    void UpdateAssemblyPrograms(); + +    /// Update GLSL programs. +    void UpdateSourcePrograms(); +      OGLPipeline graphics_pipeline; -    OGLPipeline compute_pipeline; +      PipelineState current_state;      PipelineState old_state; + +    bool use_assembly_programs = false; +      bool is_graphics_bound = true; + +    bool vertex_enabled = false; +    bool geometry_enabled = false; +    bool fragment_enabled = false;  }; -} // namespace OpenGL::GLShader +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index b2a179746..6b489e6db 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -316,7 +316,7 @@ public:  RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system,                                 Core::Frontend::GraphicsContext& context)      : RendererBase{emu_window}, emu_window{emu_window}, system{system}, context{context}, -      has_debug_tool{HasDebugTool()} {} +      program_manager{device}, has_debug_tool{HasDebugTool()} {}  RendererOpenGL::~RendererOpenGL() = default; @@ -468,8 +468,9 @@ void RendererOpenGL::InitOpenGLObjects() {      vertex_program.Create(true, false, vertex_shader.handle);      fragment_program.Create(true, false, fragment_shader.handle); -    // Create program pipeline -    program_manager.Create(); +    pipeline.Create(); +    glUseProgramStages(pipeline.handle, GL_VERTEX_SHADER_BIT, vertex_program.handle); +    glUseProgramStages(pipeline.handle, GL_FRAGMENT_SHADER_BIT, fragment_program.handle);      // Generate VBO handle for drawing      vertex_buffer.Create(); @@ -508,7 +509,7 @@ void RendererOpenGL::CreateRasterizer() {      if (rasterizer) {          return;      } -    rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, screen_info, +    rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, device, screen_info,                                                      program_manager, state_tracker);  } @@ -620,10 +621,7 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {      state_tracker.NotifyClipControl();      state_tracker.NotifyAlphaTest(); -    program_manager.UseVertexShader(vertex_program.handle); -    program_manager.UseGeometryShader(0); -    program_manager.UseFragmentShader(fragment_program.handle); -    program_manager.BindGraphicsPipeline(); +    program_manager.BindHostPipeline(pipeline.handle);      glEnable(GL_CULL_FACE);      if (screen_info.display_srgb) { @@ -665,6 +663,8 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {      glClear(GL_COLOR_BUFFER_BIT);      glDrawArrays(GL_TRIANGLE_STRIP, 0, 4); + +    program_manager.RestoreGuestPipeline();  }  bool RendererOpenGL::TryPresent(int timeout_ms) { diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h index 50b647661..61bf507f4 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.h +++ b/src/video_core/renderer_opengl/renderer_opengl.h @@ -9,6 +9,7 @@  #include "common/common_types.h"  #include "common/math_util.h"  #include "video_core/renderer_base.h" +#include "video_core/renderer_opengl/gl_device.h"  #include "video_core/renderer_opengl/gl_resource_manager.h"  #include "video_core/renderer_opengl/gl_shader_manager.h"  #include "video_core/renderer_opengl/gl_state_tracker.h" @@ -95,6 +96,7 @@ private:      Core::Frontend::EmuWindow& emu_window;      Core::System& system;      Core::Frontend::GraphicsContext& context; +    const Device device;      StateTracker state_tracker{system}; @@ -102,13 +104,14 @@ private:      OGLBuffer vertex_buffer;      OGLProgram vertex_program;      OGLProgram fragment_program; +    OGLPipeline pipeline;      OGLFramebuffer screenshot_framebuffer;      /// Display information for Switch screen      ScreenInfo screen_info;      /// Global dummy shader pipeline -    GLShader::ProgramManager program_manager; +    ProgramManager program_manager;      /// OpenGL framebuffer data      std::vector<u8> gl_framebuffer_data; | 
