diff options
| author | ReinUsesLisp <reinuseslisp@airmail.cc> | 2019-11-02 04:08:31 -0300 | 
|---|---|---|
| committer | ReinUsesLisp <reinuseslisp@airmail.cc> | 2019-11-02 05:05:34 -0300 | 
| commit | 76ca2a5f82f4df64cb839af42c93acb6705411ae (patch) | |
| tree | 22ef46bef8f32d8b2aa2f3928a96b30b3f69e213 /src/video_core | |
| parent | 11e39da02bec92fe4a332bfb737323ccb8087626 (diff) | |
gl_rasterizer: Upload constant buffers with glNamedBufferSubData
Nvidia's OpenGL driver maps gl(Named)BufferSubData with some requirements
to a fast path. This path has an extra memcpy but updates the buffer without
orphaning or waiting for previous calls. It can be seen as a better
model for "push constants" that can upload a whole UBO instead of 256
bytes.
This path has some requirements established here:
http://on-demand.gputechconf.com/gtc/2014/presentations/S4379-opengl-44-scene-rendering-techniques.pdf#page=24
Instead of using the stream buffer, this commit moves constant buffer
uploads to calls of glNamedBufferSubData, and from my testing it brings a
performance improvement. This is disabled when the vendor is not Nvidia
since it brings performance regressions.
Diffstat (limited to 'src/video_core')
| -rw-r--r-- | src/video_core/buffer_cache/buffer_cache.h | 14 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_buffer_cache.cpp | 31 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_buffer_cache.h | 20 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_device.cpp | 4 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_device.h | 5 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_rasterizer.cpp | 29 | 
6 files changed, 84 insertions, 19 deletions
| diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 2442ddfd6..63b3a8205 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -30,7 +30,7 @@ public:      using BufferInfo = std::pair<const TBufferType*, u64>;      BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4, -                            bool is_written = false) { +                            bool is_written = false, bool use_fast_cbuf = false) {          std::lock_guard lock{mutex};          auto& memory_manager = system.GPU().MemoryManager(); @@ -43,9 +43,13 @@ public:          // Cache management is a big overhead, so only cache entries with a given size.          // TODO: Figure out which size is the best for given games.          constexpr std::size_t max_stream_size = 0x800; -        if (size < max_stream_size) { +        if (use_fast_cbuf || size < max_stream_size) {              if (!is_written && !IsRegionWritten(cache_addr, cache_addr + size - 1)) { -                return StreamBufferUpload(host_ptr, size, alignment); +                if (use_fast_cbuf) { +                    return ConstBufferUpload(host_ptr, size); +                } else { +                    return StreamBufferUpload(host_ptr, size, alignment); +                }              }          } @@ -152,6 +156,10 @@ protected:      virtual void CopyBlock(const TBuffer& src, const TBuffer& dst, std::size_t src_offset,                             std::size_t dst_offset, std::size_t size) = 0; +    virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) { +        return {}; +    } +      /// Register an object into the cache      void Register(const MapInterval& new_map, bool inherit_written = false) {          const CacheAddr cache_ptr = new_map->GetStart(); diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp 
b/src/video_core/renderer_opengl/gl_buffer_cache.cpp index f8a807c84..0375fca17 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp @@ -8,13 +8,17 @@  #include "common/assert.h"  #include "common/microprofile.h" +#include "video_core/engines/maxwell_3d.h"  #include "video_core/rasterizer_interface.h"  #include "video_core/renderer_opengl/gl_buffer_cache.h" +#include "video_core/renderer_opengl/gl_device.h"  #include "video_core/renderer_opengl/gl_rasterizer.h"  #include "video_core/renderer_opengl/gl_resource_manager.h"  namespace OpenGL { +using Maxwell = Tegra::Engines::Maxwell3D::Regs; +  MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128));  CachedBufferBlock::CachedBufferBlock(CacheAddr cache_addr, const std::size_t size) @@ -26,11 +30,22 @@ CachedBufferBlock::CachedBufferBlock(CacheAddr cache_addr, const std::size_t siz  CachedBufferBlock::~CachedBufferBlock() = default;  OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, -                               std::size_t stream_size) -    : VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>{ -          rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} {} +                               const Device& device, std::size_t stream_size) +    : GenericBufferCache{rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} { +    if (!device.HasFastBufferSubData()) { +        return; +    } + +    static constexpr auto size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize); +    glCreateBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs)); +    for (const GLuint cbuf : cbufs) { +        glNamedBufferData(cbuf, size, nullptr, GL_STREAM_DRAW); +    } +} -OGLBufferCache::~OGLBufferCache() = default; +OGLBufferCache::~OGLBufferCache() { +    glDeleteBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs)); +}  
Buffer OGLBufferCache::CreateBlock(CacheAddr cache_addr, std::size_t size) {      return std::make_shared<CachedBufferBlock>(cache_addr, size); @@ -69,4 +84,12 @@ void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t                               static_cast<GLsizeiptr>(size));  } +OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer, +                                                             std::size_t size) { +    DEBUG_ASSERT(cbuf_cursor < std::size(cbufs)); +    const GLuint& cbuf = cbufs[cbuf_cursor++]; +    glNamedBufferSubData(cbuf, 0, static_cast<GLsizeiptr>(size), raw_pointer); +    return {&cbuf, 0}; +} +  } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index 022e7bfa9..8c7145443 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -4,10 +4,12 @@  #pragma once +#include <array>  #include <memory>  #include "common/common_types.h"  #include "video_core/buffer_cache/buffer_cache.h" +#include "video_core/engines/maxwell_3d.h"  #include "video_core/rasterizer_cache.h"  #include "video_core/renderer_opengl/gl_resource_manager.h"  #include "video_core/renderer_opengl/gl_stream_buffer.h" @@ -18,12 +20,14 @@ class System;  namespace OpenGL { +class Device;  class OGLStreamBuffer;  class RasterizerOpenGL;  class CachedBufferBlock;  using Buffer = std::shared_ptr<CachedBufferBlock>; +using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>;  class CachedBufferBlock : public VideoCommon::BufferBlock {  public: @@ -38,14 +42,18 @@ private:      OGLBuffer gl_buffer{};  }; -class OGLBufferCache final : public VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer> { +class OGLBufferCache final : public GenericBufferCache {  public:      explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, -               
             std::size_t stream_size); +                            const Device& device, std::size_t stream_size);      ~OGLBufferCache();      const GLuint* GetEmptyBuffer(std::size_t) override; +    void Acquire() noexcept { +        cbuf_cursor = 0; +    } +  protected:      Buffer CreateBlock(CacheAddr cache_addr, std::size_t size) override; @@ -61,6 +69,14 @@ protected:      void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,                     std::size_t dst_offset, std::size_t size) override; + +    BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) override; + +private: +    std::size_t cbuf_cursor = 0; +    std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers * +                           Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram> +        cbufs;  };  } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index 64de7e425..c65b24c69 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -51,8 +51,11 @@ bool HasExtension(const std::vector<std::string_view>& images, std::string_view  } // Anonymous namespace  Device::Device() { +    const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));      const std::vector extensions = GetExtensions(); +    const bool is_nvidia = vendor == "NVIDIA Corporation"; +      uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);      shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);      max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS); @@ -64,6 +67,7 @@ Device::Device() {      has_variable_aoffi = TestVariableAoffi();      has_component_indexing_bug = TestComponentIndexingBug();      has_precise_bug = TestPreciseBug(); +    has_fast_buffer_sub_data = is_nvidia;      LOG_INFO(Render_OpenGL, 
"Renderer_VariableAOFFI: {}", has_variable_aoffi);      LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug); diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index bb273c3d6..bf35bd0b6 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h @@ -54,6 +54,10 @@ public:          return has_precise_bug;      } +    bool HasFastBufferSubData() const { +        return has_fast_buffer_sub_data; +    } +  private:      static bool TestVariableAoffi();      static bool TestComponentIndexingBug(); @@ -69,6 +73,7 @@ private:      bool has_variable_aoffi{};      bool has_component_indexing_bug{};      bool has_precise_bug{}; +    bool has_fast_buffer_sub_data{};  };  } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 6a4d2c83a..28fa8a8be 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -67,7 +67,7 @@ static std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buf  RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,                                     ScreenInfo& info)      : texture_cache{system, *this, device}, shader_cache{*this, system, emu_window, device}, -      system{system}, screen_info{info}, buffer_cache{*this, system, STREAM_BUFFER_SIZE} { +      system{system}, screen_info{info}, buffer_cache{*this, system, device, STREAM_BUFFER_SIZE} {      shader_program_manager = std::make_unique<GLShader::ProgramManager>();      state.draw.shader_program = 0;      state.Apply(); @@ -558,6 +558,8 @@ void RasterizerOpenGL::DrawPrelude() {      SyncPolygonOffset();      SyncAlphaTest(); +    buffer_cache.Acquire(); +      // Draw the vertex batch      const bool is_indexed = accelerate_draw == AccelDraw::Indexed; @@ -573,9 
+575,11 @@ void RasterizerOpenGL::DrawPrelude() {                    (sizeof(GLShader::MaxwellUniformData) + device.GetUniformBufferAlignment()) *                        Maxwell::MaxShaderStage; -    // Add space for at least 18 constant buffers -    buffer_size += Maxwell::MaxConstBuffers * -                   (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); +    if (!device.HasFastBufferSubData()) { +        // Add space for at least 18 constant buffers +        buffer_size += Maxwell::MaxConstBuffers * +                       (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); +    }      // Prepare the vertex array.      buffer_cache.Map(buffer_size); @@ -739,10 +743,12 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {      state.draw.shader_program = program;      state.draw.program_pipeline = 0; -    const std::size_t buffer_size = -        Tegra::Engines::KeplerCompute::NumConstBuffers * -        (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); -    buffer_cache.Map(buffer_size); +    if (!device.HasFastBufferSubData()) { +        const std::size_t buffer_size = +            Tegra::Engines::KeplerCompute::NumConstBuffers * +            (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); +        buffer_cache.Map(buffer_size); +    }      bind_ubo_pushbuffer.Setup(0);      bind_ssbo_pushbuffer.Setup(0); @@ -750,7 +756,9 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {      SetupComputeConstBuffers(kernel);      SetupComputeGlobalMemory(kernel); -    buffer_cache.Unmap(); +    if (!device.HasFastBufferSubData()) { +        buffer_cache.Unmap(); +    }      bind_ubo_pushbuffer.Bind();      bind_ssbo_pushbuffer.Bind(); @@ -879,7 +887,8 @@ void RasterizerOpenGL::SetupConstBuffer(const Tegra::Engines::ConstBufferInfo& b      const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4));      const auto alignment = 
device.GetUniformBufferAlignment(); -    const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment); +    const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false, +                                                          device.HasFastBufferSubData());      bind_ubo_pushbuffer.Push(cbuf, offset, size);  } | 
