diff options
| -rw-r--r-- | src/video_core/engines/shader_bytecode.h | 11 | ||||
| -rw-r--r-- | src/video_core/gpu.h | 1 | ||||
| -rw-r--r-- | src/video_core/morton.cpp | 2 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_shader_decompiler.cpp | 15 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/gl_texture_cache.cpp | 1 | ||||
| -rw-r--r-- | src/video_core/renderer_opengl/renderer_opengl.cpp | 288 | ||||
| -rw-r--r-- | src/video_core/renderer_vulkan/maxwell_to_vk.cpp | 3 | ||||
| -rw-r--r-- | src/video_core/renderer_vulkan/vk_device.cpp | 1 | ||||
| -rw-r--r-- | src/video_core/renderer_vulkan/vk_rasterizer.cpp | 2 | ||||
| -rw-r--r-- | src/video_core/renderer_vulkan/vk_shader_decompiler.cpp | 2 | ||||
| -rw-r--r-- | src/video_core/shader/decode/bfe.cpp | 69 | ||||
| -rw-r--r-- | src/video_core/shader/node_helper.cpp | 2 | ||||
| -rw-r--r-- | src/video_core/surface.cpp | 2 | ||||
| -rw-r--r-- | src/video_core/surface.h | 142 | ||||
| -rw-r--r-- | src/video_core/texture_cache/format_lookup_table.cpp | 3 | 
15 files changed, 320 insertions, 224 deletions
| diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index c9bc83cd7..eba42deb4 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -911,14 +911,9 @@ union Instruction {      } fadd32i;      union { -        BitField<20, 8, u64> shift_position; -        BitField<28, 8, u64> shift_length; -        BitField<48, 1, u64> negate_b; -        BitField<49, 1, u64> negate_a; - -        u64 GetLeftShiftValue() const { -            return 32 - (shift_position + shift_length); -        } +        BitField<40, 1, u64> brev; +        BitField<47, 1, u64> rd_cc; +        BitField<48, 1, u64> is_signed;      } bfe;      union { diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index ba8c9d665..64acb17df 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -39,6 +39,7 @@ enum class RenderTargetFormat : u32 {      RGBA32_FLOAT = 0xC0,      RGBA32_UINT = 0xC2,      RGBA16_UNORM = 0xC6, +    RGBA16_SNORM = 0xC7,      RGBA16_UINT = 0xC9,      RGBA16_FLOAT = 0xCA,      RG32_FLOAT = 0xCB, diff --git a/src/video_core/morton.cpp b/src/video_core/morton.cpp index f2c83266e..6d522c318 100644 --- a/src/video_core/morton.cpp +++ b/src/video_core/morton.cpp @@ -51,6 +51,7 @@ static constexpr ConversionArray morton_to_linear_fns = {      MortonCopy<true, PixelFormat::R8UI>,      MortonCopy<true, PixelFormat::RGBA16F>,      MortonCopy<true, PixelFormat::RGBA16U>, +    MortonCopy<true, PixelFormat::RGBA16S>,      MortonCopy<true, PixelFormat::RGBA16UI>,      MortonCopy<true, PixelFormat::R11FG11FB10F>,      MortonCopy<true, PixelFormat::RGBA32UI>, @@ -131,6 +132,7 @@ static constexpr ConversionArray linear_to_morton_fns = {      MortonCopy<false, PixelFormat::R8U>,      MortonCopy<false, PixelFormat::R8UI>,      MortonCopy<false, PixelFormat::RGBA16F>, +    MortonCopy<false, PixelFormat::RGBA16S>,      MortonCopy<false, PixelFormat::RGBA16U>,      MortonCopy<false, PixelFormat::RGBA16UI>,      MortonCopy<false, PixelFormat::R11FG11FB10F>, diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index 3adf7f0cb..849839fe3 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -2009,16 +2009,19 @@ private:          expr += GetSampler(meta->sampler);          expr += ", "; -        expr += constructors.at(operation.GetOperandsCount() - 1); +        expr += constructors.at(operation.GetOperandsCount() + (meta->array ? 1 : 0) - 1);          expr += '(';          for (std::size_t i = 0; i < count; ++i) { -            expr += VisitOperand(operation, i).AsInt(); -            const std::size_t next = i + 1; -            if (next == count) -                expr += ')'; -            else if (next < count) +            if (i > 0) {                  expr += ", "; +            } +            expr += VisitOperand(operation, i).AsInt();          } +        if (meta->array) { +            expr += ", "; +            expr += Visit(meta->array).AsInt(); +        } +        expr += ')';          if (meta->lod && !meta->sampler.IsBuffer()) {              expr += ", "; diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 2d3838a7a..f424e3000 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -53,6 +53,7 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format      {GL_R8UI, GL_RED_INTEGER, GL_UNSIGNED_BYTE, false},                             // R8UI      {GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT, false},                                    // RGBA16F      {GL_RGBA16, GL_RGBA, GL_UNSIGNED_SHORT, false},                                 // RGBA16U +    {GL_RGBA16_SNORM, GL_RGBA, GL_SHORT, false},                                    // RGBA16S      {GL_RGBA16UI, GL_RGBA_INTEGER, GL_UNSIGNED_SHORT, false},                       // RGBA16UI      {GL_R11F_G11F_B10F, GL_RGB, GL_UNSIGNED_INT_10F_11F_11F_REV, false},            // R11FG11FB10F      {GL_RGBA32UI, GL_RGBA_INTEGER, GL_UNSIGNED_INT, false},                         // RGBA32UI diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index 12333e8c9..fca5e3ec0 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -5,8 +5,11 @@  #include <algorithm>  #include <cstddef>  #include <cstdlib> +#include <cstring>  #include <memory> +  #include <glad/glad.h> +  #include "common/assert.h"  #include "common/logging/log.h"  #include "common/microprofile.h" @@ -25,6 +28,8 @@  namespace OpenGL { +namespace { +  // If the size of this is too small, it ends up creating a soft cap on FPS as the renderer will have  // to wait on available presentation frames.  constexpr std::size_t SWAP_CHAIN_SIZE = 3; @@ -41,124 +46,6 @@ struct Frame {      bool is_srgb{};                   /// Framebuffer is sRGB or RGB  }; -/** - * For smooth Vsync rendering, we want to always present the latest frame that the core generates, - * but also make sure that rendering happens at the pace that the frontend dictates. This is a - * helper class that the renderer uses to sync frames between the render thread and the presentation - * thread - */ -class FrameMailbox { -public: -    std::mutex swap_chain_lock; -    std::condition_variable present_cv; -    std::array<Frame, SWAP_CHAIN_SIZE> swap_chain{}; -    std::queue<Frame*> free_queue; -    std::deque<Frame*> present_queue; -    Frame* previous_frame{}; - -    FrameMailbox() { -        for (auto& frame : swap_chain) { -            free_queue.push(&frame); -        } -    } - -    ~FrameMailbox() { -        // lock the mutex and clear out the present and free_queues and notify any people who are -        // blocked to prevent deadlock on shutdown -        std::scoped_lock lock{swap_chain_lock}; -        std::queue<Frame*>().swap(free_queue); -        present_queue.clear(); -        present_cv.notify_all(); -    } - -    void ReloadPresentFrame(Frame* frame, u32 height, u32 width) { -        frame->present.Release(); -        frame->present.Create(); -        GLint previous_draw_fbo{}; -        glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &previous_draw_fbo); -        glBindFramebuffer(GL_FRAMEBUFFER, frame->present.handle); -        glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, -                                  frame->color.handle); -        if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) { -            LOG_CRITICAL(Render_OpenGL, "Failed to recreate present FBO!"); -        } -        glBindFramebuffer(GL_DRAW_FRAMEBUFFER, previous_draw_fbo); -        frame->color_reloaded = false; -    } - -    void ReloadRenderFrame(Frame* frame, u32 width, u32 height) { -        // Recreate the color texture attachment -        frame->color.Release(); -        frame->color.Create(); -        const GLenum internal_format = frame->is_srgb ? GL_SRGB8 : GL_RGB8; -        glNamedRenderbufferStorage(frame->color.handle, internal_format, width, height); - -        // Recreate the FBO for the render target -        frame->render.Release(); -        frame->render.Create(); -        glBindFramebuffer(GL_FRAMEBUFFER, frame->render.handle); -        glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, -                                  frame->color.handle); -        if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) { -            LOG_CRITICAL(Render_OpenGL, "Failed to recreate render FBO!"); -        } - -        frame->width = width; -        frame->height = height; -        frame->color_reloaded = true; -    } - -    Frame* GetRenderFrame() { -        std::unique_lock lock{swap_chain_lock}; - -        // If theres no free frames, we will reuse the oldest render frame -        if (free_queue.empty()) { -            auto frame = present_queue.back(); -            present_queue.pop_back(); -            return frame; -        } - -        Frame* frame = free_queue.front(); -        free_queue.pop(); -        return frame; -    } - -    void ReleaseRenderFrame(Frame* frame) { -        std::unique_lock lock{swap_chain_lock}; -        present_queue.push_front(frame); -        present_cv.notify_one(); -    } - -    Frame* TryGetPresentFrame(int timeout_ms) { -        std::unique_lock lock{swap_chain_lock}; -        // wait for new entries in the present_queue -        present_cv.wait_for(lock, std::chrono::milliseconds(timeout_ms), -                            [&] { return !present_queue.empty(); }); -        if (present_queue.empty()) { -            // timed out waiting for a frame to draw so return the previous frame -            return previous_frame; -        } - -        // free the previous frame and add it back to the free queue -        if (previous_frame) { -            free_queue.push(previous_frame); -        } - -        // the newest entries are pushed to the front of the queue -        Frame* frame = present_queue.front(); -        present_queue.pop_front(); -        // remove all old entries from the present queue and move them back to the free_queue -        for (auto f : present_queue) { -            free_queue.push(f); -        } -        present_queue.clear(); -        previous_frame = frame; -        return frame; -    } -}; - -namespace { -  constexpr char VERTEX_SHADER[] = R"(  #version 430 core @@ -211,6 +98,24 @@ struct ScreenRectVertex {      std::array<GLfloat, 2> tex_coord;  }; +/// Returns true if any debug tool is attached +bool HasDebugTool() { +    const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED"); +    if (nsight) { +        return true; +    } + +    GLint num_extensions; +    glGetIntegerv(GL_NUM_EXTENSIONS, &num_extensions); +    for (GLuint index = 0; index < static_cast<GLuint>(num_extensions); ++index) { +        const auto name = reinterpret_cast<const char*>(glGetStringi(GL_EXTENSIONS, index)); +        if (!std::strcmp(name, "GL_EXT_debug_tool")) { +            return true; +        } +    } +    return false; +} +  /**   * Defines a 1:1 pixel ortographic projection matrix with (0,0) on the top-left   * corner and (width, height) on the lower-bottom. @@ -294,6 +199,153 @@ void APIENTRY DebugHandler(GLenum source, GLenum type, GLuint id, GLenum severit  } // Anonymous namespace +/** + * For smooth Vsync rendering, we want to always present the latest frame that the core generates, + * but also make sure that rendering happens at the pace that the frontend dictates. This is a + * helper class that the renderer uses to sync frames between the render thread and the presentation + * thread + */ +class FrameMailbox { +public: +    std::mutex swap_chain_lock; +    std::condition_variable present_cv; +    std::array<Frame, SWAP_CHAIN_SIZE> swap_chain{}; +    std::queue<Frame*> free_queue; +    std::deque<Frame*> present_queue; +    Frame* previous_frame{}; + +    FrameMailbox() : has_debug_tool{HasDebugTool()} { +        for (auto& frame : swap_chain) { +            free_queue.push(&frame); +        } +    } + +    ~FrameMailbox() { +        // lock the mutex and clear out the present and free_queues and notify any people who are +        // blocked to prevent deadlock on shutdown +        std::scoped_lock lock{swap_chain_lock}; +        std::queue<Frame*>().swap(free_queue); +        present_queue.clear(); +        present_cv.notify_all(); +    } + +    void ReloadPresentFrame(Frame* frame, u32 height, u32 width) { +        frame->present.Release(); +        frame->present.Create(); +        GLint previous_draw_fbo{}; +        glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &previous_draw_fbo); +        glBindFramebuffer(GL_FRAMEBUFFER, frame->present.handle); +        glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, +                                  frame->color.handle); +        if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) { +            LOG_CRITICAL(Render_OpenGL, "Failed to recreate present FBO!"); +        } +        glBindFramebuffer(GL_DRAW_FRAMEBUFFER, previous_draw_fbo); +        frame->color_reloaded = false; +    } + +    void ReloadRenderFrame(Frame* frame, u32 width, u32 height) { +        // Recreate the color texture attachment +        frame->color.Release(); +        frame->color.Create(); +        const GLenum internal_format = frame->is_srgb ? GL_SRGB8 : GL_RGB8; +        glNamedRenderbufferStorage(frame->color.handle, internal_format, width, height); + +        // Recreate the FBO for the render target +        frame->render.Release(); +        frame->render.Create(); +        glBindFramebuffer(GL_FRAMEBUFFER, frame->render.handle); +        glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, +                                  frame->color.handle); +        if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) { +            LOG_CRITICAL(Render_OpenGL, "Failed to recreate render FBO!"); +        } + +        frame->width = width; +        frame->height = height; +        frame->color_reloaded = true; +    } + +    Frame* GetRenderFrame() { +        std::unique_lock lock{swap_chain_lock}; + +        // If theres no free frames, we will reuse the oldest render frame +        if (free_queue.empty()) { +            auto frame = present_queue.back(); +            present_queue.pop_back(); +            return frame; +        } + +        Frame* frame = free_queue.front(); +        free_queue.pop(); +        return frame; +    } + +    void ReleaseRenderFrame(Frame* frame) { +        std::unique_lock lock{swap_chain_lock}; +        present_queue.push_front(frame); +        present_cv.notify_one(); + +        DebugNotifyNextFrame(); +    } + +    Frame* TryGetPresentFrame(int timeout_ms) { +        DebugWaitForNextFrame(); + +        std::unique_lock lock{swap_chain_lock}; +        // wait for new entries in the present_queue +        present_cv.wait_for(lock, std::chrono::milliseconds(timeout_ms), +                            [&] { return !present_queue.empty(); }); +        if (present_queue.empty()) { +            // timed out waiting for a frame to draw so return the previous frame +            return previous_frame; +        } + +        // free the previous frame and add it back to the free queue +        if (previous_frame) { +            free_queue.push(previous_frame); +        } + +        // the newest entries are pushed to the front of the queue +        Frame* frame = present_queue.front(); +        present_queue.pop_front(); +        // remove all old entries from the present queue and move them back to the free_queue +        for (auto f : present_queue) { +            free_queue.push(f); +        } +        present_queue.clear(); +        previous_frame = frame; +        return frame; +    } + +private: +    std::mutex debug_synch_mutex; +    std::condition_variable debug_synch_condition; +    std::atomic_int frame_for_debug{}; +    const bool has_debug_tool; // When true, using a GPU debugger, so keep frames in lock-step + +    /// Signal that a new frame is available (called from GPU thread) +    void DebugNotifyNextFrame() { +        if (!has_debug_tool) { +            return; +        } +        frame_for_debug++; +        std::lock_guard lock{debug_synch_mutex}; +        debug_synch_condition.notify_one(); +    } + +    /// Wait for a new frame to be available (called from presentation thread) +    void DebugWaitForNextFrame() { +        if (!has_debug_tool) { +            return; +        } +        const int last_frame = frame_for_debug; +        std::unique_lock lock{debug_synch_mutex}; +        debug_synch_condition.wait(lock, +                                   [this, last_frame] { return frame_for_debug > last_frame; }); +    } +}; +  RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system)      : VideoCore::RendererBase{emu_window}, emu_window{emu_window}, system{system},        frame_mailbox{std::make_unique<FrameMailbox>()} {} diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index 72880d7ea..0e2e5e6c7 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -125,6 +125,7 @@ struct FormatTuple {      {vk::Format::eR8Uint, Attachable | Storage},                 // R8UI      {vk::Format::eR16G16B16A16Sfloat, Attachable | Storage},     // RGBA16F      {vk::Format::eR16G16B16A16Unorm, Attachable | Storage},      // RGBA16U +    {vk::Format::eR16G16B16A16Snorm, Attachable | Storage},      // RGBA16S      {vk::Format::eR16G16B16A16Uint, Attachable | Storage},       // RGBA16UI      {vk::Format::eB10G11R11UfloatPack32, Attachable | Storage},  // R11FG11FB10F      {vk::Format::eR32G32B32A32Uint, Attachable | Storage},       // RGBA32UI @@ -331,6 +332,8 @@ vk::Format VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttr              return vk::Format::eR16G16B16Unorm;          case Maxwell::VertexAttribute::Size::Size_16_16_16_16:              return vk::Format::eR16G16B16A16Unorm; +        case Maxwell::VertexAttribute::Size::Size_10_10_10_2: +            return vk::Format::eA2B10G10R10UnormPack32;          default:              break;          } diff --git a/src/video_core/renderer_vulkan/vk_device.cpp b/src/video_core/renderer_vulkan/vk_device.cpp index 3847bd722..28d2fbc4f 100644 --- a/src/video_core/renderer_vulkan/vk_device.cpp +++ b/src/video_core/renderer_vulkan/vk_device.cpp @@ -535,6 +535,7 @@ std::unordered_map<vk::Format, vk::FormatProperties> VKDevice::GetFormatProperti                                          vk::Format::eR32G32Sfloat,                                          vk::Format::eR32G32Uint,                                          vk::Format::eR16G16B16A16Uint, +                                        vk::Format::eR16G16B16A16Snorm,                                          vk::Format::eR16G16B16A16Unorm,                                          vk::Format::eR16G16Unorm,                                          vk::Format::eR16G16Snorm, diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index f889019c1..c9886cc16 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -1151,7 +1151,7 @@ std::size_t RasterizerVulkan::CalculateVertexArraysSize() const {          // This implementation assumes that all attributes are used in the shader.          const GPUVAddr start{regs.vertex_array[index].StartAddress()};          const GPUVAddr end{regs.vertex_array_limit[index].LimitAddress()}; -        DEBUG_ASSERT(end > start); +        DEBUG_ASSERT(end >= start);          size += (end - start + 1) * regs.vertex_array[index].enable;      } diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index b2c298051..51ecb5567 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ -837,7 +837,7 @@ private:                  Decorate(id, spv::Decoration::Offset, static_cast<u32>(tfb->offset));              } -            element += static_cast<u8>(num_components); +            element = static_cast<u8>(static_cast<std::size_t>(element) + num_components);          }      } diff --git a/src/video_core/shader/decode/bfe.cpp b/src/video_core/shader/decode/bfe.cpp index e02bcd097..8e3b46e8e 100644 --- a/src/video_core/shader/decode/bfe.cpp +++ b/src/video_core/shader/decode/bfe.cpp @@ -17,33 +17,60 @@ u32 ShaderIR::DecodeBfe(NodeBlock& bb, u32 pc) {      const Instruction instr = {program_code[pc]};      const auto opcode = OpCode::Decode(instr); -    UNIMPLEMENTED_IF(instr.bfe.negate_b); -      Node op_a = GetRegister(instr.gpr8); -    op_a = GetOperandAbsNegInteger(op_a, false, instr.bfe.negate_a, false); - -    switch (opcode->get().GetId()) { -    case OpCode::Id::BFE_IMM: { -        UNIMPLEMENTED_IF_MSG(instr.generates_cc, -                             "Condition codes generation in BFE is not implemented"); +    Node op_b = [&] { +        switch (opcode->get().GetId()) { +        case OpCode::Id::BFE_R: +            return GetRegister(instr.gpr20); +        case OpCode::Id::BFE_C: +            return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()); +        case OpCode::Id::BFE_IMM: +            return Immediate(instr.alu.GetSignedImm20_20()); +        default: +            UNREACHABLE(); +            return Immediate(0); +        } +    }(); -        const Node inner_shift_imm = Immediate(static_cast<u32>(instr.bfe.GetLeftShiftValue())); -        const Node outer_shift_imm = -            Immediate(static_cast<u32>(instr.bfe.GetLeftShiftValue() + instr.bfe.shift_position)); +    UNIMPLEMENTED_IF_MSG(instr.bfe.rd_cc, "Condition codes in BFE is not implemented"); -        const Node inner_shift = -            Operation(OperationCode::ILogicalShiftLeft, NO_PRECISE, op_a, inner_shift_imm); -        const Node outer_shift = -            Operation(OperationCode::ILogicalShiftRight, NO_PRECISE, inner_shift, outer_shift_imm); +    const bool is_signed = instr.bfe.is_signed; -        SetInternalFlagsFromInteger(bb, outer_shift, instr.generates_cc); -        SetRegister(bb, instr.gpr0, outer_shift); -        break; -    } -    default: -        UNIMPLEMENTED_MSG("Unhandled BFE instruction: {}", opcode->get().GetName()); +    // using reverse parallel method in +    // https://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel +    // note for later if possible to implement faster method. +    if (instr.bfe.brev) { +        const auto swap = [&](u32 s, u32 mask) { +            Node v1 = +                SignedOperation(OperationCode::ILogicalShiftRight, is_signed, op_a, Immediate(s)); +            if (mask != 0) { +                v1 = SignedOperation(OperationCode::IBitwiseAnd, is_signed, std::move(v1), +                                     Immediate(mask)); +            } +            Node v2 = op_a; +            if (mask != 0) { +                v2 = SignedOperation(OperationCode::IBitwiseAnd, is_signed, std::move(v2), +                                     Immediate(mask)); +            } +            v2 = SignedOperation(OperationCode::ILogicalShiftLeft, is_signed, std::move(v2), +                                 Immediate(s)); +            return SignedOperation(OperationCode::IBitwiseOr, is_signed, std::move(v1), +                                   std::move(v2)); +        }; +        op_a = swap(1, 0x55555555U); +        op_a = swap(2, 0x33333333U); +        op_a = swap(4, 0x0F0F0F0FU); +        op_a = swap(8, 0x00FF00FFU); +        op_a = swap(16, 0);      } +    const auto offset = SignedOperation(OperationCode::IBitfieldExtract, is_signed, op_b, +                                        Immediate(0), Immediate(8)); +    const auto bits = SignedOperation(OperationCode::IBitfieldExtract, is_signed, op_b, +                                      Immediate(8), Immediate(8)); +    auto result = SignedOperation(OperationCode::IBitfieldExtract, is_signed, op_a, offset, bits); +    SetRegister(bb, instr.gpr0, std::move(result)); +      return pc;  } diff --git a/src/video_core/shader/node_helper.cpp b/src/video_core/shader/node_helper.cpp index b3dcd291c..76c56abb5 100644 --- a/src/video_core/shader/node_helper.cpp +++ b/src/video_core/shader/node_helper.cpp @@ -68,6 +68,8 @@ OperationCode SignedToUnsignedCode(OperationCode operation_code, bool is_signed)          return OperationCode::UBitwiseXor;      case OperationCode::IBitwiseNot:          return OperationCode::UBitwiseNot; +    case OperationCode::IBitfieldExtract: +        return OperationCode::UBitfieldExtract;      case OperationCode::IBitfieldInsert:          return OperationCode::UBitfieldInsert;      case OperationCode::IBitCount: diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp index 9707c353d..cc7181229 100644 --- a/src/video_core/surface.cpp +++ b/src/video_core/surface.cpp @@ -111,6 +111,8 @@ PixelFormat PixelFormatFromRenderTargetFormat(Tegra::RenderTargetFormat format)          return PixelFormat::RGBA16F;      case Tegra::RenderTargetFormat::RGBA16_UNORM:          return PixelFormat::RGBA16U; +    case Tegra::RenderTargetFormat::RGBA16_SNORM: +        return PixelFormat::RGBA16S;      case Tegra::RenderTargetFormat::RGBA16_UINT:          return PixelFormat::RGBA16UI;      case Tegra::RenderTargetFormat::RGBA32_FLOAT: diff --git a/src/video_core/surface.h b/src/video_core/surface.h index d88109e5a..ae8817465 100644 --- a/src/video_core/surface.h +++ b/src/video_core/surface.h @@ -25,82 +25,83 @@ enum class PixelFormat {      R8UI = 7,      RGBA16F = 8,      RGBA16U = 9, -    RGBA16UI = 10, -    R11FG11FB10F = 11, -    RGBA32UI = 12, -    DXT1 = 13, -    DXT23 = 14, -    DXT45 = 15, -    DXN1 = 16, // This is also known as BC4 -    DXN2UNORM = 17, -    DXN2SNORM = 18, -    BC7U = 19, -    BC6H_UF16 = 20, -    BC6H_SF16 = 21, -    ASTC_2D_4X4 = 22, -    BGRA8 = 23, -    RGBA32F = 24, -    RG32F = 25, -    R32F = 26, -    R16F = 27, -    R16U = 28, -    R16S = 29, -    R16UI = 30, -    R16I = 31, -    RG16 = 32, -    RG16F = 33, -    RG16UI = 34, -    RG16I = 35, -    RG16S = 36, -    RGB32F = 37, -    RGBA8_SRGB = 38, -    RG8U = 39, -    RG8S = 40, -    RG32UI = 41, -    RGBX16F = 42, -    R32UI = 43, -    R32I = 44, -    ASTC_2D_8X8 = 45, -    ASTC_2D_8X5 = 46, -    ASTC_2D_5X4 = 47, -    BGRA8_SRGB = 48, -    DXT1_SRGB = 49, -    DXT23_SRGB = 50, -    DXT45_SRGB = 51, -    BC7U_SRGB = 52, -    R4G4B4A4U = 53, -    ASTC_2D_4X4_SRGB = 54, -    ASTC_2D_8X8_SRGB = 55, -    ASTC_2D_8X5_SRGB = 56, -    ASTC_2D_5X4_SRGB = 57, -    ASTC_2D_5X5 = 58, -    ASTC_2D_5X5_SRGB = 59, -    ASTC_2D_10X8 = 60, -    ASTC_2D_10X8_SRGB = 61, -    ASTC_2D_6X6 = 62, -    ASTC_2D_6X6_SRGB = 63, -    ASTC_2D_10X10 = 64, -    ASTC_2D_10X10_SRGB = 65, -    ASTC_2D_12X12 = 66, -    ASTC_2D_12X12_SRGB = 67, -    ASTC_2D_8X6 = 68, -    ASTC_2D_8X6_SRGB = 69, -    ASTC_2D_6X5 = 70, -    ASTC_2D_6X5_SRGB = 71, -    E5B9G9R9F = 72, +    RGBA16S = 10, +    RGBA16UI = 11, +    R11FG11FB10F = 12, +    RGBA32UI = 13, +    DXT1 = 14, +    DXT23 = 15, +    DXT45 = 16, +    DXN1 = 17, // This is also known as BC4 +    DXN2UNORM = 18, +    DXN2SNORM = 19, +    BC7U = 20, +    BC6H_UF16 = 21, +    BC6H_SF16 = 22, +    ASTC_2D_4X4 = 23, +    BGRA8 = 24, +    RGBA32F = 25, +    RG32F = 26, +    R32F = 27, +    R16F = 28, +    R16U = 29, +    R16S = 30, +    R16UI = 31, +    R16I = 32, +    RG16 = 33, +    RG16F = 34, +    RG16UI = 35, +    RG16I = 36, +    RG16S = 37, +    RGB32F = 38, +    RGBA8_SRGB = 39, +    RG8U = 40, +    RG8S = 41, +    RG32UI = 42, +    RGBX16F = 43, +    R32UI = 44, +    R32I = 45, +    ASTC_2D_8X8 = 46, +    ASTC_2D_8X5 = 47, +    ASTC_2D_5X4 = 48, +    BGRA8_SRGB = 49, +    DXT1_SRGB = 50, +    DXT23_SRGB = 51, +    DXT45_SRGB = 52, +    BC7U_SRGB = 53, +    R4G4B4A4U = 54, +    ASTC_2D_4X4_SRGB = 55, +    ASTC_2D_8X8_SRGB = 56, +    ASTC_2D_8X5_SRGB = 57, +    ASTC_2D_5X4_SRGB = 58, +    ASTC_2D_5X5 = 59, +    ASTC_2D_5X5_SRGB = 60, +    ASTC_2D_10X8 = 61, +    ASTC_2D_10X8_SRGB = 62, +    ASTC_2D_6X6 = 63, +    ASTC_2D_6X6_SRGB = 64, +    ASTC_2D_10X10 = 65, +    ASTC_2D_10X10_SRGB = 66, +    ASTC_2D_12X12 = 67, +    ASTC_2D_12X12_SRGB = 68, +    ASTC_2D_8X6 = 69, +    ASTC_2D_8X6_SRGB = 70, +    ASTC_2D_6X5 = 71, +    ASTC_2D_6X5_SRGB = 72, +    E5B9G9R9F = 73,      MaxColorFormat,      // Depth formats -    Z32F = 73, -    Z16 = 74, +    Z32F = 74, +    Z16 = 75,      MaxDepthFormat,      // DepthStencil formats -    Z24S8 = 75, -    S8Z24 = 76, -    Z32FS8 = 77, +    Z24S8 = 76, +    S8Z24 = 77, +    Z32FS8 = 78,      MaxDepthStencilFormat, @@ -138,6 +139,7 @@ constexpr std::array<u32, MaxPixelFormat> compression_factor_shift_table = {{      0, // R8UI      0, // RGBA16F      0, // RGBA16U +    0, // RGBA16S      0, // RGBA16UI      0, // R11FG11FB10F      0, // RGBA32UI @@ -235,6 +237,7 @@ constexpr std::array<u32, MaxPixelFormat> block_width_table = {{      1,  // R8UI      1,  // RGBA16F      1,  // RGBA16U +    1,  // RGBA16S      1,  // RGBA16UI      1,  // R11FG11FB10F      1,  // RGBA32UI @@ -324,6 +327,7 @@ constexpr std::array<u32, MaxPixelFormat> block_height_table = {{      1,  // R8UI      1,  // RGBA16F      1,  // RGBA16U +    1,  // RGBA16S      1,  // RGBA16UI      1,  // R11FG11FB10F      1,  // RGBA32UI @@ -413,6 +417,7 @@ constexpr std::array<u32, MaxPixelFormat> bpp_table = {{      8,   // R8UI      64,  // RGBA16F      64,  // RGBA16U +    64,  // RGBA16S      64,  // RGBA16UI      32,  // R11FG11FB10F      128, // RGBA32UI @@ -517,6 +522,7 @@ constexpr std::array<SurfaceCompression, MaxPixelFormat> compression_type_table      SurfaceCompression::None,       // R8UI      SurfaceCompression::None,       // RGBA16F      SurfaceCompression::None,       // RGBA16U +    SurfaceCompression::None,       // RGBA16S      SurfaceCompression::None,       // RGBA16UI      SurfaceCompression::None,       // R11FG11FB10F      SurfaceCompression::None,       // RGBA32UI diff --git a/src/video_core/texture_cache/format_lookup_table.cpp b/src/video_core/texture_cache/format_lookup_table.cpp index cc3ad8417..e151c26c4 100644 --- a/src/video_core/texture_cache/format_lookup_table.cpp +++ b/src/video_core/texture_cache/format_lookup_table.cpp @@ -41,7 +41,7 @@ struct Table {      ComponentType alpha_component;      bool is_srgb;  }; -constexpr std::array<Table, 75> DefinitionTable = {{ +constexpr std::array<Table, 76> DefinitionTable = {{      {TextureFormat::A8R8G8B8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ABGR8U},      {TextureFormat::A8R8G8B8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::ABGR8S},      {TextureFormat::A8R8G8B8, C, UINT, UINT, UINT, UINT, PixelFormat::ABGR8UI}, @@ -61,6 +61,7 @@ constexpr std::array<Table, 75> DefinitionTable = {{      {TextureFormat::G8R8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::RG8U},      {TextureFormat::G8R8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::RG8S}, +    {TextureFormat::R16_G16_B16_A16, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::RGBA16S},      {TextureFormat::R16_G16_B16_A16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::RGBA16U},      {TextureFormat::R16_G16_B16_A16, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::RGBA16F},      {TextureFormat::R16_G16_B16_A16, C, UINT, UINT, UINT, UINT, PixelFormat::RGBA16UI}, | 
