diff options
38 files changed, 778 insertions, 365 deletions
diff --git a/.ci/templates/build-standard.yml b/.ci/templates/build-standard.yml index 9975f5c49..6cd209dbf 100644 --- a/.ci/templates/build-standard.yml +++ b/.ci/templates/build-standard.yml @@ -3,7 +3,7 @@ jobs:    displayName: 'standard'    pool:      vmImage: ubuntu-latest -  strategy:  +  strategy:      maxParallel: 10      matrix:        windows: diff --git a/.ci/templates/build-testing.yml b/.ci/templates/build-testing.yml index 101e52996..278efb6f5 100644 --- a/.ci/templates/build-testing.yml +++ b/.ci/templates/build-testing.yml @@ -3,7 +3,7 @@ jobs:    displayName: 'testing'    pool:      vmImage: ubuntu-latest -  strategy:  +  strategy:      maxParallel: 10      matrix:        windows: diff --git a/.ci/templates/release.yml b/.ci/templates/release.yml deleted file mode 100644 index 60bebd2aa..000000000 --- a/.ci/templates/release.yml +++ /dev/null @@ -1,29 +0,0 @@ -steps: -  - task: DownloadPipelineArtifact@2 -    displayName: 'Download Windows Release' -    inputs: -      artifactName: 'yuzu-$(BuildName)-windows-mingw' -      buildType: 'current' -      targetPath: '$(Build.ArtifactStagingDirectory)' -  - task: DownloadPipelineArtifact@2 -    displayName: 'Download Linux Release' -    inputs: -      artifactName: 'yuzu-$(BuildName)-linux' -      buildType: 'current' -      targetPath: '$(Build.ArtifactStagingDirectory)' -  - task: DownloadPipelineArtifact@2 -    displayName: 'Download Release Point' -    inputs: -      artifactName: 'yuzu-$(BuildName)-release-point' -      buildType: 'current' -      targetPath: '$(Build.ArtifactStagingDirectory)' -  - script: echo '##vso[task.setvariable variable=tagcommit]' && cat $(Build.ArtifactStagingDirectory)/tag-commit.sha -    displayName: 'Calculate Release Point' -  - task: GitHubRelease@0 -    inputs: -      gitHubConnection: $(GitHubReleaseConnectionName) -      repositoryName: '$(GitHubReleaseRepoName)' -      action: 'create' -      target: $(variables.tagcommit) -      title: 'yuzu $(BuildName) 
#$(Build.BuildId)' -      assets: '$(Build.ArtifactStagingDirectory)/*' @@ -1,7 +1,8 @@ -yuzu emulator +yuzu  emulator  =============  [](https://travis-ci.org/yuzu-emu/yuzu)  [](https://ci.appveyor.com/project/bunnei/yuzu) +[](https://dev.azure.com/yuzu-emu/yuzu/)  yuzu is an experimental open-source emulator for the Nintendo Switch from the creators of [Citra](https://citra-emu.org/). diff --git a/src/core/arm/unicorn/arm_unicorn.cpp b/src/core/arm/unicorn/arm_unicorn.cpp index b0ee7821a..97d5c2a8a 100644 --- a/src/core/arm/unicorn/arm_unicorn.cpp +++ b/src/core/arm/unicorn/arm_unicorn.cpp @@ -50,11 +50,14 @@ static void CodeHook(uc_engine* uc, uint64_t address, uint32_t size, void* user_  static bool UnmappedMemoryHook(uc_engine* uc, uc_mem_type type, u64 addr, int size, u64 value,                                 void* user_data) { +    auto* const system = static_cast<System*>(user_data); +      ARM_Interface::ThreadContext ctx{}; -    Core::CurrentArmInterface().SaveContext(ctx); +    system->CurrentArmInterface().SaveContext(ctx);      ASSERT_MSG(false, "Attempted to read from unmapped memory: 0x{:X}, pc=0x{:X}, lr=0x{:X}", addr,                 ctx.pc, ctx.cpu_registers[30]); -    return {}; + +    return false;  }  ARM_Unicorn::ARM_Unicorn(System& system) : system{system} { @@ -65,7 +68,7 @@ ARM_Unicorn::ARM_Unicorn(System& system) : system{system} {      uc_hook hook{};      CHECKED(uc_hook_add(uc, &hook, UC_HOOK_INTR, (void*)InterruptHook, this, 0, -1)); -    CHECKED(uc_hook_add(uc, &hook, UC_HOOK_MEM_INVALID, (void*)UnmappedMemoryHook, this, 0, -1)); +    CHECKED(uc_hook_add(uc, &hook, UC_HOOK_MEM_INVALID, (void*)UnmappedMemoryHook, &system, 0, -1));      if (GDBStub::IsServerEnabled()) {          CHECKED(uc_hook_add(uc, &hook, UC_HOOK_CODE, (void*)CodeHook, this, 0, -1));          last_bkpt_hit = false; diff --git a/src/core/core.h b/src/core/core.h index 11e73278e..8ebb385ac 100644 --- a/src/core/core.h +++ b/src/core/core.h @@ -327,10 +327,6 @@ 
private:      static System s_instance;  }; -inline ARM_Interface& CurrentArmInterface() { -    return System::GetInstance().CurrentArmInterface(); -} -  inline Kernel::Process* CurrentProcess() {      return System::GetInstance().CurrentProcess();  } diff --git a/src/core/hle/kernel/process.cpp b/src/core/hle/kernel/process.cpp index db3ab14ce..92169a97b 100644 --- a/src/core/hle/kernel/process.cpp +++ b/src/core/hle/kernel/process.cpp @@ -184,19 +184,11 @@ ResultCode Process::LoadFromMetadata(const FileSys::ProgramMetadata& metadata) {  }  void Process::Run(s32 main_thread_priority, u64 stack_size) { -    // The kernel always ensures that the given stack size is page aligned. -    main_thread_stack_size = Common::AlignUp(stack_size, Memory::PAGE_SIZE); - -    // Allocate and map the main thread stack -    // TODO(bunnei): This is heap area that should be allocated by the kernel and not mapped as part -    // of the user address space. -    const VAddr mapping_address = vm_manager.GetTLSIORegionEndAddress() - main_thread_stack_size; -    vm_manager -        .MapMemoryBlock(mapping_address, std::make_shared<std::vector<u8>>(main_thread_stack_size), -                        0, main_thread_stack_size, MemoryState::Stack) -        .Unwrap(); +    AllocateMainThreadStack(stack_size); +    tls_region_address = CreateTLSRegion();      vm_manager.LogLayout(); +      ChangeStatus(ProcessStatus::Running);      SetupMainThread(*this, kernel, main_thread_priority); @@ -226,6 +218,9 @@ void Process::PrepareForTermination() {      stop_threads(system.Scheduler(2).GetThreadList());      stop_threads(system.Scheduler(3).GetThreadList()); +    FreeTLSRegion(tls_region_address); +    tls_region_address = 0; +      ChangeStatus(ProcessStatus::Exited);  } @@ -325,4 +320,16 @@ void Process::ChangeStatus(ProcessStatus new_status) {      WakeupAllWaitingThreads();  } +void Process::AllocateMainThreadStack(u64 stack_size) { +    // The kernel always ensures that the given stack size is 
page aligned. +    main_thread_stack_size = Common::AlignUp(stack_size, Memory::PAGE_SIZE); + +    // Allocate and map the main thread stack +    const VAddr mapping_address = vm_manager.GetTLSIORegionEndAddress() - main_thread_stack_size; +    vm_manager +        .MapMemoryBlock(mapping_address, std::make_shared<std::vector<u8>>(main_thread_stack_size), +                        0, main_thread_stack_size, MemoryState::Stack) +        .Unwrap(); +} +  } // namespace Kernel diff --git a/src/core/hle/kernel/process.h b/src/core/hle/kernel/process.h index 3196014da..c2df451f3 100644 --- a/src/core/hle/kernel/process.h +++ b/src/core/hle/kernel/process.h @@ -135,6 +135,11 @@ public:          return mutex;      } +    /// Gets the address to the process' dedicated TLS region. +    VAddr GetTLSRegionAddress() const { +        return tls_region_address; +    } +      /// Gets the current status of the process      ProcessStatus GetStatus() const {          return status; @@ -296,6 +301,9 @@ private:      /// a process signal.      void ChangeStatus(ProcessStatus new_status); +    /// Allocates the main thread stack for the process, given the stack size in bytes. +    void AllocateMainThreadStack(u64 stack_size); +      /// Memory manager for this process.      Kernel::VMManager vm_manager; @@ -358,6 +366,9 @@ private:      /// variable related facilities.      Mutex mutex; +    /// Address indicating the location of the process' dedicated TLS region. 
+    VAddr tls_region_address = 0; +      /// Random values for svcGetInfo RandomEntropy      std::array<u64, RANDOM_ENTROPY_SIZE> random_entropy{}; diff --git a/src/core/hle/kernel/svc.cpp b/src/core/hle/kernel/svc.cpp index a46eed3da..1fd1a732a 100644 --- a/src/core/hle/kernel/svc.cpp +++ b/src/core/hle/kernel/svc.cpp @@ -843,9 +843,7 @@ static ResultCode GetInfo(Core::System& system, u64* result, u64 info_id, u64 ha              return RESULT_SUCCESS;          case GetInfoType::UserExceptionContextAddr: -            LOG_WARNING(Kernel_SVC, -                        "(STUBBED) Attempted to query user exception context address, returned 0"); -            *result = 0; +            *result = process->GetTLSRegionAddress();              return RESULT_SUCCESS;          case GetInfoType::TotalPhysicalMemoryAvailableWithoutSystemResource: @@ -1739,8 +1737,8 @@ static ResultCode SignalProcessWideKey(Core::System& system, VAddr condition_var  // Wait for an address (via Address Arbiter)  static ResultCode WaitForAddress(Core::System& system, VAddr address, u32 type, s32 value,                                   s64 timeout) { -    LOG_WARNING(Kernel_SVC, "called, address=0x{:X}, type=0x{:X}, value=0x{:X}, timeout={}", -                address, type, value, timeout); +    LOG_TRACE(Kernel_SVC, "called, address=0x{:X}, type=0x{:X}, value=0x{:X}, timeout={}", address, +              type, value, timeout);      // If the passed address is a kernel virtual address, return invalid memory state.      
if (Memory::IsKernelVirtualAddress(address)) { @@ -1762,8 +1760,8 @@ static ResultCode WaitForAddress(Core::System& system, VAddr address, u32 type,  // Signals to an address (via Address Arbiter)  static ResultCode SignalToAddress(Core::System& system, VAddr address, u32 type, s32 value,                                    s32 num_to_wake) { -    LOG_WARNING(Kernel_SVC, "called, address=0x{:X}, type=0x{:X}, value=0x{:X}, num_to_wake=0x{:X}", -                address, type, value, num_to_wake); +    LOG_TRACE(Kernel_SVC, "called, address=0x{:X}, type=0x{:X}, value=0x{:X}, num_to_wake=0x{:X}", +              address, type, value, num_to_wake);      // If the passed address is a kernel virtual address, return invalid memory state.      if (Memory::IsKernelVirtualAddress(address)) { diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp index 3175579cc..bd036cbe8 100644 --- a/src/video_core/dma_pusher.cpp +++ b/src/video_core/dma_pusher.cpp @@ -22,7 +22,7 @@ void DmaPusher::DispatchCalls() {      MICROPROFILE_SCOPE(DispatchCalls);      // On entering GPU code, assume all memory may be touched by the ARM core. 
-    gpu.Maxwell3D().dirty_flags.OnMemoryWrite(); +    gpu.Maxwell3D().dirty.OnMemoryWrite();      dma_pushbuffer_subindex = 0; diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp index 089465a71..08586d33c 100644 --- a/src/video_core/engines/kepler_compute.cpp +++ b/src/video_core/engines/kepler_compute.cpp @@ -37,7 +37,7 @@ void KeplerCompute::CallMethod(const GPU::MethodCall& method_call) {          const bool is_last_call = method_call.IsLastCall();          upload_state.ProcessData(method_call.argument, is_last_call);          if (is_last_call) { -            system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite(); +            system.GPU().Maxwell3D().dirty.OnMemoryWrite();          }          break;      } diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp index 0561f676c..44279de00 100644 --- a/src/video_core/engines/kepler_memory.cpp +++ b/src/video_core/engines/kepler_memory.cpp @@ -34,7 +34,7 @@ void KeplerMemory::CallMethod(const GPU::MethodCall& method_call) {          const bool is_last_call = method_call.IsLastCall();          upload_state.ProcessData(method_call.argument, is_last_call);          if (is_last_call) { -            system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite(); +            system.GPU().Maxwell3D().dirty.OnMemoryWrite();          }          break;      } diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 8755b8af4..fe9fc0278 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -22,6 +22,7 @@ Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& raste                       MemoryManager& memory_manager)      : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager},        macro_interpreter{*this}, upload_state{memory_manager, regs.upload} { +    InitDirtySettings();      InitializeRegisterDefaults();  } @@ -69,6 
+70,10 @@ void Maxwell3D::InitializeRegisterDefaults() {      regs.stencil_back_func_mask = 0xFFFFFFFF;      regs.stencil_back_mask = 0xFFFFFFFF; +    regs.depth_test_func = Regs::ComparisonOp::Always; +    regs.cull.front_face = Regs::Cull::FrontFace::CounterClockWise; +    regs.cull.cull_face = Regs::Cull::CullFace::Back; +      // TODO(Rodrigo): Most games do not set a point size. I think this is a case of a      // register carrying a default value. Assume it's OpenGL's default (1).      regs.point_size = 1.0f; @@ -86,6 +91,159 @@ void Maxwell3D::InitializeRegisterDefaults() {      regs.rt_separate_frag_data = 1;  } +#define DIRTY_REGS_POS(field_name) (offsetof(Maxwell3D::DirtyRegs, field_name)) + +void Maxwell3D::InitDirtySettings() { +    const auto set_block = [this](const u32 start, const u32 range, const u8 position) { +        const auto start_itr = dirty_pointers.begin() + start; +        const auto end_itr = start_itr + range; +        std::fill(start_itr, end_itr, position); +    }; +    dirty.regs.fill(true); + +    // Init Render Targets +    constexpr u32 registers_per_rt = sizeof(regs.rt[0]) / sizeof(u32); +    constexpr u32 rt_start_reg = MAXWELL3D_REG_INDEX(rt); +    constexpr u32 rt_end_reg = rt_start_reg + registers_per_rt * 8; +    u32 rt_dirty_reg = DIRTY_REGS_POS(render_target); +    for (u32 rt_reg = rt_start_reg; rt_reg < rt_end_reg; rt_reg += registers_per_rt) { +        set_block(rt_reg, registers_per_rt, rt_dirty_reg); +        rt_dirty_reg++; +    } +    constexpr u32 depth_buffer_flag = DIRTY_REGS_POS(depth_buffer); +    dirty_pointers[MAXWELL3D_REG_INDEX(zeta_enable)] = depth_buffer_flag; +    dirty_pointers[MAXWELL3D_REG_INDEX(zeta_width)] = depth_buffer_flag; +    dirty_pointers[MAXWELL3D_REG_INDEX(zeta_height)] = depth_buffer_flag; +    constexpr u32 registers_in_zeta = sizeof(regs.zeta) / sizeof(u32); +    constexpr u32 zeta_reg = MAXWELL3D_REG_INDEX(zeta); +    set_block(zeta_reg, registers_in_zeta, depth_buffer_flag); + +    // 
Init Vertex Arrays +    constexpr u32 vertex_array_start = MAXWELL3D_REG_INDEX(vertex_array); +    constexpr u32 vertex_array_size = sizeof(regs.vertex_array[0]) / sizeof(u32); +    constexpr u32 vertex_array_end = vertex_array_start + vertex_array_size * Regs::NumVertexArrays; +    u32 va_reg = DIRTY_REGS_POS(vertex_array); +    u32 vi_reg = DIRTY_REGS_POS(vertex_instance); +    for (u32 vertex_reg = vertex_array_start; vertex_reg < vertex_array_end; +         vertex_reg += vertex_array_size) { +        set_block(vertex_reg, 3, va_reg); +        // The divisor concerns vertex array instances +        dirty_pointers[vertex_reg + 3] = vi_reg; +        va_reg++; +        vi_reg++; +    } +    constexpr u32 vertex_limit_start = MAXWELL3D_REG_INDEX(vertex_array_limit); +    constexpr u32 vertex_limit_size = sizeof(regs.vertex_array_limit[0]) / sizeof(u32); +    constexpr u32 vertex_limit_end = vertex_limit_start + vertex_limit_size * Regs::NumVertexArrays; +    va_reg = DIRTY_REGS_POS(vertex_array); +    for (u32 vertex_reg = vertex_limit_start; vertex_reg < vertex_limit_end; +         vertex_reg += vertex_limit_size) { +        set_block(vertex_reg, vertex_limit_size, va_reg); +        va_reg++; +    } +    constexpr u32 vertex_instance_start = MAXWELL3D_REG_INDEX(instanced_arrays); +    constexpr u32 vertex_instance_size = +        sizeof(regs.instanced_arrays.is_instanced[0]) / sizeof(u32); +    constexpr u32 vertex_instance_end = +        vertex_instance_start + vertex_instance_size * Regs::NumVertexArrays; +    vi_reg = DIRTY_REGS_POS(vertex_instance); +    for (u32 vertex_reg = vertex_instance_start; vertex_reg < vertex_instance_end; +         vertex_reg += vertex_instance_size) { +        set_block(vertex_reg, vertex_instance_size, vi_reg); +        vi_reg++; +    } +    set_block(MAXWELL3D_REG_INDEX(vertex_attrib_format), regs.vertex_attrib_format.size(), +              DIRTY_REGS_POS(vertex_attrib_format)); + +    // Init Shaders +    constexpr u32 
shader_registers_count = +        sizeof(regs.shader_config[0]) * Regs::MaxShaderProgram / sizeof(u32); +    set_block(MAXWELL3D_REG_INDEX(shader_config[0]), shader_registers_count, +              DIRTY_REGS_POS(shaders)); + +    // State + +    // Viewport +    constexpr u32 viewport_dirty_reg = DIRTY_REGS_POS(viewport); +    constexpr u32 viewport_start = MAXWELL3D_REG_INDEX(viewports); +    constexpr u32 viewport_size = sizeof(regs.viewports) / sizeof(u32); +    set_block(viewport_start, viewport_size, viewport_dirty_reg); +    constexpr u32 view_volume_start = MAXWELL3D_REG_INDEX(view_volume_clip_control); +    constexpr u32 view_volume_size = sizeof(regs.view_volume_clip_control) / sizeof(u32); +    set_block(view_volume_start, view_volume_size, viewport_dirty_reg); + +    // Viewport transformation +    constexpr u32 viewport_trans_start = MAXWELL3D_REG_INDEX(viewport_transform); +    constexpr u32 viewport_trans_size = sizeof(regs.viewport_transform) / sizeof(u32); +    set_block(viewport_trans_start, viewport_trans_size, DIRTY_REGS_POS(viewport_transform)); + +    // Cullmode +    constexpr u32 cull_mode_start = MAXWELL3D_REG_INDEX(cull); +    constexpr u32 cull_mode_size = sizeof(regs.cull) / sizeof(u32); +    set_block(cull_mode_start, cull_mode_size, DIRTY_REGS_POS(cull_mode)); + +    // Screen y control +    dirty_pointers[MAXWELL3D_REG_INDEX(screen_y_control)] = DIRTY_REGS_POS(screen_y_control); + +    // Primitive Restart +    constexpr u32 primitive_restart_start = MAXWELL3D_REG_INDEX(primitive_restart); +    constexpr u32 primitive_restart_size = sizeof(regs.primitive_restart) / sizeof(u32); +    set_block(primitive_restart_start, primitive_restart_size, DIRTY_REGS_POS(primitive_restart)); + +    // Depth Test +    constexpr u32 depth_test_dirty_reg = DIRTY_REGS_POS(depth_test); +    dirty_pointers[MAXWELL3D_REG_INDEX(depth_test_enable)] = depth_test_dirty_reg; +    dirty_pointers[MAXWELL3D_REG_INDEX(depth_write_enabled)] = depth_test_dirty_reg; +   
 dirty_pointers[MAXWELL3D_REG_INDEX(depth_test_func)] = depth_test_dirty_reg; + +    // Stencil Test +    constexpr u32 stencil_test_dirty_reg = DIRTY_REGS_POS(stencil_test); +    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_enable)] = stencil_test_dirty_reg; +    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_func_func)] = stencil_test_dirty_reg; +    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_func_ref)] = stencil_test_dirty_reg; +    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_func_mask)] = stencil_test_dirty_reg; +    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_op_fail)] = stencil_test_dirty_reg; +    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_op_zfail)] = stencil_test_dirty_reg; +    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_op_zpass)] = stencil_test_dirty_reg; +    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_mask)] = stencil_test_dirty_reg; +    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_two_side_enable)] = stencil_test_dirty_reg; +    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_func_func)] = stencil_test_dirty_reg; +    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_func_ref)] = stencil_test_dirty_reg; +    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_func_mask)] = stencil_test_dirty_reg; +    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_op_fail)] = stencil_test_dirty_reg; +    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_op_zfail)] = stencil_test_dirty_reg; +    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_op_zpass)] = stencil_test_dirty_reg; +    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_mask)] = stencil_test_dirty_reg; + +    // Color Mask +    constexpr u32 color_mask_dirty_reg = DIRTY_REGS_POS(color_mask); +    dirty_pointers[MAXWELL3D_REG_INDEX(color_mask_common)] = color_mask_dirty_reg; +    set_block(MAXWELL3D_REG_INDEX(color_mask), sizeof(regs.color_mask) / sizeof(u32), +              color_mask_dirty_reg); +    // Blend State +    constexpr u32 blend_state_dirty_reg = 
DIRTY_REGS_POS(blend_state); +    set_block(MAXWELL3D_REG_INDEX(blend_color), sizeof(regs.blend_color) / sizeof(u32), +              blend_state_dirty_reg); +    dirty_pointers[MAXWELL3D_REG_INDEX(independent_blend_enable)] = blend_state_dirty_reg; +    set_block(MAXWELL3D_REG_INDEX(blend), sizeof(regs.blend) / sizeof(u32), blend_state_dirty_reg); +    set_block(MAXWELL3D_REG_INDEX(independent_blend), sizeof(regs.independent_blend) / sizeof(u32), +              blend_state_dirty_reg); + +    // Scissor State +    constexpr u32 scissor_test_dirty_reg = DIRTY_REGS_POS(scissor_test); +    set_block(MAXWELL3D_REG_INDEX(scissor_test), sizeof(regs.scissor_test) / sizeof(u32), +              scissor_test_dirty_reg); + +    // Polygon Offset +    constexpr u32 polygon_offset_dirty_reg = DIRTY_REGS_POS(polygon_offset); +    dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_fill_enable)] = polygon_offset_dirty_reg; +    dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_line_enable)] = polygon_offset_dirty_reg; +    dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_point_enable)] = polygon_offset_dirty_reg; +    dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_units)] = polygon_offset_dirty_reg; +    dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_factor)] = polygon_offset_dirty_reg; +    dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_clamp)] = polygon_offset_dirty_reg; +} +  void Maxwell3D::CallMacroMethod(u32 method, std::vector<u32> parameters) {      // Reset the current macro.      
executing_macro = 0; @@ -108,6 +266,14 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {      const u32 method = method_call.method; +    if (method == cb_data_state.current) { +        regs.reg_array[method] = method_call.argument; +        ProcessCBData(method_call.argument); +        return; +    } else if (cb_data_state.current != null_cb_data) { +        FinishCBData(); +    } +      // It is an error to write to a register other than the current macro's ARG register before it      // has finished execution.      if (executing_macro != 0) { @@ -143,49 +309,19 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {      if (regs.reg_array[method] != method_call.argument) {          regs.reg_array[method] = method_call.argument; -        // Color buffers -        constexpr u32 first_rt_reg = MAXWELL3D_REG_INDEX(rt); -        constexpr u32 registers_per_rt = sizeof(regs.rt[0]) / sizeof(u32); -        if (method >= first_rt_reg && -            method < first_rt_reg + registers_per_rt * Regs::NumRenderTargets) { -            const std::size_t rt_index = (method - first_rt_reg) / registers_per_rt; -            dirty_flags.color_buffer.set(rt_index); -        } - -        // Zeta buffer -        constexpr u32 registers_in_zeta = sizeof(regs.zeta) / sizeof(u32); -        if (method == MAXWELL3D_REG_INDEX(zeta_enable) || -            method == MAXWELL3D_REG_INDEX(zeta_width) || -            method == MAXWELL3D_REG_INDEX(zeta_height) || -            (method >= MAXWELL3D_REG_INDEX(zeta) && -             method < MAXWELL3D_REG_INDEX(zeta) + registers_in_zeta)) { -            dirty_flags.zeta_buffer = true; -        } - -        // Shader -        constexpr u32 shader_registers_count = -            sizeof(regs.shader_config[0]) * Regs::MaxShaderProgram / sizeof(u32); -        if (method >= MAXWELL3D_REG_INDEX(shader_config[0]) && -            method < MAXWELL3D_REG_INDEX(shader_config[0]) + shader_registers_count) { -            
dirty_flags.shaders = true; -        } - -        // Vertex format -        if (method >= MAXWELL3D_REG_INDEX(vertex_attrib_format) && -            method < MAXWELL3D_REG_INDEX(vertex_attrib_format) + regs.vertex_attrib_format.size()) { -            dirty_flags.vertex_attrib_format = true; -        } - -        // Vertex buffer -        if (method >= MAXWELL3D_REG_INDEX(vertex_array) && -            method < MAXWELL3D_REG_INDEX(vertex_array) + 4 * Regs::NumVertexArrays) { -            dirty_flags.vertex_array.set((method - MAXWELL3D_REG_INDEX(vertex_array)) >> 2); -        } else if (method >= MAXWELL3D_REG_INDEX(vertex_array_limit) && -                   method < MAXWELL3D_REG_INDEX(vertex_array_limit) + 2 * Regs::NumVertexArrays) { -            dirty_flags.vertex_array.set((method - MAXWELL3D_REG_INDEX(vertex_array_limit)) >> 1); -        } else if (method >= MAXWELL3D_REG_INDEX(instanced_arrays) && -                   method < MAXWELL3D_REG_INDEX(instanced_arrays) + Regs::NumVertexArrays) { -            dirty_flags.vertex_array.set(method - MAXWELL3D_REG_INDEX(instanced_arrays)); +        const std::size_t dirty_reg = dirty_pointers[method]; +        if (dirty_reg) { +            dirty.regs[dirty_reg] = true; +            if (dirty_reg >= DIRTY_REGS_POS(vertex_array) && +                dirty_reg < DIRTY_REGS_POS(vertex_array_buffers)) { +                dirty.vertex_array_buffers = true; +            } else if (dirty_reg >= DIRTY_REGS_POS(vertex_instance) && +                       dirty_reg < DIRTY_REGS_POS(vertex_instances)) { +                dirty.vertex_instances = true; +            } else if (dirty_reg >= DIRTY_REGS_POS(render_target) && +                       dirty_reg < DIRTY_REGS_POS(render_settings)) { +                dirty.render_settings = true; +            }          }      } @@ -214,7 +350,7 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {      case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]):      case 
MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]):      case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]): { -        ProcessCBData(method_call.argument); +        StartCBData(method);          break;      }      case MAXWELL3D_REG_INDEX(cb_bind[0].raw_config): { @@ -261,7 +397,7 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {          const bool is_last_call = method_call.IsLastCall();          upload_state.ProcessData(method_call.argument, is_last_call);          if (is_last_call) { -            dirty_flags.OnMemoryWrite(); +            dirty.OnMemoryWrite();          }          break;      } @@ -333,7 +469,6 @@ void Maxwell3D::ProcessQueryGet() {              query_result.timestamp = system.CoreTiming().GetTicks();              memory_manager.WriteBlock(sequence_address, &query_result, sizeof(query_result));          } -        dirty_flags.OnMemoryWrite();          break;      }      default: @@ -405,23 +540,39 @@ void Maxwell3D::ProcessCBBind(Regs::ShaderStage stage) {  }  void Maxwell3D::ProcessCBData(u32 value) { +    const u32 id = cb_data_state.id; +    cb_data_state.buffer[id][cb_data_state.counter] = value; +    // Increment the current buffer position. +    regs.const_buffer.cb_pos = regs.const_buffer.cb_pos + 4; +    cb_data_state.counter++; +} + +void Maxwell3D::StartCBData(u32 method) { +    constexpr u32 first_cb_data = MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]); +    cb_data_state.start_pos = regs.const_buffer.cb_pos; +    cb_data_state.id = method - first_cb_data; +    cb_data_state.current = method; +    cb_data_state.counter = 0; +    ProcessCBData(regs.const_buffer.cb_data[cb_data_state.id]); +} + +void Maxwell3D::FinishCBData() {      // Write the input value to the current const buffer at the current position.      const GPUVAddr buffer_address = regs.const_buffer.BufferAddress();      ASSERT(buffer_address != 0);      // Don't allow writing past the end of the buffer. 
-    ASSERT(regs.const_buffer.cb_pos + sizeof(u32) <= regs.const_buffer.cb_size); +    ASSERT(regs.const_buffer.cb_pos <= regs.const_buffer.cb_size); -    const GPUVAddr address{buffer_address + regs.const_buffer.cb_pos}; +    const GPUVAddr address{buffer_address + cb_data_state.start_pos}; +    const std::size_t size = regs.const_buffer.cb_pos - cb_data_state.start_pos; -    u8* ptr{memory_manager.GetPointer(address)}; -    rasterizer.InvalidateRegion(ToCacheAddr(ptr), sizeof(u32)); -    memory_manager.Write<u32>(address, value); +    const u32 id = cb_data_state.id; +    memory_manager.WriteBlock(address, cb_data_state.buffer[id].data(), size); +    dirty.OnMemoryWrite(); -    dirty_flags.OnMemoryWrite(); - -    // Increment the current buffer position. -    regs.const_buffer.cb_pos = regs.const_buffer.cb_pos + 4; +    cb_data_state.id = null_cb_data; +    cb_data_state.current = null_cb_data;  }  Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const { diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 8d15c8a48..ac300bf76 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -1124,23 +1124,77 @@ public:      State state{}; -    struct DirtyFlags { -        std::bitset<8> color_buffer{0xFF}; -        std::bitset<32> vertex_array{0xFFFFFFFF}; +    struct DirtyRegs { +        static constexpr std::size_t NUM_REGS = 256; +        union { +            struct { +                bool null_dirty; + +                // Vertex Attributes +                bool vertex_attrib_format; + +                // Vertex Arrays +                std::array<bool, 32> vertex_array; + +                bool vertex_array_buffers; + +                // Vertex Instances +                std::array<bool, 32> vertex_instance; + +                bool vertex_instances; + +                // Render Targets +                std::array<bool, 8> render_target; +                bool depth_buffer; + +            
    bool render_settings; + +                // Shaders +                bool shaders; + +                // Rasterizer State +                bool viewport; +                bool clip_coefficient; +                bool cull_mode; +                bool primitive_restart; +                bool depth_test; +                bool stencil_test; +                bool blend_state; +                bool scissor_test; +                bool transform_feedback; +                bool color_mask; +                bool polygon_offset; -        bool vertex_attrib_format = true; -        bool zeta_buffer = true; -        bool shaders = true; +                // Complementary +                bool viewport_transform; +                bool screen_y_control; + +                bool memory_general; +            }; +            std::array<bool, NUM_REGS> regs; +        }; + +        void ResetVertexArrays() { +            vertex_array.fill(true); +            vertex_array_buffers = true; +        } + +        void ResetRenderTargets() { +            depth_buffer = true; +            render_target.fill(true); +            render_settings = true; +        }          void OnMemoryWrite() { -            zeta_buffer = true;              shaders = true; -            color_buffer.set(); -            vertex_array.set(); +            memory_general = true; +            ResetRenderTargets(); +            ResetVertexArrays();          } -    }; -    DirtyFlags dirty_flags; +    } dirty{}; + +    std::array<u8, Regs::NUM_REGS> dirty_pointers{};      /// Reads a register value located at the input method address      u32 GetRegisterValue(u32 method) const; @@ -1192,6 +1246,15 @@ private:      /// Interpreter for the macro codes uploaded to the GPU.      
MacroInterpreter macro_interpreter; +    static constexpr u32 null_cb_data = 0xFFFFFFFF; +    struct { +        std::array<std::array<u32, 0x4000>, 16> buffer; +        u32 current{null_cb_data}; +        u32 id{null_cb_data}; +        u32 start_pos{}; +        u32 counter{}; +    } cb_data_state; +      Upload::State upload_state;      /// Retrieves information about a specific TIC entry from the TIC buffer. @@ -1200,6 +1263,8 @@ private:      /// Retrieves information about a specific TSC entry from the TSC buffer.      Texture::TSCEntry GetTSCEntry(u32 tsc_index) const; +    void InitDirtySettings(); +      /**       * Call a macro on this engine.       * @param method Method to call @@ -1223,7 +1288,9 @@ private:      void ProcessSyncPoint();      /// Handles a write to the CB_DATA[i] register. +    void StartCBData(u32 method);      void ProcessCBData(u32 value); +    void FinishCBData();      /// Handles a write to the CB_BIND register.      void ProcessCBBind(Regs::ShaderStage stage); diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index afb9578d0..b5f57e534 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -58,7 +58,7 @@ void MaxwellDMA::HandleCopy() {      }      // All copies here update the main memory, so mark all rasterizer states as invalid. 
-    system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite(); +    system.GPU().Maxwell3D().dirty.OnMemoryWrite();      if (regs.exec.is_dst_linear && regs.exec.is_src_linear) {          // When the enable_2d bit is disabled, the copy is performed as if we were copying a 1D diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index 79d469b88..8520a0143 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -931,8 +931,6 @@ union Instruction {      } csetp;      union { -        BitField<35, 4, PredCondition> cond; -        BitField<49, 1, u64> h_and;          BitField<6, 1, u64> ftz;          BitField<45, 2, PredOperation> op;          BitField<3, 3, u64> pred3; @@ -940,9 +938,21 @@ union Instruction {          BitField<43, 1, u64> negate_a;          BitField<44, 1, u64> abs_a;          BitField<47, 2, HalfType> type_a; -        BitField<31, 1, u64> negate_b; -        BitField<30, 1, u64> abs_b; -        BitField<28, 2, HalfType> type_b; +        union { +            BitField<35, 4, PredCondition> cond; +            BitField<49, 1, u64> h_and; +            BitField<31, 1, u64> negate_b; +            BitField<30, 1, u64> abs_b; +            BitField<28, 2, HalfType> type_b; +        } reg; +        union { +            BitField<56, 1, u64> negate_b; +            BitField<54, 1, u64> abs_b; +        } cbuf; +        union { +            BitField<49, 4, PredCondition> cond; +            BitField<53, 1, u64> h_and; +        } cbuf_and_imm;          BitField<42, 1, u64> neg_pred;          BitField<39, 3, u64> pred39;      } hsetp2; @@ -1548,7 +1558,9 @@ public:          HFMA2_RC,          HFMA2_RR,          HFMA2_IMM_R, +        HSETP2_C,          HSETP2_R, +        HSETP2_IMM,          HSET2_R,          POPC_C,          POPC_R, @@ -1831,7 +1843,9 @@ private:              INST("01100---1-------", Id::HFMA2_RC, Type::Hfma2, "HFMA2_RC"),              INST("0101110100000---", 
Id::HFMA2_RR, Type::Hfma2, "HFMA2_RR"),              INST("01110---0-------", Id::HFMA2_IMM_R, Type::Hfma2, "HFMA2_R_IMM"), -            INST("0101110100100---", Id::HSETP2_R, Type::HalfSetPredicate, "HSETP_R"), +            INST("0111111-1-------", Id::HSETP2_C, Type::HalfSetPredicate, "HSETP2_C"), +            INST("0101110100100---", Id::HSETP2_R, Type::HalfSetPredicate, "HSETP2_R"), +            INST("0111111-0-------", Id::HSETP2_IMM, Type::HalfSetPredicate, "HSETP2_IMM"),              INST("0101110100011---", Id::HSET2_R, Type::HalfSet, "HSET2_R"),              INST("0101000010000---", Id::MUFU, Type::Arithmetic, "MUFU"),              INST("0100110010010---", Id::RRO_C, Type::Arithmetic, "RRO_C"), diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 9cd4cf7b8..c59e687b6 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -107,6 +107,7 @@ RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWind      shader_program_manager = std::make_unique<GLShader::ProgramManager>();      state.draw.shader_program = 0;      state.Apply(); +    clear_framebuffer.Create();      LOG_DEBUG(Render_OpenGL, "Sync fixed function OpenGL state here");      CheckExtensions(); @@ -126,10 +127,10 @@ GLuint RasterizerOpenGL::SetupVertexFormat() {      auto& gpu = system.GPU().Maxwell3D();      const auto& regs = gpu.regs; -    if (!gpu.dirty_flags.vertex_attrib_format) { +    if (!gpu.dirty.vertex_attrib_format) {          return state.draw.vertex_array;      } -    gpu.dirty_flags.vertex_attrib_format = false; +    gpu.dirty.vertex_attrib_format = false;      MICROPROFILE_SCOPE(OpenGL_VAO); @@ -183,7 +184,7 @@ GLuint RasterizerOpenGL::SetupVertexFormat() {      }      // Rebinding the VAO invalidates the vertex buffer bindings. 
-    gpu.dirty_flags.vertex_array.set(); +    gpu.dirty.ResetVertexArrays();      state.draw.vertex_array = vao_entry.handle;      return vao_entry.handle; @@ -191,17 +192,20 @@ GLuint RasterizerOpenGL::SetupVertexFormat() {  void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) {      auto& gpu = system.GPU().Maxwell3D(); -    const auto& regs = gpu.regs; - -    if (gpu.dirty_flags.vertex_array.none()) +    if (!gpu.dirty.vertex_array_buffers)          return; +    gpu.dirty.vertex_array_buffers = false; + +    const auto& regs = gpu.regs;      MICROPROFILE_SCOPE(OpenGL_VB);      // Upload all guest vertex arrays sequentially to our buffer      for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) { -        if (!gpu.dirty_flags.vertex_array[index]) +        if (!gpu.dirty.vertex_array[index])              continue; +        gpu.dirty.vertex_array[index] = false; +        gpu.dirty.vertex_instance[index] = false;          const auto& vertex_array = regs.vertex_array[index];          if (!vertex_array.IsEnabled()) @@ -226,8 +230,32 @@ void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) {              glVertexArrayBindingDivisor(vao, index, 0);          }      } +} + +void RasterizerOpenGL::SetupVertexInstances(GLuint vao) { +    auto& gpu = system.GPU().Maxwell3D(); + +    if (!gpu.dirty.vertex_instances) +        return; +    gpu.dirty.vertex_instances = false; -    gpu.dirty_flags.vertex_array.reset(); +    const auto& regs = gpu.regs; +    // Upload all guest vertex arrays sequentially to our buffer +    for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) { +        if (!gpu.dirty.vertex_instance[index]) +            continue; + +        gpu.dirty.vertex_instance[index] = false; + +        if (regs.instanced_arrays.IsInstancingEnabled(index) && +            regs.vertex_array[index].divisor != 0) { +            // Enable vertex buffer instancing with the specified divisor. 
+            glVertexArrayBindingDivisor(vao, index, regs.vertex_array[index].divisor); +        } else { +            // Disable the vertex buffer instancing. +            glVertexArrayBindingDivisor(vao, index, 0); +        } +    }  }  GLintptr RasterizerOpenGL::SetupIndexBuffer() { @@ -343,7 +371,7 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {      SyncClipEnabled(clip_distances); -    gpu.dirty_flags.shaders = false; +    gpu.dirty.shaders = false;  }  std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const { @@ -426,13 +454,13 @@ std::pair<bool, bool> RasterizerOpenGL::ConfigureFramebuffers(      const FramebufferConfigState fb_config_state{using_color_fb, using_depth_fb, preserve_contents,                                                   single_color_target}; -    if (fb_config_state == current_framebuffer_config_state && -        gpu.dirty_flags.color_buffer.none() && !gpu.dirty_flags.zeta_buffer) { +    if (fb_config_state == current_framebuffer_config_state && !gpu.dirty.render_settings) {          // Only skip if the previous ConfigureFramebuffers call was from the same kind (multiple or          // single color targets). 
This is done because the guest registers may not change but the          // host framebuffer may contain different attachments          return current_depth_stencil_usage;      } +    gpu.dirty.render_settings = false;      current_framebuffer_config_state = fb_config_state;      texture_cache.GuardRenderTargets(true); @@ -521,13 +549,65 @@ std::pair<bool, bool> RasterizerOpenGL::ConfigureFramebuffers(      return current_depth_stencil_usage = {static_cast<bool>(depth_surface), fbkey.stencil_enable};  } +void RasterizerOpenGL::ConfigureClearFramebuffer(OpenGLState& current_state, bool using_color_fb, +                                                 bool using_depth_fb, bool using_stencil_fb) { +    auto& gpu = system.GPU().Maxwell3D(); +    const auto& regs = gpu.regs; + +    texture_cache.GuardRenderTargets(true); +    View color_surface{}; +    if (using_color_fb) { +        color_surface = texture_cache.GetColorBufferSurface(regs.clear_buffers.RT, false); +    } +    View depth_surface{}; +    if (using_depth_fb || using_stencil_fb) { +        depth_surface = texture_cache.GetDepthBufferSurface(false); +    } +    texture_cache.GuardRenderTargets(false); + +    current_state.draw.draw_framebuffer = clear_framebuffer.handle; +    current_state.ApplyFramebufferState(); + +    if (color_surface) { +        color_surface->Attach(GL_COLOR_ATTACHMENT0, GL_DRAW_FRAMEBUFFER); +    } else { +        glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0); +    } + +    if (depth_surface) { +        const auto& params = depth_surface->GetSurfaceParams(); +        switch (params.type) { +        case VideoCore::Surface::SurfaceType::Depth: { +            depth_surface->Attach(GL_DEPTH_ATTACHMENT, GL_DRAW_FRAMEBUFFER); +            glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0); +            break; +        } +        case VideoCore::Surface::SurfaceType::DepthStencil: { +            
depth_surface->Attach(GL_DEPTH_ATTACHMENT, GL_DRAW_FRAMEBUFFER); +            break; +        } +        default: { UNIMPLEMENTED(); } +        } +    } else { +        glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, +                               0); +    } +} +  void RasterizerOpenGL::Clear() {      const auto& regs = system.GPU().Maxwell3D().regs;      bool use_color{};      bool use_depth{};      bool use_stencil{}; -    OpenGLState clear_state; +    OpenGLState prev_state{OpenGLState::GetCurState()}; +    SCOPE_EXIT({ +        prev_state.AllDirty(); +        prev_state.Apply(); +    }); + +    OpenGLState clear_state{OpenGLState::GetCurState()}; +    clear_state.SetDefaultViewports();      if (regs.clear_buffers.R || regs.clear_buffers.G || regs.clear_buffers.B ||          regs.clear_buffers.A) {          use_color = true; @@ -547,6 +627,7 @@ void RasterizerOpenGL::Clear() {          // true.          clear_state.depth.test_enabled = true;          clear_state.depth.test_func = GL_ALWAYS; +        clear_state.depth.write_mask = GL_TRUE;      }      if (regs.clear_buffers.S) {          ASSERT_MSG(regs.zeta_enable != 0, "Tried to clear stencil but buffer is not enabled!"); @@ -583,8 +664,9 @@ void RasterizerOpenGL::Clear() {          return;      } -    const auto [clear_depth, clear_stencil] = ConfigureFramebuffers( -        clear_state, use_color, use_depth || use_stencil, false, regs.clear_buffers.RT.Value()); +    ConfigureClearFramebuffer(clear_state, use_color, use_depth, use_stencil); + +    SyncViewport(clear_state);      if (regs.clear_flags.scissor) {          SyncScissorTest(clear_state);      } @@ -593,21 +675,18 @@ void RasterizerOpenGL::Clear() {          clear_state.EmulateViewportWithScissor();      } -    clear_state.ApplyColorMask(); -    clear_state.ApplyDepth(); -    clear_state.ApplyStencilTest(); -    clear_state.ApplyViewport(); -    clear_state.ApplyFramebufferState(); +    
clear_state.AllDirty(); +    clear_state.Apply();      if (use_color) { -        glClearBufferfv(GL_COLOR, regs.clear_buffers.RT, regs.clear_color); +        glClearBufferfv(GL_COLOR, 0, regs.clear_color);      } -    if (clear_depth && clear_stencil) { +    if (use_depth && use_stencil) {          glClearBufferfi(GL_DEPTH_STENCIL, 0, regs.clear_depth, regs.clear_stencil); -    } else if (clear_depth) { +    } else if (use_depth) {          glClearBufferfv(GL_DEPTH, 0, &regs.clear_depth); -    } else if (clear_stencil) { +    } else if (use_stencil) {          glClearBufferiv(GL_STENCIL, 0, &regs.clear_stencil);      }  } @@ -663,6 +742,7 @@ void RasterizerOpenGL::DrawArrays() {      // Upload vertex and index data.      SetupVertexBuffer(vao); +    SetupVertexInstances(vao);      const GLintptr index_buffer_offset = SetupIndexBuffer();      // Setup draw parameters. It will automatically choose what glDraw* method to use. @@ -689,7 +769,7 @@ void RasterizerOpenGL::DrawArrays() {      if (invalidate) {          // As all cached buffers are invalidated, we need to recheck their state. 
-        gpu.dirty_flags.vertex_array.set(); +        gpu.dirty.ResetVertexArrays();      }      shader_program_manager->ApplyTo(state); @@ -702,6 +782,7 @@ void RasterizerOpenGL::DrawArrays() {      params.DispatchDraw();      accelerate_draw = AccelDraw::Disabled; +    gpu.dirty.memory_general = false;  }  void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { @@ -976,10 +1057,11 @@ void RasterizerOpenGL::SyncClipCoef() {  }  void RasterizerOpenGL::SyncCullMode() { -    const auto& regs = system.GPU().Maxwell3D().regs; +    auto& maxwell3d = system.GPU().Maxwell3D(); -    state.cull.enabled = regs.cull.enabled != 0; +    const auto& regs = maxwell3d.regs; +    state.cull.enabled = regs.cull.enabled != 0;      if (state.cull.enabled) {          state.cull.front_face = MaxwellToGL::FrontFace(regs.cull.front_face);          state.cull.mode = MaxwellToGL::CullFace(regs.cull.cull_face); @@ -1012,16 +1094,21 @@ void RasterizerOpenGL::SyncDepthTestState() {      state.depth.test_enabled = regs.depth_test_enable != 0;      state.depth.write_mask = regs.depth_write_enabled ? 
GL_TRUE : GL_FALSE; -    if (!state.depth.test_enabled) +    if (!state.depth.test_enabled) {          return; +    }      state.depth.test_func = MaxwellToGL::ComparisonOp(regs.depth_test_func);  }  void RasterizerOpenGL::SyncStencilTestState() { -    const auto& regs = system.GPU().Maxwell3D().regs; -    state.stencil.test_enabled = regs.stencil_enable != 0; +    auto& maxwell3d = system.GPU().Maxwell3D(); +    if (!maxwell3d.dirty.stencil_test) { +        return; +    } +    const auto& regs = maxwell3d.regs; +    state.stencil.test_enabled = regs.stencil_enable != 0;      if (!regs.stencil_enable) {          return;      } @@ -1050,10 +1137,17 @@ void RasterizerOpenGL::SyncStencilTestState() {          state.stencil.back.action_depth_fail = GL_KEEP;          state.stencil.back.action_depth_pass = GL_KEEP;      } +    state.MarkDirtyStencilState(); +    maxwell3d.dirty.stencil_test = false;  }  void RasterizerOpenGL::SyncColorMask() { -    const auto& regs = system.GPU().Maxwell3D().regs; +    auto& maxwell3d = system.GPU().Maxwell3D(); +    if (!maxwell3d.dirty.color_mask) { +        return; +    } +    const auto& regs = maxwell3d.regs; +      const std::size_t count =          regs.independent_blend_enable ? Tegra::Engines::Maxwell3D::Regs::NumRenderTargets : 1;      for (std::size_t i = 0; i < count; i++) { @@ -1064,6 +1158,9 @@ void RasterizerOpenGL::SyncColorMask() {          dest.blue_enabled = (source.B == 0) ? GL_FALSE : GL_TRUE;          dest.alpha_enabled = (source.A == 0) ? 
GL_FALSE : GL_TRUE;      } + +    state.MarkDirtyColorMask(); +    maxwell3d.dirty.color_mask = false;  }  void RasterizerOpenGL::SyncMultiSampleState() { @@ -1078,7 +1175,11 @@ void RasterizerOpenGL::SyncFragmentColorClampState() {  }  void RasterizerOpenGL::SyncBlendState() { -    const auto& regs = system.GPU().Maxwell3D().regs; +    auto& maxwell3d = system.GPU().Maxwell3D(); +    if (!maxwell3d.dirty.blend_state) { +        return; +    } +    const auto& regs = maxwell3d.regs;      state.blend_color.red = regs.blend_color.r;      state.blend_color.green = regs.blend_color.g; @@ -1101,6 +1202,8 @@ void RasterizerOpenGL::SyncBlendState() {          for (std::size_t i = 1; i < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; i++) {              state.blend[i].enabled = false;          } +        maxwell3d.dirty.blend_state = false; +        state.MarkDirtyBlendState();          return;      } @@ -1117,6 +1220,9 @@ void RasterizerOpenGL::SyncBlendState() {          blend.src_a_func = MaxwellToGL::BlendFunc(src.factor_source_a);          blend.dst_a_func = MaxwellToGL::BlendFunc(src.factor_dest_a);      } + +    state.MarkDirtyBlendState(); +    maxwell3d.dirty.blend_state = false;  }  void RasterizerOpenGL::SyncLogicOpState() { @@ -1168,13 +1274,21 @@ void RasterizerOpenGL::SyncPointState() {  }  void RasterizerOpenGL::SyncPolygonOffset() { -    const auto& regs = system.GPU().Maxwell3D().regs; +    auto& maxwell3d = system.GPU().Maxwell3D(); +    if (!maxwell3d.dirty.polygon_offset) { +        return; +    } +    const auto& regs = maxwell3d.regs; +      state.polygon_offset.fill_enable = regs.polygon_offset_fill_enable != 0;      state.polygon_offset.line_enable = regs.polygon_offset_line_enable != 0;      state.polygon_offset.point_enable = regs.polygon_offset_point_enable != 0;      state.polygon_offset.units = regs.polygon_offset_units;      state.polygon_offset.factor = regs.polygon_offset_factor;      state.polygon_offset.clamp = 
regs.polygon_offset_clamp; + +    state.MarkDirtyPolygonOffset(); +    maxwell3d.dirty.polygon_offset = false;  }  void RasterizerOpenGL::SyncAlphaTest() { diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index b2b671230..8b123c48d 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -109,6 +109,9 @@ private:          OpenGLState& current_state, bool using_color_fb = true, bool using_depth_fb = true,          bool preserve_contents = true, std::optional<std::size_t> single_color_target = {}); +    void ConfigureClearFramebuffer(OpenGLState& current_state, bool using_color_fb, +                                   bool using_depth_fb, bool using_stencil_fb); +      /// Configures the current constbuffers to use for the draw command.      void SetupDrawConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,                                 const Shader& shader); @@ -227,6 +230,7 @@ private:      GLuint SetupVertexFormat();      void SetupVertexBuffer(GLuint vao); +    void SetupVertexInstances(GLuint vao);      GLintptr SetupIndexBuffer(); @@ -237,6 +241,8 @@ private:      enum class AccelDraw { Disabled, Arrays, Indexed };      AccelDraw accelerate_draw = AccelDraw::Disabled; +    OGLFramebuffer clear_framebuffer; +      using CachedPageMap = boost::icl::interval_map<u64, int>;      CachedPageMap cached_pages;  }; diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 865c191bd..1c90facc3 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -628,7 +628,7 @@ std::unordered_map<u64, UnspecializedShader> ShaderCacheOpenGL::GenerateUnspecia  }  Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { -    if (!system.GPU().Maxwell3D().dirty_flags.shaders) { +    if 
(!system.GPU().Maxwell3D().dirty.shaders) {          return last_shaders[static_cast<std::size_t>(program)];      } diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index 50b616be4..ffe26b241 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -257,10 +257,6 @@ public:      }  private: -    using OperationDecompilerFn = std::string (GLSLDecompiler::*)(Operation); -    using OperationDecompilersArray = -        std::array<OperationDecompilerFn, static_cast<std::size_t>(OperationCode::Amount)>; -      void DeclareVertex() {          if (!IsVertexShader(stage))              return; @@ -1414,14 +1410,10 @@ private:          return fmt::format("{}[{}]", pair, VisitOperand(operation, 1, Type::Uint));      } -    std::string LogicalAll2(Operation operation) { +    std::string LogicalAnd2(Operation operation) {          return GenerateUnary(operation, "all", Type::Bool, Type::Bool2);      } -    std::string LogicalAny2(Operation operation) { -        return GenerateUnary(operation, "any", Type::Bool, Type::Bool2); -    } -      template <bool with_nan>      std::string GenerateHalfComparison(Operation operation, const std::string& compare_op) {          const std::string comparison{GenerateBinaryCall(operation, compare_op, Type::Bool2, @@ -1728,7 +1720,7 @@ private:          return "utof(gl_WorkGroupID"s + GetSwizzle(element) + ')';      } -    static constexpr OperationDecompilersArray operation_decompilers = { +    static constexpr std::array operation_decompilers = {          &GLSLDecompiler::Assign,          &GLSLDecompiler::Select, @@ -1812,8 +1804,7 @@ private:          &GLSLDecompiler::LogicalXor,          &GLSLDecompiler::LogicalNegate,          &GLSLDecompiler::LogicalPick2, -        &GLSLDecompiler::LogicalAll2, -        &GLSLDecompiler::LogicalAny2, +        &GLSLDecompiler::LogicalAnd2,          
&GLSLDecompiler::LogicalLessThan<Type::Float>,          &GLSLDecompiler::LogicalEqual<Type::Float>, @@ -1877,6 +1868,7 @@ private:          &GLSLDecompiler::WorkGroupId<1>,          &GLSLDecompiler::WorkGroupId<2>,      }; +    static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));      std::string GetRegister(u32 index) const {          return GetDeclarationWithSuffix(index, "gpr"); diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp index 0eae98afe..f4777d0b0 100644 --- a/src/video_core/renderer_opengl/gl_state.cpp +++ b/src/video_core/renderer_opengl/gl_state.cpp @@ -165,6 +165,25 @@ OpenGLState::OpenGLState() {      alpha_test.ref = 0.0f;  } +void OpenGLState::SetDefaultViewports() { +    for (auto& item : viewports) { +        item.x = 0; +        item.y = 0; +        item.width = 0; +        item.height = 0; +        item.depth_range_near = 0.0f; +        item.depth_range_far = 1.0f; +        item.scissor.enabled = false; +        item.scissor.x = 0; +        item.scissor.y = 0; +        item.scissor.width = 0; +        item.scissor.height = 0; +    } + +    depth_clamp.far_plane = false; +    depth_clamp.near_plane = false; +} +  void OpenGLState::ApplyDefaultState() {      glEnable(GL_BLEND);      glDisable(GL_FRAMEBUFFER_SRGB); @@ -526,7 +545,7 @@ void OpenGLState::ApplySamplers() const {      }  } -void OpenGLState::Apply() const { +void OpenGLState::Apply() {      MICROPROFILE_SCOPE(OpenGL_State);      ApplyFramebufferState();      ApplyVertexArrayState(); @@ -536,19 +555,31 @@ void OpenGLState::Apply() const {      ApplyPointSize();      ApplyFragmentColorClamp();      ApplyMultisample(); +    if (dirty.color_mask) { +        ApplyColorMask(); +        dirty.color_mask = false; +    }      ApplyDepthClamp(); -    ApplyColorMask();      ApplyViewport(); -    ApplyStencilTest(); +    if (dirty.stencil_state) { +        ApplyStencilTest(); +        
dirty.stencil_state = false; +    }      ApplySRgb();      ApplyCulling();      ApplyDepth();      ApplyPrimitiveRestart(); -    ApplyBlending(); +    if (dirty.blend_state) { +        ApplyBlending(); +        dirty.blend_state = false; +    }      ApplyLogicOp();      ApplyTextures();      ApplySamplers(); -    ApplyPolygonOffset(); +    if (dirty.polygon_offset) { +        ApplyPolygonOffset(); +        dirty.polygon_offset = false; +    }      ApplyAlphaTest();  } diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h index b0140495d..fdf9a8a12 100644 --- a/src/video_core/renderer_opengl/gl_state.h +++ b/src/video_core/renderer_opengl/gl_state.h @@ -195,8 +195,9 @@ public:          s_rgb_used = false;      } +    void SetDefaultViewports();      /// Apply this state as the current OpenGL state -    void Apply() const; +    void Apply();      void ApplyFramebufferState() const;      void ApplyVertexArrayState() const; @@ -237,11 +238,41 @@ public:      /// Viewport does not affects glClearBuffer so emulate viewport using scissor test      void EmulateViewportWithScissor(); +    void MarkDirtyBlendState() { +        dirty.blend_state = true; +    } + +    void MarkDirtyStencilState() { +        dirty.stencil_state = true; +    } + +    void MarkDirtyPolygonOffset() { +        dirty.polygon_offset = true; +    } + +    void MarkDirtyColorMask() { +        dirty.color_mask = true; +    } + +    void AllDirty() { +        dirty.blend_state = true; +        dirty.stencil_state = true; +        dirty.polygon_offset = true; +        dirty.color_mask = true; +    } +  private:      static OpenGLState cur_state;      // Workaround for sRGB problems caused by QT not supporting srgb output      static bool s_rgb_used; +    struct { +        bool blend_state; +        bool stencil_state; +        bool viewport_state; +        bool polygon_offset; +        bool color_mask; +    } dirty{};  };  } // namespace OpenGL diff --git 
a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index b1f6bc7c2..8fcd39a69 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -485,11 +485,15 @@ void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view,      const auto& dst_params{dst_view->GetSurfaceParams()};      OpenGLState prev_state{OpenGLState::GetCurState()}; -    SCOPE_EXIT({ prev_state.Apply(); }); +    SCOPE_EXIT({ +        prev_state.AllDirty(); +        prev_state.Apply(); +    });      OpenGLState state;      state.draw.read_framebuffer = src_framebuffer.handle;      state.draw.draw_framebuffer = dst_framebuffer.handle; +    state.AllDirty();      state.Apply();      u32 buffers{}; diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index 9ecdddb0d..a05cef3b9 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -108,6 +108,7 @@ void RendererOpenGL::SwapBuffers(      // Maintain the rasterizer's state as a priority      OpenGLState prev_state = OpenGLState::GetCurState(); +    state.AllDirty();      state.Apply();      if (framebuffer) { @@ -140,6 +141,7 @@ void RendererOpenGL::SwapBuffers(      system.GetPerfStats().BeginSystemFrame();      // Restore the rasterizer state +    prev_state.AllDirty();      prev_state.Apply();  } @@ -206,6 +208,7 @@ void RendererOpenGL::InitOpenGLObjects() {      // Link shaders and get variable locations      shader.CreateFromSource(vertex_shader, nullptr, fragment_shader);      state.draw.shader_program = shader.handle; +    state.AllDirty();      state.Apply();      uniform_modelview_matrix = glGetUniformLocation(shader.handle, "modelview_matrix");      uniform_color_texture = glGetUniformLocation(shader.handle, "color_texture"); @@ -338,12 +341,14 @@ void 
RendererOpenGL::DrawScreenTriangles(const ScreenInfo& screen_info, float x,      // Workaround brigthness problems in SMO by enabling sRGB in the final output      // if it has been used in the frame. Needed because of this bug in QT: QTBUG-50987      state.framebuffer_srgb.enabled = OpenGLState::GetsRGBUsed(); +    state.AllDirty();      state.Apply();      glNamedBufferSubData(vertex_buffer.handle, 0, sizeof(vertices), vertices.data());      glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);      // Restore default state      state.framebuffer_srgb.enabled = false;      state.texture_units[0].texture = 0; +    state.AllDirty();      state.Apply();      // Clear sRGB state for the next frame      OpenGLState::ClearsRGBUsed(); @@ -388,6 +393,7 @@ void RendererOpenGL::CaptureScreenshot() {      GLuint old_read_fb = state.draw.read_framebuffer;      GLuint old_draw_fb = state.draw.draw_framebuffer;      state.draw.read_framebuffer = state.draw.draw_framebuffer = screenshot_framebuffer.handle; +    state.AllDirty();      state.Apply();      Layout::FramebufferLayout layout{renderer_settings.screenshot_framebuffer_layout}; @@ -407,6 +413,7 @@ void RendererOpenGL::CaptureScreenshot() {      screenshot_framebuffer.Release();      state.draw.read_framebuffer = old_read_fb;      state.draw.draw_framebuffer = old_draw_fb; +    state.AllDirty();      state.Apply();      glDeleteRenderbuffers(1, &renderbuffer); diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index 9b2d8e987..d267712c9 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ -205,10 +205,6 @@ public:      }  private: -    using OperationDecompilerFn = Id (SPIRVDecompiler::*)(Operation); -    using OperationDecompilersArray = -        std::array<OperationDecompilerFn, static_cast<std::size_t>(OperationCode::Amount)>; -      static constexpr auto INTERNAL_FLAGS_COUNT = 
static_cast<std::size_t>(InternalFlag::Amount);      void AllocateBindings() { @@ -804,12 +800,7 @@ private:          return {};      } -    Id LogicalAll2(Operation operation) { -        UNIMPLEMENTED(); -        return {}; -    } - -    Id LogicalAny2(Operation operation) { +    Id LogicalAnd2(Operation operation) {          UNIMPLEMENTED();          return {};      } @@ -1206,7 +1197,7 @@ private:          return {};      } -    static constexpr OperationDecompilersArray operation_decompilers = { +    static constexpr std::array operation_decompilers = {          &SPIRVDecompiler::Assign,          &SPIRVDecompiler::Ternary<&Module::OpSelect, Type::Float, Type::Bool, Type::Float, @@ -1291,8 +1282,7 @@ private:          &SPIRVDecompiler::Binary<&Module::OpLogicalNotEqual, Type::Bool>,          &SPIRVDecompiler::Unary<&Module::OpLogicalNot, Type::Bool>,          &SPIRVDecompiler::LogicalPick2, -        &SPIRVDecompiler::LogicalAll2, -        &SPIRVDecompiler::LogicalAny2, +        &SPIRVDecompiler::LogicalAnd2,          &SPIRVDecompiler::Binary<&Module::OpFOrdLessThan, Type::Bool, Type::Float>,          &SPIRVDecompiler::Binary<&Module::OpFOrdEqual, Type::Bool, Type::Float>, @@ -1357,6 +1347,7 @@ private:          &SPIRVDecompiler::WorkGroupId<1>,          &SPIRVDecompiler::WorkGroupId<2>,      }; +    static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));      const VKDevice& device;      const ShaderIR& ir; diff --git a/src/video_core/shader/decode.cpp b/src/video_core/shader/decode.cpp index 29c8895c5..afffd157f 100644 --- a/src/video_core/shader/decode.cpp +++ b/src/video_core/shader/decode.cpp @@ -46,12 +46,12 @@ void ShaderIR::Decode() {          coverage_end = shader_info.end;          if (shader_info.decompilable) {              disable_flow_stack = true; -            const auto insert_block = ([this](NodeBlock& nodes, u32 label) { +            const auto insert_block = [this](NodeBlock& nodes, u32 label) {           
       if (label == exit_branch) {                      return;                  }                  basic_blocks.insert({label, nodes}); -            }); +            };              const auto& blocks = shader_info.blocks;              NodeBlock current_block;              u32 current_label = exit_branch; @@ -103,7 +103,7 @@ void ShaderIR::DecodeRangeInner(NodeBlock& bb, u32 begin, u32 end) {  }  void ShaderIR::InsertControlFlow(NodeBlock& bb, const ShaderBlock& block) { -    const auto apply_conditions = ([&](const Condition& cond, Node n) -> Node { +    const auto apply_conditions = [&](const Condition& cond, Node n) -> Node {          Node result = n;          if (cond.cc != ConditionCode::T) {              result = Conditional(GetConditionCode(cond.cc), {result}); @@ -117,7 +117,7 @@ void ShaderIR::InsertControlFlow(NodeBlock& bb, const ShaderBlock& block) {              result = Conditional(GetPredicate(pred, is_neg), {result});          }          return result; -    }); +    };      if (block.branch.address < 0) {          if (block.branch.kills) {              Node n = Operation(OperationCode::Discard); diff --git a/src/video_core/shader/decode/half_set_predicate.cpp b/src/video_core/shader/decode/half_set_predicate.cpp index d59d15bd8..ad180d6df 100644 --- a/src/video_core/shader/decode/half_set_predicate.cpp +++ b/src/video_core/shader/decode/half_set_predicate.cpp @@ -23,38 +23,51 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) {      Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hsetp2.type_a);      op_a = GetOperandAbsNegHalf(op_a, instr.hsetp2.abs_a, instr.hsetp2.negate_a); -    Node op_b = [&]() { -        switch (opcode->get().GetId()) { -        case OpCode::Id::HSETP2_R: -            return GetOperandAbsNegHalf(GetRegister(instr.gpr20), instr.hsetp2.abs_a, -                                        instr.hsetp2.negate_b); -        default: -            UNREACHABLE(); -            return Immediate(0); -        } -    }(); 
-    op_b = UnpackHalfFloat(op_b, instr.hsetp2.type_b); - -    // We can't use the constant predicate as destination. -    ASSERT(instr.hsetp2.pred3 != static_cast<u64>(Pred::UnusedIndex)); - -    const Node second_pred = GetPredicate(instr.hsetp2.pred39, instr.hsetp2.neg_pred != 0); +    Tegra::Shader::PredCondition cond{}; +    bool h_and{}; +    Node op_b{}; +    switch (opcode->get().GetId()) { +    case OpCode::Id::HSETP2_C: +        cond = instr.hsetp2.cbuf_and_imm.cond; +        h_and = instr.hsetp2.cbuf_and_imm.h_and; +        op_b = GetOperandAbsNegHalf(GetConstBuffer(instr.cbuf34.index, instr.cbuf34.offset), +                                    instr.hsetp2.cbuf.abs_b, instr.hsetp2.cbuf.negate_b); +        break; +    case OpCode::Id::HSETP2_IMM: +        cond = instr.hsetp2.cbuf_and_imm.cond; +        h_and = instr.hsetp2.cbuf_and_imm.h_and; +        op_b = UnpackHalfImmediate(instr, true); +        break; +    case OpCode::Id::HSETP2_R: +        cond = instr.hsetp2.reg.cond; +        h_and = instr.hsetp2.reg.h_and; +        op_b = +            UnpackHalfFloat(GetOperandAbsNegHalf(GetRegister(instr.gpr20), instr.hsetp2.reg.abs_b, +                                                 instr.hsetp2.reg.negate_b), +                            instr.hsetp2.reg.type_b); +        break; +    default: +        UNREACHABLE(); +        op_b = Immediate(0); +    }      const OperationCode combiner = GetPredicateCombiner(instr.hsetp2.op); -    const OperationCode pair_combiner = -        instr.hsetp2.h_and ? 
OperationCode::LogicalAll2 : OperationCode::LogicalAny2; - -    const Node comparison = GetPredicateComparisonHalf(instr.hsetp2.cond, op_a, op_b); -    const Node first_pred = Operation(pair_combiner, comparison); +    const Node pred39 = GetPredicate(instr.hsetp2.pred39, instr.hsetp2.neg_pred); -    // Set the primary predicate to the result of Predicate OP SecondPredicate -    const Node value = Operation(combiner, first_pred, second_pred); -    SetPredicate(bb, instr.hsetp2.pred3, value); +    const auto Write = [&](u64 dest, Node src) { +        SetPredicate(bb, dest, Operation(combiner, std::move(src), pred39)); +    }; -    if (instr.hsetp2.pred0 != static_cast<u64>(Pred::UnusedIndex)) { -        // Set the secondary predicate to the result of !Predicate OP SecondPredicate, if enabled -        const Node negated_pred = Operation(OperationCode::LogicalNegate, first_pred); -        SetPredicate(bb, instr.hsetp2.pred0, Operation(combiner, negated_pred, second_pred)); +    const Node comparison = GetPredicateComparisonHalf(cond, op_a, op_b); +    const u64 first = instr.hsetp2.pred0; +    const u64 second = instr.hsetp2.pred3; +    if (h_and) { +        const Node joined = Operation(OperationCode::LogicalAnd2, comparison); +        Write(first, joined); +        Write(second, Operation(OperationCode::LogicalNegate, joined)); +    } else { +        Write(first, Operation(OperationCode::LogicalPick2, comparison, Immediate(0u))); +        Write(second, Operation(OperationCode::LogicalPick2, comparison, Immediate(1u)));      }      return pc; diff --git a/src/video_core/shader/decode/image.cpp b/src/video_core/shader/decode/image.cpp index 24f022cc0..77151a24b 100644 --- a/src/video_core/shader/decode/image.cpp +++ b/src/video_core/shader/decode/image.cpp @@ -95,12 +95,8 @@ const Image& ShaderIR::GetImage(Tegra::Shader::Image image, Tegra::Shader::Image  const Image& ShaderIR::GetBindlessImage(Tegra::Shader::Register reg,                                          
Tegra::Shader::ImageType type) {      const Node image_register{GetRegister(reg)}; -    const Node base_image{ +    const auto [base_image, cbuf_index, cbuf_offset]{          TrackCbuf(image_register, global_code, static_cast<s64>(global_code.size()))}; -    const auto cbuf{std::get_if<CbufNode>(&*base_image)}; -    const auto cbuf_offset_imm{std::get_if<ImmediateNode>(&*cbuf->GetOffset())}; -    const auto cbuf_offset{cbuf_offset_imm->GetValue()}; -    const auto cbuf_index{cbuf->GetIndex()};      const auto cbuf_key{(static_cast<u64>(cbuf_index) << 32) | static_cast<u64>(cbuf_offset)};      // If this image has already been used, return the existing mapping. diff --git a/src/video_core/shader/decode/memory.cpp b/src/video_core/shader/decode/memory.cpp index 80fc0ccfc..ed108bea8 100644 --- a/src/video_core/shader/decode/memory.cpp +++ b/src/video_core/shader/decode/memory.cpp @@ -95,10 +95,10 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {              const Node op_b =                  GetConstBufferIndirect(instr.cbuf36.index, instr.cbuf36.GetOffset() + 4, index); -            SetTemporal(bb, 0, op_a); -            SetTemporal(bb, 1, op_b); -            SetRegister(bb, instr.gpr0, GetTemporal(0)); -            SetRegister(bb, instr.gpr0.Value() + 1, GetTemporal(1)); +            SetTemporary(bb, 0, op_a); +            SetTemporary(bb, 1, op_b); +            SetRegister(bb, instr.gpr0, GetTemporary(0)); +            SetRegister(bb, instr.gpr0.Value() + 1, GetTemporary(1));              break;          }          default: @@ -136,9 +136,9 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {                  }              }();              for (u32 i = 0; i < count; ++i) -                SetTemporal(bb, i, GetLmem(i * 4)); +                SetTemporary(bb, i, GetLmem(i * 4));              for (u32 i = 0; i < count; ++i) -                SetRegister(bb, instr.gpr0.Value() + i, GetTemporal(i)); +                SetRegister(bb, instr.gpr0.Value() + i, 
GetTemporary(i));              break;          }          default: @@ -172,10 +172,10 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {                  Operation(OperationCode::UAdd, NO_PRECISE, real_address_base, it_offset);              const Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor); -            SetTemporal(bb, i, gmem); +            SetTemporary(bb, i, gmem);          }          for (u32 i = 0; i < count; ++i) { -            SetRegister(bb, instr.gpr0.Value() + i, GetTemporal(i)); +            SetRegister(bb, instr.gpr0.Value() + i, GetTemporary(i));          }          break;      } @@ -253,11 +253,11 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {              TrackAndGetGlobalMemory(bb, instr, true);          // Encode in temporary registers like this: real_base_address, {registers_to_be_written...} -        SetTemporal(bb, 0, real_address_base); +        SetTemporary(bb, 0, real_address_base);          const u32 count = GetUniformTypeElementsCount(type);          for (u32 i = 0; i < count; ++i) { -            SetTemporal(bb, i + 1, GetRegister(instr.gpr0.Value() + i)); +            SetTemporary(bb, i + 1, GetRegister(instr.gpr0.Value() + i));          }          for (u32 i = 0; i < count; ++i) {              const Node it_offset = Immediate(i * 4); @@ -265,7 +265,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {                  Operation(OperationCode::UAdd, NO_PRECISE, real_address_base, it_offset);              const Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor); -            bb.push_back(Operation(OperationCode::Assign, gmem, GetTemporal(i + 1))); +            bb.push_back(Operation(OperationCode::Assign, gmem, GetTemporary(i + 1)));          }          break;      } @@ -297,18 +297,13 @@ std::tuple<Node, Node, GlobalMemoryBase> ShaderIR::TrackAndGetGlobalMemory(NodeB      const auto addr_register{GetRegister(instr.gmem.gpr)};      const auto 
immediate_offset{static_cast<u32>(instr.gmem.offset)}; -    const Node base_address{ -        TrackCbuf(addr_register, global_code, static_cast<s64>(global_code.size()))}; -    const auto cbuf = std::get_if<CbufNode>(&*base_address); -    ASSERT(cbuf != nullptr); -    const auto cbuf_offset_imm = std::get_if<ImmediateNode>(&*cbuf->GetOffset()); -    ASSERT(cbuf_offset_imm != nullptr); -    const auto cbuf_offset = cbuf_offset_imm->GetValue(); +    const auto [base_address, index, offset] = +        TrackCbuf(addr_register, global_code, static_cast<s64>(global_code.size())); +    ASSERT(base_address != nullptr); -    bb.push_back( -        Comment(fmt::format("Base address is c[0x{:x}][0x{:x}]", cbuf->GetIndex(), cbuf_offset))); +    bb.push_back(Comment(fmt::format("Base address is c[0x{:x}][0x{:x}]", index, offset))); -    const GlobalMemoryBase descriptor{cbuf->GetIndex(), cbuf_offset}; +    const GlobalMemoryBase descriptor{index, offset};      const auto& [entry, is_new] = used_global_memory.try_emplace(descriptor);      auto& usage = entry->second;      if (is_write) { diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp index 42e3de02f..c0f64d7a0 100644 --- a/src/video_core/shader/decode/other.cpp +++ b/src/video_core/shader/decode/other.cpp @@ -102,7 +102,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {                                                   PRECISE, op_a, Immediate(3));              const Node operand =                  Operation(OperationCode::IAdd, PRECISE, convert, Immediate(target)); -            branch = Operation(OperationCode::BranchIndirect, convert); +            branch = Operation(OperationCode::BranchIndirect, operand);          }          const Tegra::Shader::ConditionCode cc = instr.flow_condition_code; diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp index 323be3f14..0b934a069 100644 --- a/src/video_core/shader/decode/texture.cpp +++ 
b/src/video_core/shader/decode/texture.cpp @@ -181,10 +181,10 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {                  const Node value =                      Operation(OperationCode::TextureQueryDimensions, meta,                                GetRegister(instr.gpr8.Value() + (is_bindless ? 1 : 0))); -                SetTemporal(bb, indexer++, value); +                SetTemporary(bb, indexer++, value);              }              for (u32 i = 0; i < indexer; ++i) { -                SetRegister(bb, instr.gpr0.Value() + i, GetTemporal(i)); +                SetRegister(bb, instr.gpr0.Value() + i, GetTemporary(i));              }              break;          } @@ -238,10 +238,10 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {              auto params = coords;              MetaTexture meta{sampler, {}, {}, {}, {}, {}, {}, element};              const Node value = Operation(OperationCode::TextureQueryLod, meta, std::move(params)); -            SetTemporal(bb, indexer++, value); +            SetTemporary(bb, indexer++, value);          }          for (u32 i = 0; i < indexer; ++i) { -            SetRegister(bb, instr.gpr0.Value() + i, GetTemporal(i)); +            SetRegister(bb, instr.gpr0.Value() + i, GetTemporary(i));          }          break;      } @@ -308,13 +308,9 @@ const Sampler& ShaderIR::GetSampler(const Tegra::Shader::Sampler& sampler, Textu  const Sampler& ShaderIR::GetBindlessSampler(const Tegra::Shader::Register& reg, TextureType type,                                              bool is_array, bool is_shadow) {      const Node sampler_register = GetRegister(reg); -    const Node base_sampler = +    const auto [base_sampler, cbuf_index, cbuf_offset] =          TrackCbuf(sampler_register, global_code, static_cast<s64>(global_code.size())); -    const auto cbuf = std::get_if<CbufNode>(&*base_sampler); -    const auto cbuf_offset_imm = std::get_if<ImmediateNode>(&*cbuf->GetOffset()); -    ASSERT(cbuf_offset_imm != nullptr); -    
const auto cbuf_offset = cbuf_offset_imm->GetValue(); -    const auto cbuf_index = cbuf->GetIndex(); +    ASSERT(base_sampler != nullptr);      const auto cbuf_key = (static_cast<u64>(cbuf_index) << 32) | static_cast<u64>(cbuf_offset);      // If this sampler has already been used, return the existing mapping. @@ -340,11 +336,11 @@ void ShaderIR::WriteTexInstructionFloat(NodeBlock& bb, Instruction instr, const              // Skip disabled components              continue;          } -        SetTemporal(bb, dest_elem++, components[elem]); +        SetTemporary(bb, dest_elem++, components[elem]);      }      // After writing values in temporals, move them to the real registers      for (u32 i = 0; i < dest_elem; ++i) { -        SetRegister(bb, instr.gpr0.Value() + i, GetTemporal(i)); +        SetRegister(bb, instr.gpr0.Value() + i, GetTemporary(i));      }  } @@ -357,17 +353,17 @@ void ShaderIR::WriteTexsInstructionFloat(NodeBlock& bb, Instruction instr,      for (u32 component = 0; component < 4; ++component) {          if (!instr.texs.IsComponentEnabled(component))              continue; -        SetTemporal(bb, dest_elem++, components[component]); +        SetTemporary(bb, dest_elem++, components[component]);      }      for (u32 i = 0; i < dest_elem; ++i) {          if (i < 2) {              // Write the first two swizzle components to gpr0 and gpr0+1 -            SetRegister(bb, instr.gpr0.Value() + i % 2, GetTemporal(i)); +            SetRegister(bb, instr.gpr0.Value() + i % 2, GetTemporary(i));          } else {              ASSERT(instr.texs.HasTwoDestinations());              // Write the rest of the swizzle components to gpr28 and gpr28+1 -            SetRegister(bb, instr.gpr28.Value() + i % 2, GetTemporal(i)); +            SetRegister(bb, instr.gpr28.Value() + i % 2, GetTemporary(i));          }      }  } @@ -395,11 +391,11 @@ void ShaderIR::WriteTexsInstructionHalfFloat(NodeBlock& bb, Instruction instr,          return;      } -    SetTemporal(bb, 0, 
first_value); -    SetTemporal(bb, 1, Operation(OperationCode::HPack2, values[2], values[3])); +    SetTemporary(bb, 0, first_value); +    SetTemporary(bb, 1, Operation(OperationCode::HPack2, values[2], values[3])); -    SetRegister(bb, instr.gpr0, GetTemporal(0)); -    SetRegister(bb, instr.gpr28, GetTemporal(1)); +    SetRegister(bb, instr.gpr0, GetTemporary(0)); +    SetRegister(bb, instr.gpr28, GetTemporary(1));  }  Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type, diff --git a/src/video_core/shader/decode/xmad.cpp b/src/video_core/shader/decode/xmad.cpp index 93dee77d1..206961909 100644 --- a/src/video_core/shader/decode/xmad.cpp +++ b/src/video_core/shader/decode/xmad.cpp @@ -73,8 +73,8 @@ u32 ShaderIR::DecodeXmad(NodeBlock& bb, u32 pc) {      if (is_psl) {          product = Operation(OperationCode::ILogicalShiftLeft, NO_PRECISE, product, Immediate(16));      } -    SetTemporal(bb, 0, product); -    product = GetTemporal(0); +    SetTemporary(bb, 0, product); +    product = GetTemporary(0);      const Node original_c = op_c;      const Tegra::Shader::XmadMode set_mode = mode; // Workaround to clang compile error @@ -98,13 +98,13 @@ u32 ShaderIR::DecodeXmad(NodeBlock& bb, u32 pc) {          }      }(); -    SetTemporal(bb, 1, op_c); -    op_c = GetTemporal(1); +    SetTemporary(bb, 1, op_c); +    op_c = GetTemporary(1);      // TODO(Rodrigo): Use an appropiate sign for this operation      Node sum = Operation(OperationCode::IAdd, product, op_c); -    SetTemporal(bb, 2, sum); -    sum = GetTemporal(2); +    SetTemporary(bb, 2, sum); +    sum = GetTemporary(2);      if (is_merge) {          const Node a = BitfieldExtract(sum, 0, 16);          const Node b = diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h index 7427ed896..715184d67 100644 --- a/src/video_core/shader/node.h +++ b/src/video_core/shader/node.h @@ -101,8 +101,7 @@ enum class OperationCode {      LogicalXor,    /// (bool a, bool b) -> bool      
LogicalNegate, /// (bool a) -> bool      LogicalPick2,  /// (bool2 pair, uint index) -> bool -    LogicalAll2,   /// (bool2 a) -> bool -    LogicalAny2,   /// (bool2 a) -> bool +    LogicalAnd2,   /// (bool2 a) -> bool      LogicalFLessThan,     /// (float a, float b) -> bool      LogicalFEqual,        /// (float a, float b) -> bool diff --git a/src/video_core/shader/node_helper.cpp b/src/video_core/shader/node_helper.cpp index 6fccbbba3..b3dcd291c 100644 --- a/src/video_core/shader/node_helper.cpp +++ b/src/video_core/shader/node_helper.cpp @@ -12,7 +12,7 @@  namespace VideoCommon::Shader {  Node Conditional(Node condition, std::vector<Node> code) { -    return MakeNode<ConditionalNode>(condition, std::move(code)); +    return MakeNode<ConditionalNode>(std::move(condition), std::move(code));  }  Node Comment(std::string text) { diff --git a/src/video_core/shader/shader_ir.cpp b/src/video_core/shader/shader_ir.cpp index caa409788..5e91fe129 100644 --- a/src/video_core/shader/shader_ir.cpp +++ b/src/video_core/shader/shader_ir.cpp @@ -61,8 +61,17 @@ Node ShaderIR::GetConstBufferIndirect(u64 index_, u64 offset_, Node node) {      const auto [entry, is_new] = used_cbufs.try_emplace(index);      entry->second.MarkAsUsedIndirect(); -    const Node final_offset = Operation(OperationCode::UAdd, NO_PRECISE, node, Immediate(offset)); -    return MakeNode<CbufNode>(index, final_offset); +    Node final_offset = [&] { +        // Attempt to inline constant buffer without a variable offset. This is done to allow +        // tracking LDC calls. 
+        if (const auto gpr = std::get_if<GprNode>(&*node)) { +            if (gpr->GetIndex() == Register::ZeroIndex) { +                return Immediate(offset); +            } +        } +        return Operation(OperationCode::UAdd, NO_PRECISE, std::move(node), Immediate(offset)); +    }(); +    return MakeNode<CbufNode>(index, std::move(final_offset));  }  Node ShaderIR::GetPredicate(u64 pred_, bool negated) { @@ -80,7 +89,7 @@ Node ShaderIR::GetPredicate(bool immediate) {  Node ShaderIR::GetInputAttribute(Attribute::Index index, u64 element, Node buffer) {      used_input_attributes.emplace(index); -    return MakeNode<AbufNode>(index, static_cast<u32>(element), buffer); +    return MakeNode<AbufNode>(index, static_cast<u32>(element), std::move(buffer));  }  Node ShaderIR::GetPhysicalInputAttribute(Tegra::Shader::Register physical_address, Node buffer) { @@ -113,7 +122,7 @@ Node ShaderIR::GetOutputAttribute(Attribute::Index index, u64 element, Node buff      }      used_output_attributes.insert(index); -    return MakeNode<AbufNode>(index, static_cast<u32>(element), buffer); +    return MakeNode<AbufNode>(index, static_cast<u32>(element), std::move(buffer));  }  Node ShaderIR::GetInternalFlag(InternalFlag flag, bool negated) { @@ -125,19 +134,19 @@ Node ShaderIR::GetInternalFlag(InternalFlag flag, bool negated) {  }  Node ShaderIR::GetLocalMemory(Node address) { -    return MakeNode<LmemNode>(address); +    return MakeNode<LmemNode>(std::move(address));  } -Node ShaderIR::GetTemporal(u32 id) { +Node ShaderIR::GetTemporary(u32 id) {      return GetRegister(Register::ZeroIndex + 1 + id);  }  Node ShaderIR::GetOperandAbsNegFloat(Node value, bool absolute, bool negate) {      if (absolute) { -        value = Operation(OperationCode::FAbsolute, NO_PRECISE, value); +        value = Operation(OperationCode::FAbsolute, NO_PRECISE, std::move(value));      }      if (negate) { -        value = Operation(OperationCode::FNegate, NO_PRECISE, value); +        value = 
Operation(OperationCode::FNegate, NO_PRECISE, std::move(value));      }      return value;  } @@ -146,24 +155,26 @@ Node ShaderIR::GetSaturatedFloat(Node value, bool saturate) {      if (!saturate) {          return value;      } -    const Node positive_zero = Immediate(std::copysignf(0, 1)); -    const Node positive_one = Immediate(1.0f); -    return Operation(OperationCode::FClamp, NO_PRECISE, value, positive_zero, positive_one); + +    Node positive_zero = Immediate(std::copysignf(0, 1)); +    Node positive_one = Immediate(1.0f); +    return Operation(OperationCode::FClamp, NO_PRECISE, std::move(value), std::move(positive_zero), +                     std::move(positive_one));  } -Node ShaderIR::ConvertIntegerSize(Node value, Tegra::Shader::Register::Size size, bool is_signed) { +Node ShaderIR::ConvertIntegerSize(Node value, Register::Size size, bool is_signed) {      switch (size) {      case Register::Size::Byte: -        value = SignedOperation(OperationCode::ILogicalShiftLeft, is_signed, NO_PRECISE, value, -                                Immediate(24)); -        value = SignedOperation(OperationCode::IArithmeticShiftRight, is_signed, NO_PRECISE, value, -                                Immediate(24)); +        value = SignedOperation(OperationCode::ILogicalShiftLeft, is_signed, NO_PRECISE, +                                std::move(value), Immediate(24)); +        value = SignedOperation(OperationCode::IArithmeticShiftRight, is_signed, NO_PRECISE, +                                std::move(value), Immediate(24));          return value;      case Register::Size::Short: -        value = SignedOperation(OperationCode::ILogicalShiftLeft, is_signed, NO_PRECISE, value, -                                Immediate(16)); -        value = SignedOperation(OperationCode::IArithmeticShiftRight, is_signed, NO_PRECISE, value, -                                Immediate(16)); +        value = SignedOperation(OperationCode::ILogicalShiftLeft, is_signed, NO_PRECISE, +           
                     std::move(value), Immediate(16)); +        value = SignedOperation(OperationCode::IArithmeticShiftRight, is_signed, NO_PRECISE, +                                std::move(value), Immediate(16));      case Register::Size::Word:          // Default - do nothing          return value; @@ -179,27 +190,29 @@ Node ShaderIR::GetOperandAbsNegInteger(Node value, bool absolute, bool negate, b          return value;      }      if (absolute) { -        value = Operation(OperationCode::IAbsolute, NO_PRECISE, value); +        value = Operation(OperationCode::IAbsolute, NO_PRECISE, std::move(value));      }      if (negate) { -        value = Operation(OperationCode::INegate, NO_PRECISE, value); +        value = Operation(OperationCode::INegate, NO_PRECISE, std::move(value));      }      return value;  }  Node ShaderIR::UnpackHalfImmediate(Instruction instr, bool has_negation) { -    const Node value = Immediate(instr.half_imm.PackImmediates()); +    Node value = Immediate(instr.half_imm.PackImmediates());      if (!has_negation) {          return value;      } -    const Node first_negate = GetPredicate(instr.half_imm.first_negate != 0); -    const Node second_negate = GetPredicate(instr.half_imm.second_negate != 0); -    return Operation(OperationCode::HNegate, NO_PRECISE, value, first_negate, second_negate); +    Node first_negate = GetPredicate(instr.half_imm.first_negate != 0); +    Node second_negate = GetPredicate(instr.half_imm.second_negate != 0); + +    return Operation(OperationCode::HNegate, NO_PRECISE, std::move(value), std::move(first_negate), +                     std::move(second_negate));  }  Node ShaderIR::UnpackHalfFloat(Node value, Tegra::Shader::HalfType type) { -    return Operation(OperationCode::HUnpack, type, value); +    return Operation(OperationCode::HUnpack, type, std::move(value));  }  Node ShaderIR::HalfMerge(Node dest, Node src, Tegra::Shader::HalfMerge merge) { @@ -207,11 +220,11 @@ Node ShaderIR::HalfMerge(Node dest, Node 
src, Tegra::Shader::HalfMerge merge) {      case Tegra::Shader::HalfMerge::H0_H1:          return src;      case Tegra::Shader::HalfMerge::F32: -        return Operation(OperationCode::HMergeF32, src); +        return Operation(OperationCode::HMergeF32, std::move(src));      case Tegra::Shader::HalfMerge::Mrg_H0: -        return Operation(OperationCode::HMergeH0, dest, src); +        return Operation(OperationCode::HMergeH0, std::move(dest), std::move(src));      case Tegra::Shader::HalfMerge::Mrg_H1: -        return Operation(OperationCode::HMergeH1, dest, src); +        return Operation(OperationCode::HMergeH1, std::move(dest), std::move(src));      }      UNREACHABLE();      return src; @@ -219,10 +232,10 @@ Node ShaderIR::HalfMerge(Node dest, Node src, Tegra::Shader::HalfMerge merge) {  Node ShaderIR::GetOperandAbsNegHalf(Node value, bool absolute, bool negate) {      if (absolute) { -        value = Operation(OperationCode::HAbsolute, NO_PRECISE, value); +        value = Operation(OperationCode::HAbsolute, NO_PRECISE, std::move(value));      }      if (negate) { -        value = Operation(OperationCode::HNegate, NO_PRECISE, value, GetPredicate(true), +        value = Operation(OperationCode::HNegate, NO_PRECISE, std::move(value), GetPredicate(true),                            GetPredicate(true));      }      return value; @@ -232,9 +245,11 @@ Node ShaderIR::GetSaturatedHalfFloat(Node value, bool saturate) {      if (!saturate) {          return value;      } -    const Node positive_zero = Immediate(std::copysignf(0, 1)); -    const Node positive_one = Immediate(1.0f); -    return Operation(OperationCode::HClamp, NO_PRECISE, value, positive_zero, positive_one); + +    Node positive_zero = Immediate(std::copysignf(0, 1)); +    Node positive_one = Immediate(1.0f); +    return Operation(OperationCode::HClamp, NO_PRECISE, std::move(value), std::move(positive_zero), +                     std::move(positive_one));  }  Node 
ShaderIR::GetPredicateComparisonFloat(PredCondition condition, Node op_a, Node op_b) { @@ -262,7 +277,6 @@ Node ShaderIR::GetPredicateComparisonFloat(PredCondition condition, Node op_a, N          condition == PredCondition::LessEqualWithNan ||          condition == PredCondition::GreaterThanWithNan ||          condition == PredCondition::GreaterEqualWithNan) { -          predicate = Operation(OperationCode::LogicalOr, predicate,                                Operation(OperationCode::LogicalFIsNan, op_a));          predicate = Operation(OperationCode::LogicalOr, predicate, @@ -291,7 +305,8 @@ Node ShaderIR::GetPredicateComparisonInteger(PredCondition condition, bool is_si      UNIMPLEMENTED_IF_MSG(comparison == PredicateComparisonTable.end(),                           "Unknown predicate comparison operation"); -    Node predicate = SignedOperation(comparison->second, is_signed, NO_PRECISE, op_a, op_b); +    Node predicate = SignedOperation(comparison->second, is_signed, NO_PRECISE, std::move(op_a), +                                     std::move(op_b));      UNIMPLEMENTED_IF_MSG(condition == PredCondition::LessThanWithNan ||                               condition == PredCondition::NotEqualWithNan || @@ -321,9 +336,7 @@ Node ShaderIR::GetPredicateComparisonHalf(Tegra::Shader::PredCondition condition      UNIMPLEMENTED_IF_MSG(comparison == PredicateComparisonTable.end(),                           "Unknown predicate comparison operation"); -    const Node predicate = Operation(comparison->second, NO_PRECISE, op_a, op_b); - -    return predicate; +    return Operation(comparison->second, NO_PRECISE, std::move(op_a), std::move(op_b));  }  OperationCode ShaderIR::GetPredicateCombiner(PredOperation operation) { @@ -349,31 +362,32 @@ Node ShaderIR::GetConditionCode(Tegra::Shader::ConditionCode cc) {  }  void ShaderIR::SetRegister(NodeBlock& bb, Register dest, Node src) { -    bb.push_back(Operation(OperationCode::Assign, GetRegister(dest), src)); +    
bb.push_back(Operation(OperationCode::Assign, GetRegister(dest), std::move(src)));  }  void ShaderIR::SetPredicate(NodeBlock& bb, u64 dest, Node src) { -    bb.push_back(Operation(OperationCode::LogicalAssign, GetPredicate(dest), src)); +    bb.push_back(Operation(OperationCode::LogicalAssign, GetPredicate(dest), std::move(src)));  }  void ShaderIR::SetInternalFlag(NodeBlock& bb, InternalFlag flag, Node value) { -    bb.push_back(Operation(OperationCode::LogicalAssign, GetInternalFlag(flag), value)); +    bb.push_back(Operation(OperationCode::LogicalAssign, GetInternalFlag(flag), std::move(value)));  }  void ShaderIR::SetLocalMemory(NodeBlock& bb, Node address, Node value) { -    bb.push_back(Operation(OperationCode::Assign, GetLocalMemory(address), value)); +    bb.push_back( +        Operation(OperationCode::Assign, GetLocalMemory(std::move(address)), std::move(value)));  } -void ShaderIR::SetTemporal(NodeBlock& bb, u32 id, Node value) { -    SetRegister(bb, Register::ZeroIndex + 1 + id, value); +void ShaderIR::SetTemporary(NodeBlock& bb, u32 id, Node value) { +    SetRegister(bb, Register::ZeroIndex + 1 + id, std::move(value));  }  void ShaderIR::SetInternalFlagsFromFloat(NodeBlock& bb, Node value, bool sets_cc) {      if (!sets_cc) {          return;      } -    const Node zerop = Operation(OperationCode::LogicalFEqual, value, Immediate(0.0f)); -    SetInternalFlag(bb, InternalFlag::Zero, zerop); +    Node zerop = Operation(OperationCode::LogicalFEqual, std::move(value), Immediate(0.0f)); +    SetInternalFlag(bb, InternalFlag::Zero, std::move(zerop));      LOG_WARNING(HW_GPU, "Condition codes implementation is incomplete");  } @@ -381,14 +395,14 @@ void ShaderIR::SetInternalFlagsFromInteger(NodeBlock& bb, Node value, bool sets_      if (!sets_cc) {          return;      } -    const Node zerop = Operation(OperationCode::LogicalIEqual, value, Immediate(0)); -    SetInternalFlag(bb, InternalFlag::Zero, zerop); +    Node zerop = 
Operation(OperationCode::LogicalIEqual, std::move(value), Immediate(0)); +    SetInternalFlag(bb, InternalFlag::Zero, std::move(zerop));      LOG_WARNING(HW_GPU, "Condition codes implementation is incomplete");  }  Node ShaderIR::BitfieldExtract(Node value, u32 offset, u32 bits) { -    return Operation(OperationCode::UBitfieldExtract, NO_PRECISE, value, Immediate(offset), -                     Immediate(bits)); +    return Operation(OperationCode::UBitfieldExtract, NO_PRECISE, std::move(value), +                     Immediate(offset), Immediate(bits));  }  } // namespace VideoCommon::Shader diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h index 03c888def..59a083d90 100644 --- a/src/video_core/shader/shader_ir.h +++ b/src/video_core/shader/shader_ir.h @@ -5,13 +5,10 @@  #pragma once  #include <array> -#include <cstring>  #include <map>  #include <optional>  #include <set> -#include <string>  #include <tuple> -#include <variant>  #include <vector>  #include "common/common_types.h" @@ -210,8 +207,8 @@ private:      Node GetInternalFlag(InternalFlag flag, bool negated = false);      /// Generates a node representing a local memory address      Node GetLocalMemory(Node address); -    /// Generates a temporal, internally it uses a post-RZ register -    Node GetTemporal(u32 id); +    /// Generates a temporary, internally it uses a post-RZ register +    Node GetTemporary(u32 id);      /// Sets a register. src value must be a number-evaluated node.      void SetRegister(NodeBlock& bb, Tegra::Shader::Register dest, Node src); @@ -221,8 +218,8 @@ private:      void SetInternalFlag(NodeBlock& bb, InternalFlag flag, Node value);      /// Sets a local memory address. address and value must be a number-evaluated node      void SetLocalMemory(NodeBlock& bb, Node address, Node value); -    /// Sets a temporal. Internally it uses a post-RZ register -    void SetTemporal(NodeBlock& bb, u32 id, Node value); +    /// Sets a temporary. 
Internally it uses a post-RZ register +    void SetTemporary(NodeBlock& bb, u32 id, Node value);      /// Sets internal flags from a float      void SetInternalFlagsFromFloat(NodeBlock& bb, Node value, bool sets_cc = true); @@ -328,7 +325,7 @@ private:      void WriteLop3Instruction(NodeBlock& bb, Tegra::Shader::Register dest, Node op_a, Node op_b,                                Node op_c, Node imm_lut, bool sets_cc); -    Node TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const; +    std::tuple<Node, u32, u32> TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const;      std::optional<u32> TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) const; diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp index fc957d980..a53e02253 100644 --- a/src/video_core/shader/track.cpp +++ b/src/video_core/shader/track.cpp @@ -15,56 +15,63 @@ namespace {  std::pair<Node, s64> FindOperation(const NodeBlock& code, s64 cursor,                                     OperationCode operation_code) {      for (; cursor >= 0; --cursor) { -        const Node node = code.at(cursor); +        Node node = code.at(cursor); +          if (const auto operation = std::get_if<OperationNode>(&*node)) {              if (operation->GetCode() == operation_code) { -                return {node, cursor}; +                return {std::move(node), cursor};              }          } +          if (const auto conditional = std::get_if<ConditionalNode>(&*node)) {              const auto& conditional_code = conditional->GetCode(); -            const auto [found, internal_cursor] = FindOperation( +            auto [found, internal_cursor] = FindOperation(                  conditional_code, static_cast<s64>(conditional_code.size() - 1), operation_code);              if (found) { -                return {found, cursor}; +                return {std::move(found), cursor};              }          }      }      return {};  } -} // namespace +} // Anonymous 
namespace -Node ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const { +std::tuple<Node, u32, u32> ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code, +                                               s64 cursor) const {      if (const auto cbuf = std::get_if<CbufNode>(&*tracked)) { -        // Cbuf found, but it has to be immediate -        return std::holds_alternative<ImmediateNode>(*cbuf->GetOffset()) ? tracked : nullptr; +        // Constant buffer found, test if it's an immediate +        const auto offset = cbuf->GetOffset(); +        if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) { +            return {tracked, cbuf->GetIndex(), immediate->GetValue()}; +        } +        return {};      }      if (const auto gpr = std::get_if<GprNode>(&*tracked)) {          if (gpr->GetIndex() == Tegra::Shader::Register::ZeroIndex) { -            return nullptr; +            return {};          }          // Reduce the cursor in one to avoid infinite loops when the instruction sets the same          // register that it uses as operand          const auto [source, new_cursor] = TrackRegister(gpr, code, cursor - 1);          if (!source) { -            return nullptr; +            return {};          }          return TrackCbuf(source, code, new_cursor);      }      if (const auto operation = std::get_if<OperationNode>(&*tracked)) {          for (std::size_t i = 0; i < operation->GetOperandsCount(); ++i) { -            if (const auto found = TrackCbuf((*operation)[i], code, cursor)) { -                // Cbuf found in operand +            if (auto found = TrackCbuf((*operation)[i], code, cursor); std::get<0>(found)) { +                // Cbuf found in operand.                  
return found;              }          } -        return nullptr; +        return {};      }      if (const auto conditional = std::get_if<ConditionalNode>(&*tracked)) {          const auto& conditional_code = conditional->GetCode();          return TrackCbuf(tracked, conditional_code, static_cast<s64>(conditional_code.size()));      } -    return nullptr; +    return {};  }  std::optional<u32> ShaderIR::TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) const { diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 7f9623c62..a3a3770a7 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -116,10 +116,10 @@ public:          std::lock_guard lock{mutex};          auto& maxwell3d = system.GPU().Maxwell3D(); -        if (!maxwell3d.dirty_flags.zeta_buffer) { +        if (!maxwell3d.dirty.depth_buffer) {              return depth_buffer.view;          } -        maxwell3d.dirty_flags.zeta_buffer = false; +        maxwell3d.dirty.depth_buffer = false;          const auto& regs{maxwell3d.regs};          const auto gpu_addr{regs.zeta.Address()}; @@ -145,10 +145,10 @@ public:          std::lock_guard lock{mutex};          ASSERT(index < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets);          auto& maxwell3d = system.GPU().Maxwell3D(); -        if (!maxwell3d.dirty_flags.color_buffer[index]) { +        if (!maxwell3d.dirty.render_target[index]) {              return render_targets[index].view;          } -        maxwell3d.dirty_flags.color_buffer.reset(index); +        maxwell3d.dirty.render_target[index] = false;          const auto& regs{maxwell3d.regs};          if (index >= regs.rt_control.count || regs.rt[index].Address() == 0 || @@ -274,10 +274,11 @@ protected:          auto& maxwell3d = system.GPU().Maxwell3D();          const u32 index = surface->GetRenderTarget();          if (index == DEPTH_RT) { -            
maxwell3d.dirty_flags.zeta_buffer = true; +            maxwell3d.dirty.depth_buffer = true;          } else { -            maxwell3d.dirty_flags.color_buffer.set(index, true); +            maxwell3d.dirty.render_target[index] = true;          } +        maxwell3d.dirty.render_settings = true;      }      void Register(TSurface surface) {  | 
