-rw-r--r--  src/shader_recompiler/backend/glasm/emit_glasm_context_get_set.cpp | 25
-rw-r--r--  src/video_core/macro/macro_jit_x64.cpp | 62
-rw-r--r--  src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp | 84
-rw-r--r--  src/video_core/renderer_vulkan/vk_staging_buffer_pool.h | 3
-rw-r--r--  src/web_service/web_backend.cpp | 3
5 files changed, 97 insertions, 80 deletions
diff --git a/src/shader_recompiler/backend/glasm/emit_glasm_context_get_set.cpp b/src/shader_recompiler/backend/glasm/emit_glasm_context_get_set.cpp
index b5c08d611..7e8f37563 100644
--- a/src/shader_recompiler/backend/glasm/emit_glasm_context_get_set.cpp
+++ b/src/shader_recompiler/backend/glasm/emit_glasm_context_get_set.cpp
@@ -13,9 +13,6 @@ namespace Shader::Backend::GLASM {
 namespace {
 void GetCbuf(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding, ScalarU32 offset,
              std::string_view size) {
-    if (!binding.IsImmediate()) {
-        throw NotImplementedException("Indirect constant buffer loading");
-    }
     const Register ret{ctx.reg_alloc.Define(inst)};
     if (offset.type == Type::U32) {
         // Avoid reading arrays out of bounds, matching hardware's behavior
@@ -24,7 +21,27 @@ void GetCbuf(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding, ScalarU
             return;
         }
     }
-    ctx.Add("LDC.{} {},c{}[{}];", size, ret, binding.U32(), offset);
+
+    if (binding.IsImmediate()) {
+        ctx.Add("LDC.{} {},c{}[{}];", size, ret, binding.U32(), offset);
+        return;
+    }
+
+    const ScalarU32 idx{ctx.reg_alloc.Consume(binding)};
+    for (u32 i = 0; i < Info::MAX_INDIRECT_CBUFS; i++) {
+        ctx.Add("SEQ.S.CC RC.x,{},{};"
+                "IF NE.x;"
+                "LDC.{} {},c{}[{}];",
+                idx, i, size, ret, i, offset);
+
+        if (i != Info::MAX_INDIRECT_CBUFS - 1) {
+            ctx.Add("ELSE;");
+        }
+    }
+
+    for (u32 i = 0; i < Info::MAX_INDIRECT_CBUFS; i++) {
+        ctx.Add("ENDIF;");
+    }
 }
 
 bool IsInputArray(Stage stage) {
diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp
index aca25d902..a302a9603 100644
--- a/src/video_core/macro/macro_jit_x64.cpp
+++ b/src/video_core/macro/macro_jit_x64.cpp
@@ -279,28 +279,13 @@ void MacroJITx64Impl::Compile_ExtractInsert(Macro::Opcode opcode) {
     auto dst = Compile_GetRegister(opcode.src_a, RESULT);
     auto src = Compile_GetRegister(opcode.src_b, eax);
 
-    if (opcode.bf_src_bit != 0 && opcode.bf_src_bit != 31) {
-        shr(src, opcode.bf_src_bit);
-    } else if (opcode.bf_src_bit == 31) {
-        xor_(src, src);
-    }
-    // Don't bother masking the whole register since we're using a 32 bit register
-    if (opcode.bf_size != 31 && opcode.bf_size != 0) {
-        and_(src, opcode.GetBitfieldMask());
-    } else if (opcode.bf_size == 0) {
-        xor_(src, src);
-    }
-    if (opcode.bf_dst_bit != 31 && opcode.bf_dst_bit != 0) {
-        shl(src, opcode.bf_dst_bit);
-    } else if (opcode.bf_dst_bit == 31) {
-        xor_(src, src);
-    }
-
     const u32 mask = ~(opcode.GetBitfieldMask() << opcode.bf_dst_bit);
-    if (mask != 0xffffffff) {
-        and_(dst, mask);
-    }
+    and_(dst, mask);
+    shr(src, opcode.bf_src_bit);
+    and_(src, opcode.GetBitfieldMask());
+    shl(src, opcode.bf_dst_bit);
     or_(dst, src);
+
     Compile_ProcessResult(opcode.result_operation, opcode.dst);
 }
 
@@ -309,17 +294,9 @@ void MacroJITx64Impl::Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode) {
     const auto src = Compile_GetRegister(opcode.src_b, RESULT);
 
     shr(src, dst.cvt8());
-    if (opcode.bf_size != 0 && opcode.bf_size != 31) {
-        and_(src, opcode.GetBitfieldMask());
-    } else if (opcode.bf_size == 0) {
-        xor_(src, src);
-    }
+    and_(src, opcode.GetBitfieldMask());
+    shl(src, opcode.bf_dst_bit);
 
-    if (opcode.bf_dst_bit != 0 && opcode.bf_dst_bit != 31) {
-        shl(src, opcode.bf_dst_bit);
-    } else if (opcode.bf_dst_bit == 31) {
-        xor_(src, src);
-    }
     Compile_ProcessResult(opcode.result_operation, opcode.dst);
 }
 
@@ -327,13 +304,8 @@ void MacroJITx64Impl::Compile_ExtractShiftLeftRegister(Macro::Opcode opcode) {
     const auto dst = Compile_GetRegister(opcode.src_a, ecx);
     const auto src = Compile_GetRegister(opcode.src_b, RESULT);
 
-    if (opcode.bf_src_bit != 0) {
-        shr(src, opcode.bf_src_bit);
-    }
-
-    if (opcode.bf_size != 31) {
-        and_(src, opcode.GetBitfieldMask());
-    }
+    shr(src, opcode.bf_src_bit);
+    and_(src, opcode.GetBitfieldMask());
     shl(src, dst.cvt8());
 
     Compile_ProcessResult(opcode.result_operation, opcode.dst);
@@ -429,17 +401,11 @@ void MacroJITx64Impl::Compile_Branch(Macro::Opcode opcode) {
             Xbyak::Label handle_post_exit{};
             Xbyak::Label skip{};
             jmp(skip, T_NEAR);
-            if (opcode.is_exit) {
-                L(handle_post_exit);
-                // Execute 1 instruction
-                mov(BRANCH_HOLDER, end_of_code);
-                // Jump to next instruction to skip delay slot check
-                jmp(labels[jump_address], T_NEAR);
-            } else {
-                L(handle_post_exit);
-                xor_(BRANCH_HOLDER, BRANCH_HOLDER);
-                jmp(labels[jump_address], T_NEAR);
-            }
+
+            L(handle_post_exit);
+            xor_(BRANCH_HOLDER, BRANCH_HOLDER);
+            jmp(labels[jump_address], T_NEAR);
+
             L(skip);
             mov(BRANCH_HOLDER, handle_post_exit);
             jmp(delay_skip[pc], T_NEAR);
diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp
index 06f68d09a..7fb256953 100644
--- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp
+++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp
@@ -26,20 +26,39 @@ using namespace Common::Literals;
 constexpr VkDeviceSize MAX_ALIGNMENT = 256;
 // Maximum size to put elements in the stream buffer
 constexpr VkDeviceSize MAX_STREAM_BUFFER_REQUEST_SIZE = 8_MiB;
-// Stream buffer size in bytes
-constexpr VkDeviceSize STREAM_BUFFER_SIZE = 128_MiB;
-constexpr VkDeviceSize REGION_SIZE = STREAM_BUFFER_SIZE / StagingBufferPool::NUM_SYNCS;
 
 constexpr VkMemoryPropertyFlags HOST_FLAGS =
     VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
 constexpr VkMemoryPropertyFlags STREAM_FLAGS = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | HOST_FLAGS;
 
-bool IsStreamHeap(VkMemoryHeap heap) noexcept {
-    return STREAM_BUFFER_SIZE < (heap.size * 2) / 3;
+static bool IsStreamHeap(VkMemoryHeap heap, size_t staging_buffer_size) noexcept {
+    return staging_buffer_size < (heap.size * 2) / 3;
+}
+
+static bool HasLargeDeviceLocalHostVisibleMemory(const VkPhysicalDeviceMemoryProperties& props) {
+    const auto flags{VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT};
+
+    for (u32 type_index = 0; type_index < props.memoryTypeCount; ++type_index) {
+        const auto& memory_type{props.memoryTypes[type_index]};
+
+        if ((memory_type.propertyFlags & flags) != flags) {
+            // Memory must be device local and host visible
+            continue;
+        }
+
+        const auto& heap{props.memoryHeaps[memory_type.heapIndex]};
+        if (heap.size >= 7168_MiB) {
+            // This is the right type of memory
+            return true;
+        }
+    }
+
+    return false;
 }
 
 std::optional<u32> FindMemoryTypeIndex(const VkPhysicalDeviceMemoryProperties& props, u32 type_mask,
-                                       VkMemoryPropertyFlags flags) noexcept {
+                                       VkMemoryPropertyFlags flags,
+                                       size_t staging_buffer_size) noexcept {
     for (u32 type_index = 0; type_index < props.memoryTypeCount; ++type_index) {
         if (((type_mask >> type_index) & 1) == 0) {
             // Memory type is incompatible
@@ -50,7 +69,7 @@ std::optional<u32> FindMemoryTypeIndex(const VkPhysicalDeviceMemoryProperties& p
             // Memory type doesn't have the flags we want
             continue;
         }
-        if (!IsStreamHeap(props.memoryHeaps[memory_type.heapIndex])) {
+        if (!IsStreamHeap(props.memoryHeaps[memory_type.heapIndex], staging_buffer_size)) {
             // Memory heap is not suitable for streaming
             continue;
         }
@@ -61,17 +80,17 @@ std::optional<u32> FindMemoryTypeIndex(const VkPhysicalDeviceMemoryProperties& p
 }
 
 u32 FindMemoryTypeIndex(const VkPhysicalDeviceMemoryProperties& props, u32 type_mask,
-                        bool try_device_local) {
+                        bool try_device_local, size_t staging_buffer_size) {
     std::optional<u32> type;
     if (try_device_local) {
         // Try to find a DEVICE_LOCAL_BIT type, Nvidia and AMD have a dedicated heap for this
-        type = FindMemoryTypeIndex(props, type_mask, STREAM_FLAGS);
+        type = FindMemoryTypeIndex(props, type_mask, STREAM_FLAGS, staging_buffer_size);
         if (type) {
             return *type;
         }
     }
     // Otherwise try without the DEVICE_LOCAL_BIT
-    type = FindMemoryTypeIndex(props, type_mask, HOST_FLAGS);
+    type = FindMemoryTypeIndex(props, type_mask, HOST_FLAGS, staging_buffer_size);
     if (type) {
         return *type;
     }
@@ -79,20 +98,32 @@ u32 FindMemoryTypeIndex(const VkPhysicalDeviceMemoryProperties& props, u32 type_
     throw vk::Exception(VK_ERROR_OUT_OF_DEVICE_MEMORY);
 }
 
-size_t Region(size_t iterator) noexcept {
-    return iterator / REGION_SIZE;
+size_t Region(size_t iterator, size_t region_size) noexcept {
+    return iterator / region_size;
 }
 } // Anonymous namespace
 
 StagingBufferPool::StagingBufferPool(const Device& device_, MemoryAllocator& memory_allocator_,
                                      Scheduler& scheduler_)
     : device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_} {
+
+    const auto memory_properties{device.GetPhysical().GetMemoryProperties().memoryProperties};
+    if (HasLargeDeviceLocalHostVisibleMemory(memory_properties)) {
+        // Possible on many integrated and newer discrete cards
+        staging_buffer_size = 1_GiB;
+    } else {
+        // Well-supported default size used by most Vulkan PC games
+        staging_buffer_size = 256_MiB;
+    }
+
+    region_size = staging_buffer_size / StagingBufferPool::NUM_SYNCS;
+
     const vk::Device& dev = device.GetLogical();
     stream_buffer = dev.CreateBuffer(VkBufferCreateInfo{
         .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
         .pNext = nullptr,
         .flags = 0,
-        .size = STREAM_BUFFER_SIZE,
+        .size = staging_buffer_size,
         .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT |
                  VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
         .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
@@ -117,19 +148,18 @@ StagingBufferPool::StagingBufferPool(const Device& device_, MemoryAllocator& mem
         .image = nullptr,
         .buffer = *stream_buffer,
     };
-    const auto memory_properties = device.GetPhysical().GetMemoryProperties().memoryProperties;
     VkMemoryAllocateInfo stream_memory_info{
         .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
        .pNext = make_dedicated ? &dedicated_info : nullptr,
         .allocationSize = requirements.size,
-        .memoryTypeIndex =
-            FindMemoryTypeIndex(memory_properties, requirements.memoryTypeBits, true),
+        .memoryTypeIndex = FindMemoryTypeIndex(memory_properties, requirements.memoryTypeBits, true,
+                                               staging_buffer_size),
     };
     stream_memory = dev.TryAllocateMemory(stream_memory_info);
     if (!stream_memory) {
         LOG_INFO(Render_Vulkan, "Dynamic memory allocation failed, trying with system memory");
-        stream_memory_info.memoryTypeIndex =
-            FindMemoryTypeIndex(memory_properties, requirements.memoryTypeBits, false);
+        stream_memory_info.memoryTypeIndex = FindMemoryTypeIndex(
+            memory_properties, requirements.memoryTypeBits, false, staging_buffer_size);
         stream_memory = dev.AllocateMemory(stream_memory_info);
     }
 
@@ -137,7 +167,7 @@ StagingBufferPool::StagingBufferPool(const Device& device_, MemoryAllocator& mem
         stream_memory.SetObjectNameEXT("Stream Buffer Memory");
     }
     stream_buffer.BindMemory(*stream_memory, 0);
-    stream_pointer = stream_memory.Map(0, STREAM_BUFFER_SIZE);
+    stream_pointer = stream_memory.Map(0, staging_buffer_size);
 }
 
 StagingBufferPool::~StagingBufferPool() = default;
@@ -158,25 +188,25 @@ void StagingBufferPool::TickFrame() {
 }
 
 StagingBufferRef StagingBufferPool::GetStreamBuffer(size_t size) {
-    if (AreRegionsActive(Region(free_iterator) + 1,
-                         std::min(Region(iterator + size) + 1, NUM_SYNCS))) {
+    if (AreRegionsActive(Region(free_iterator, region_size) + 1,
+                         std::min(Region(iterator + size, region_size) + 1, NUM_SYNCS))) {
         // Avoid waiting for the previous usages to be free
         return GetStagingBuffer(size, MemoryUsage::Upload);
     }
 
     const u64 current_tick = scheduler.CurrentTick();
-    std::fill(sync_ticks.begin() + Region(used_iterator), sync_ticks.begin() + Region(iterator),
-              current_tick);
+    std::fill(sync_ticks.begin() + Region(used_iterator, region_size),
+              sync_ticks.begin() + Region(iterator, region_size), current_tick);
     used_iterator = iterator;
     free_iterator = std::max(free_iterator, iterator + size);
 
-    if (iterator + size >= STREAM_BUFFER_SIZE) {
-        std::fill(sync_ticks.begin() + Region(used_iterator), sync_ticks.begin() + NUM_SYNCS,
-                  current_tick);
+    if (iterator + size >= staging_buffer_size) {
+        std::fill(sync_ticks.begin() + Region(used_iterator, region_size),
+                  sync_ticks.begin() + NUM_SYNCS, current_tick);
         used_iterator = 0;
         iterator = 0;
         free_iterator = size;
-        if (AreRegionsActive(0, Region(size) + 1)) {
+        if (AreRegionsActive(0, Region(size, region_size) + 1)) {
             // Avoid waiting for the previous usages to be free
             return GetStagingBuffer(size, MemoryUsage::Upload);
         }
diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h
index 91dc84da8..90c67177f 100644
--- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h
+++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h
@@ -93,6 +93,9 @@ private:
     size_t free_iterator = 0;
     std::array<u64, NUM_SYNCS> sync_ticks{};
 
+    size_t staging_buffer_size = 0;
+    size_t region_size = 0;
+
     StagingBuffersCache device_local_cache;
     StagingBuffersCache upload_cache;
     StagingBuffersCache download_cache;
diff --git a/src/web_service/web_backend.cpp b/src/web_service/web_backend.cpp
index 378804c08..12a7e4922 100644
--- a/src/web_service/web_backend.cpp
+++ b/src/web_service/web_backend.cpp
@@ -111,7 +111,8 @@ struct Client::Impl {
         httplib::Error error;
 
         if (!cli->send(request, response, error)) {
-            LOG_ERROR(WebService, "{} to {} returned null", method, host + path);
+            LOG_ERROR(WebService, "{} to {} returned null (httplib Error: {})", method, host + path,
+                      httplib::to_string(error));
            return WebResult{WebResult::Code::LibError, "Null response", ""};
         }
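
Note on the staging buffer change above: the diff replaces the fixed 128_MiB stream buffer with a size chosen at pool construction, based on whether the device exposes a DEVICE_LOCAL | HOST_VISIBLE heap of at least 7168_MiB. The standalone C++ sketch below condenses that heuristic for illustration only; it is not yuzu code as-is, the _MiB/_GiB literals stand in for yuzu's Common::Literals, and the NUM_SYNCS value here is an assumed placeholder for the pool's real sync-point count.

// Illustrative sketch: pick a stream buffer size from the device's memory heaps,
// mirroring HasLargeDeviceLocalHostVisibleMemory() and the constructor logic in the diff.
#include <vulkan/vulkan.h>

#include <cstddef>
#include <cstdint>

constexpr VkDeviceSize operator""_MiB(unsigned long long n) { return n << 20; }
constexpr VkDeviceSize operator""_GiB(unsigned long long n) { return n << 30; }

constexpr std::size_t NUM_SYNCS = 16; // assumed value for the pool's ring of sync regions

// True when some memory type that is both device-local and host-visible
// lives on a heap of at least 7168 MiB.
bool HasLargeDeviceLocalHostVisibleMemory(const VkPhysicalDeviceMemoryProperties& props) {
    const VkMemoryPropertyFlags wanted =
        VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
    for (uint32_t i = 0; i < props.memoryTypeCount; ++i) {
        if ((props.memoryTypes[i].propertyFlags & wanted) != wanted) {
            continue; // must be device local and host visible
        }
        if (props.memoryHeaps[props.memoryTypes[i].heapIndex].size >= 7168_MiB) {
            return true; // large enough heap of the right kind
        }
    }
    return false;
}

struct StreamBufferSizing {
    VkDeviceSize staging_buffer_size; // total size of the stream buffer
    VkDeviceSize region_size;         // one region per in-flight sync point
};

StreamBufferSizing ChooseStreamBufferSizing(const VkPhysicalDeviceMemoryProperties& props) {
    // 1 GiB on devices with a large device-local, host-visible heap (integrated GPUs and
    // newer discrete cards); otherwise fall back to a widely supported 256 MiB.
    const VkDeviceSize size = HasLargeDeviceLocalHostVisibleMemory(props) ? 1_GiB : 256_MiB;
    return {size, size / NUM_SYNCS};
}

The 7168_MiB threshold comes straight from the diff; it effectively selects GPUs that can afford a 1 GiB host-visible, device-local allocation. Deriving region_size from the chosen size keeps the NUM_SYNCS-slot ring arithmetic in GetStreamBuffer (Region(), used_iterator, free_iterator) unchanged apart from the runtime variables replacing the old compile-time constants.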
