Diffstat (limited to 'src/core')
-rw-r--r--  src/core/hle/kernel/scheduler.cpp                  | 66
-rw-r--r--  src/core/hle/kernel/scheduler.h                    | 69
-rw-r--r--  src/core/hle/kernel/svc.cpp                        | 38
-rw-r--r--  src/core/hle/kernel/thread.h                       |  1
-rw-r--r--  src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp  |  4
-rw-r--r--  src/core/hle/service/service.cpp                   | 14
-rw-r--r--  src/core/hle/service/service.h                     |  8
7 files changed, 181 insertions(+), 19 deletions(-)
diff --git a/src/core/hle/kernel/scheduler.cpp b/src/core/hle/kernel/scheduler.cpp
index 5a5f4cef1..df4d6cf0a 100644
--- a/src/core/hle/kernel/scheduler.cpp
+++ b/src/core/hle/kernel/scheduler.cpp
@@ -9,6 +9,7 @@
 #include "common/logging/log.h"
 #include "core/arm/arm_interface.h"
 #include "core/core.h"
+#include "core/core_cpu.h"
 #include "core/core_timing.h"
 #include "core/hle/kernel/kernel.h"
 #include "core/hle/kernel/process.h"
@@ -179,4 +180,69 @@ void Scheduler::SetThreadPriority(Thread* thread, u32 priority) {
         ready_queue.prepare(priority);
 }

+Thread* Scheduler::GetNextSuggestedThread(u32 core, u32 maximum_priority) const {
+    std::lock_guard<std::mutex> lock(scheduler_mutex);
+
+    const u32 mask = 1U << core;
+    return ready_queue.get_first_filter([mask, maximum_priority](Thread const* thread) {
+        return (thread->GetAffinityMask() & mask) != 0 && thread->GetPriority() < maximum_priority;
+    });
+}
+
+void Scheduler::YieldWithoutLoadBalancing(Thread* thread) {
+    ASSERT(thread != nullptr);
+    // Avoid yielding if the thread isn't even running.
+    ASSERT(thread->GetStatus() == ThreadStatus::Running);
+
+    // Sanity check that the priority is valid
+    ASSERT(thread->GetPriority() < THREADPRIO_COUNT);
+
+    // Yield this thread -- sleep for zero time and force reschedule to different thread
+    WaitCurrentThread_Sleep();
+    GetCurrentThread()->WakeAfterDelay(0);
+}
+
+void Scheduler::YieldWithLoadBalancing(Thread* thread) {
+    ASSERT(thread != nullptr);
+    const auto priority = thread->GetPriority();
+    const auto core = static_cast<u32>(thread->GetProcessorID());
+
+    // Avoid yielding if the thread isn't even running.
+    ASSERT(thread->GetStatus() == ThreadStatus::Running);
+
+    // Sanity check that the priority is valid
+    ASSERT(priority < THREADPRIO_COUNT);
+
+    // Sleep for zero time to be able to force reschedule to different thread
+    WaitCurrentThread_Sleep();
+    GetCurrentThread()->WakeAfterDelay(0);
+
+    Thread* suggested_thread = nullptr;
+
+    // Search through all of the cpu cores (except this one) for a suggested thread.
+    // Take the first non-nullptr one
+    for (unsigned cur_core = 0; cur_core < Core::NUM_CPU_CORES; ++cur_core) {
+        const auto res =
+            Core::System::GetInstance().CpuCore(cur_core).Scheduler().GetNextSuggestedThread(
+                core, priority);
+
+        // If scheduler provides a suggested thread
+        if (res != nullptr) {
+            // And its better than the current suggested thread (or is the first valid one)
+            if (suggested_thread == nullptr ||
+                suggested_thread->GetPriority() > res->GetPriority()) {
+                suggested_thread = res;
+            }
+        }
+    }
+
+    // If a suggested thread was found, queue that for this core
+    if (suggested_thread != nullptr)
+        suggested_thread->ChangeCore(core, suggested_thread->GetAffinityMask());
+}
+
+void Scheduler::YieldAndWaitForLoadBalancing(Thread* thread) {
+    UNIMPLEMENTED_MSG("Wait for load balancing thread yield type is not implemented!");
+}
+
 } // namespace Kernel
diff --git a/src/core/hle/kernel/scheduler.h b/src/core/hle/kernel/scheduler.h
index c63032b7d..97ced4dfc 100644
--- a/src/core/hle/kernel/scheduler.h
+++ b/src/core/hle/kernel/scheduler.h
@@ -51,6 +51,75 @@ public:
     /// Sets the priority of a thread in the scheduler
     void SetThreadPriority(Thread* thread, u32 priority);

+    /// Gets the next suggested thread for load balancing
+    Thread* GetNextSuggestedThread(u32 core, u32 minimum_priority) const;
+
+    /**
+     * YieldWithoutLoadBalancing -- analogous to normal yield on a system
+     * Moves the thread to the end of the ready queue for its priority, and then reschedules the
+     * system to the new head of the queue.
+     *
+     * Example (Single Core -- but can be extrapolated to multi):
+     * ready_queue[prio=0]: ThreadA, ThreadB, ThreadC (->exec order->)
+     * Currently Running: ThreadR
+     *
+     * ThreadR calls YieldWithoutLoadBalancing
+     *
+     * ThreadR is moved to the end of ready_queue[prio=0]:
+     * ready_queue[prio=0]: ThreadA, ThreadB, ThreadC, ThreadR (->exec order->)
+     * Currently Running: Nothing
+     *
+     * System is rescheduled (ThreadA is popped off of queue):
+     * ready_queue[prio=0]: ThreadB, ThreadC, ThreadR (->exec order->)
+     * Currently Running: ThreadA
+     *
+     * If the queue is empty at time of call, no yielding occurs. This does not cross between cores
+     * or priorities at all.
+     */
+    void YieldWithoutLoadBalancing(Thread* thread);
+
+    /**
+     * YieldWithLoadBalancing -- yield but with better selection of the new running thread
+     * Moves the current thread to the end of the ready queue for its priority, then selects a
+     * 'suggested thread' (a thread on a different core that could run on this core) from the
+     * scheduler, changes its core, and reschedules the current core to that thread.
+     *
+     * Example (Dual Core -- can be extrapolated to Quad Core, this is just normal yield if it were
+     * single core):
+     * ready_queue[core=0][prio=0]: ThreadA, ThreadB (affinities not pictured as irrelevant
+     * ready_queue[core=1][prio=0]: ThreadC[affinity=both], ThreadD[affinity=core1only]
+     * Currently Running: ThreadQ on Core 0 || ThreadP on Core 1
+     *
+     * ThreadQ calls YieldWithLoadBalancing
+     *
+     * ThreadQ is moved to the end of ready_queue[core=0][prio=0]:
+     * ready_queue[core=0][prio=0]: ThreadA, ThreadB
+     * ready_queue[core=1][prio=0]: ThreadC[affinity=both], ThreadD[affinity=core1only]
+     * Currently Running: ThreadQ on Core 0 || ThreadP on Core 1
+     *
+     * A list of suggested threads for each core is compiled
+     * Suggested Threads: {ThreadC on Core 1}
+     * If this were quad core (as the switch is), there could be between 0 and 3 threads in this
+     * list. If there are more than one, the thread is selected by highest prio.
+     *
+     * ThreadC is core changed to Core 0:
+     * ready_queue[core=0][prio=0]: ThreadC, ThreadA, ThreadB, ThreadQ
+     * ready_queue[core=1][prio=0]: ThreadD
+     * Currently Running: None on Core 0 || ThreadP on Core 1
+     *
+     * System is rescheduled (ThreadC is popped off of queue):
+     * ready_queue[core=0][prio=0]: ThreadA, ThreadB, ThreadQ
+     * ready_queue[core=1][prio=0]: ThreadD
+     * Currently Running: ThreadC on Core 0 || ThreadP on Core 1
+     *
+     * If no suggested threads can be found this will behave just as normal yield. If there are
+     * multiple candidates for the suggested thread on a core, the highest prio is taken.
+     */
+    void YieldWithLoadBalancing(Thread* thread);
+
+    /// Currently unknown -- asserts as unimplemented on call
+    void YieldAndWaitForLoadBalancing(Thread* thread);
+
     /// Returns a list of all threads managed by the scheduler
     const std::vector<SharedPtr<Thread>>& GetThreadList() const {
         return thread_list;
diff --git a/src/core/hle/kernel/svc.cpp b/src/core/hle/kernel/svc.cpp
index 5d36792ca..348a22904 100644
--- a/src/core/hle/kernel/svc.cpp
+++ b/src/core/hle/kernel/svc.cpp
@@ -1208,18 +1208,38 @@ static void ExitThread() {
 static void SleepThread(s64 nanoseconds) {
     LOG_TRACE(Kernel_SVC, "called nanoseconds={}", nanoseconds);

-    // Don't attempt to yield execution if there are no available threads to run,
-    // this way we avoid a useless reschedule to the idle thread.
-    if (nanoseconds == 0 && !Core::System::GetInstance().CurrentScheduler().HaveReadyThreads())
-        return;
+    enum class SleepType : s64 {
+        YieldWithoutLoadBalancing = 0,
+        YieldWithLoadBalancing = -1,
+        YieldAndWaitForLoadBalancing = -2,
+    };

-    // Sleep current thread and check for next thread to schedule
-    WaitCurrentThread_Sleep();
+    if (nanoseconds <= 0) {
+        auto& scheduler{Core::System::GetInstance().CurrentScheduler()};
+        switch (static_cast<SleepType>(nanoseconds)) {
+        case SleepType::YieldWithoutLoadBalancing:
+            scheduler.YieldWithoutLoadBalancing(GetCurrentThread());
+            break;
+        case SleepType::YieldWithLoadBalancing:
+            scheduler.YieldWithLoadBalancing(GetCurrentThread());
+            break;
+        case SleepType::YieldAndWaitForLoadBalancing:
+            scheduler.YieldAndWaitForLoadBalancing(GetCurrentThread());
+            break;
+        default:
+            UNREACHABLE_MSG("Unimplemented sleep yield type '{:016X}'!", nanoseconds);
+        }
+    } else {
+        // Sleep current thread and check for next thread to schedule
+        WaitCurrentThread_Sleep();

-    // Create an event to wake the thread up after the specified nanosecond delay has passed
-    GetCurrentThread()->WakeAfterDelay(nanoseconds);
+        // Create an event to wake the thread up after the specified nanosecond delay has passed
+        GetCurrentThread()->WakeAfterDelay(nanoseconds);
+    }

-    Core::System::GetInstance().PrepareReschedule();
+    // Reschedule all CPU cores
+    for (std::size_t i = 0; i < Core::NUM_CPU_CORES; ++i)
+        Core::System::GetInstance().CpuCore(i).PrepareReschedule();
 }

 /// Wait process wide key atomic
diff --git a/src/core/hle/kernel/thread.h b/src/core/hle/kernel/thread.h
index d384d50db..77aec099a 100644
--- a/src/core/hle/kernel/thread.h
+++ b/src/core/hle/kernel/thread.h
@@ -26,6 +26,7 @@ enum ThreadPriority : u32 {
     THREADPRIO_USERLAND_MAX = 24, ///< Highest thread priority for userland apps
     THREADPRIO_DEFAULT = 44,      ///< Default thread priority for userland apps
     THREADPRIO_LOWEST = 63,       ///< Lowest thread priority
+    THREADPRIO_COUNT = 64,        ///< Total number of possible thread priorities.
 };

 enum ThreadProcessorId : s32 {
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
index 3bfce0110..0a650f36c 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
@@ -137,6 +137,10 @@ u32 nvhost_gpu::AllocateObjectContext(const std::vector<u8>& input, std::vector<
 }

 static void PushGPUEntries(Tegra::CommandList&& entries) {
+    if (entries.empty()) {
+        return;
+    }
+
     auto& dma_pusher{Core::System::GetInstance().GPU().DmaPusher()};
     dma_pusher.Push(std::move(entries));
     dma_pusher.DispatchCalls();
diff --git a/src/core/hle/service/service.cpp b/src/core/hle/service/service.cpp
index d41df3732..d25b80ab0 100644
--- a/src/core/hle/service/service.cpp
+++ b/src/core/hle/service/service.cpp
@@ -97,29 +97,33 @@ ServiceFrameworkBase::ServiceFrameworkBase(const char* service_name, u32 max_ses
 ServiceFrameworkBase::~ServiceFrameworkBase() = default;

 void ServiceFrameworkBase::InstallAsService(SM::ServiceManager& service_manager) {
-    ASSERT(port == nullptr);
-    port = service_manager.RegisterService(service_name, max_sessions).Unwrap();
+    ASSERT(!port_installed);
+
+    auto port = service_manager.RegisterService(service_name, max_sessions).Unwrap();
     port->SetHleHandler(shared_from_this());
+    port_installed = true;
 }

 void ServiceFrameworkBase::InstallAsNamedPort() {
-    ASSERT(port == nullptr);
+    ASSERT(!port_installed);
     auto& kernel = Core::System::GetInstance().Kernel();
     auto [server_port, client_port] =
         Kernel::ServerPort::CreatePortPair(kernel, max_sessions, service_name);
     server_port->SetHleHandler(shared_from_this());
     kernel.AddNamedPort(service_name, std::move(client_port));
+    port_installed = true;
 }

 Kernel::SharedPtr<Kernel::ClientPort> ServiceFrameworkBase::CreatePort() {
-    ASSERT(port == nullptr);
+    ASSERT(!port_installed);
     auto& kernel = Core::System::GetInstance().Kernel();
     auto [server_port, client_port] =
         Kernel::ServerPort::CreatePortPair(kernel, max_sessions, service_name);
-    port = MakeResult(std::move(server_port)).Unwrap();
+    auto port = MakeResult(std::move(server_port)).Unwrap();
     port->SetHleHandler(shared_from_this());
+    port_installed = true;
     return client_port;
 }
diff --git a/src/core/hle/service/service.h b/src/core/hle/service/service.h
index 98483ecf1..029533628 100644
--- a/src/core/hle/service/service.h
+++ b/src/core/hle/service/service.h
@@ -96,11 +96,9 @@ private:
     /// Maximum number of concurrent sessions that this service can handle.
     u32 max_sessions;

-    /**
-     * Port where incoming connections will be received. Only created when InstallAsService() or
-     * InstallAsNamedPort() are called.
-     */
-    Kernel::SharedPtr<Kernel::ServerPort> port;
+    /// Flag to store if a port was already create/installed to detect multiple install attempts,
+    /// which is not supported.
+    bool port_installed = false;

     /// Function used to safely up-cast pointers to the derived class before invoking a handler.
     InvokerFn* handler_invoker;
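The yield type 0 path is easiest to see as plain queue bookkeeping. The snippet below is an illustrative, self-contained sketch of the ready-queue rotation that the YieldWithoutLoadBalancing comment in scheduler.h describes; the deque and thread names are stand-ins, not yuzu types.

#include <deque>
#include <iostream>
#include <string>

// Toy illustration of YieldWithoutLoadBalancing: the running thread joins the
// back of its priority's ready queue and the new head is scheduled next.
int main() {
    std::deque<std::string> ready_prio0{"ThreadA", "ThreadB", "ThreadC"};
    std::string running = "ThreadR";

    // ThreadR calls YieldWithoutLoadBalancing: it moves to the back of the queue...
    ready_prio0.push_back(running);

    // ...and the scheduler picks the new head of the same queue.
    running = ready_prio0.front();
    ready_prio0.pop_front();

    std::cout << "now running: " << running << '\n'; // ThreadA
    for (const auto& t : ready_prio0) {
        std::cout << "ready: " << t << '\n'; // ThreadB, ThreadC, ThreadR
    }
}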
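Yield type -1 additionally asks every core's scheduler for a "suggested thread" and migrates the best candidate. The toy model below mirrors that selection (affinity-mask filter plus strictly better priority, where a lower value means higher priority); ToyThread, ReadyQueue, NextSuggested and PickSuggested are hypothetical names, and unlike the loop in the change this sketch skips the yielding core for clarity.

#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <optional>
#include <string>
#include <vector>

constexpr std::size_t NumCores = 4;

struct ToyThread {
    std::string name;
    uint32_t priority;      // lower value = higher priority, as on the Switch
    uint32_t affinity_mask; // bit n set => may run on core n
};

// One ready list per core, ordered by arrival for simplicity.
using ReadyQueue = std::vector<ToyThread>;

// Rough analogue of GetNextSuggestedThread: first ready thread on this queue
// that may run on target_core and has a strictly better priority than max_priority.
std::optional<ToyThread> NextSuggested(const ReadyQueue& queue, uint32_t target_core,
                                       uint32_t max_priority) {
    const uint32_t mask = 1U << target_core;
    for (const auto& t : queue) {
        if ((t.affinity_mask & mask) != 0 && t.priority < max_priority) {
            return t;
        }
    }
    return std::nullopt;
}

// Rough analogue of the cross-core search in YieldWithLoadBalancing: collect one
// suggestion per core and keep the highest-priority candidate.
std::optional<ToyThread> PickSuggested(const std::array<ReadyQueue, NumCores>& cores,
                                       uint32_t yielding_core, uint32_t yielding_priority) {
    std::optional<ToyThread> best;
    for (std::size_t core = 0; core < NumCores; ++core) {
        if (core == yielding_core) {
            continue; // sketch only: the actual change queries every core's scheduler
        }
        auto candidate = NextSuggested(cores[core], yielding_core, yielding_priority);
        if (candidate && (!best || candidate->priority < best->priority)) {
            best = candidate;
        }
    }
    return best;
}

int main() {
    std::array<ReadyQueue, NumCores> cores{};
    cores[1] = {{"ThreadC", 30, 0b0011}, {"ThreadD", 28, 0b0010}}; // ThreadD cannot run on core 0
    cores[2] = {{"ThreadE", 40, 0b0101}};

    // A priority-44 thread yields on core 0; ThreadC (prio 30) is the best candidate.
    if (auto pick = PickSuggested(cores, 0, 44)) {
        std::printf("suggested: %s (prio %u)\n", pick->name.c_str(),
                    static_cast<unsigned>(pick->priority));
    }
}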
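From the guest's point of view the new behaviour is selected purely through the value passed to svcSleepThread: positive values still sleep, while 0, -1 and -2 request the three yield types. The stand-alone sketch below only mirrors that dispatch and prints what each path would do; it is illustrative and not part of the change.

#include <cstdint>
#include <cstdio>

// Mirrors the special values the reworked SleepThread recognises.
enum class SleepType : int64_t {
    YieldWithoutLoadBalancing = 0,
    YieldWithLoadBalancing = -1,
    YieldAndWaitForLoadBalancing = -2,
};

// Stand-in for the SVC handler: dispatches like the new svc.cpp switch,
// but only prints what a real scheduler would do.
void SleepThread(int64_t nanoseconds) {
    if (nanoseconds <= 0) {
        switch (static_cast<SleepType>(nanoseconds)) {
        case SleepType::YieldWithoutLoadBalancing:
            std::puts("yield: move to the back of this core's ready queue");
            break;
        case SleepType::YieldWithLoadBalancing:
            std::puts("yield: also try to pull a suggested thread from another core");
            break;
        case SleepType::YieldAndWaitForLoadBalancing:
            std::puts("yield: wait for load balancing (unimplemented in this change)");
            break;
        default:
            std::puts("unexpected negative sleep value");
        }
    } else {
        std::printf("sleep for %lld ns, then reschedule\n", static_cast<long long>(nanoseconds));
    }
}

int main() {
    SleepThread(1'000'000); // ordinary timed sleep
    SleepThread(0);         // plain yield
    SleepThread(-1);        // yield with load balancing
    SleepThread(-2);        // yield and wait for load balancing
}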
