[kernel] Schedule threads on other CPUs

Now that the other CPUs have been brought up, add support for scheduling tasks on them. The scheduler now maintains separate ready/blocked lists per CPU, and CPUs will attempt to balance load via periodic work stealing. Other changes as a result of this: - The device manager no longer creates a local APIC object, but instead just gathers relevant info from the APCI tables. Each CPU creates its own local APIC object. This also spurred the APIC timer calibration to become a static value, as all APICs are assumed to be symmetrical. - Fixed a bug where the scheduler was popping the current task off of its ready list, however the current task is never on the ready list (except the idle task was first set up as both current and ready). This was causing the lists to get into bad states. Now a task can only ever be current or in a ready or blocked list. - Got rid of the unused static process::s_processes list of all processes, instead of trying to synchronize it via locks. - Added spinlocks for synchronization to the scheduler and logger objects.
2025-12-10 00:14:32 -08:00 · 2021-02-15 12:56:22 -08:00
parent 2a347942bc
commit f0025dbc47
17 changed files with 337 additions and 220 deletions
--- a/src/kernel/scheduler.cpp
+++ b/src/kernel/scheduler.cpp
@@ -17,6 +17,7 @@
 #include "objects/channel.h"
 #include "objects/process.h"
 #include "objects/system.h"
+#include "objects/thread.h"
 #include "objects/vm_area.h"
 #include "scheduler.h"

@@ -25,43 +26,37 @@

 #include "kutil/assert.h"

-
+extern "C" void task_switch(TCB *tcb);
 scheduler *scheduler::s_instance = nullptr;

-const uint64_t rflags_noint = 0x002;
-const uint64_t rflags_int = 0x202;
-
-extern uint64_t idle_stack_end;
-
-extern "C" void task_switch(TCB *tcb);
-
-scheduler::scheduler(lapic &apic) :
-	m_apic(apic),
-	m_next_pid(1),
-	m_clock(0),
-	m_last_promotion(0)
+struct run_queue
 {
-	kassert(!s_instance, "Multiple schedulers created!");
-	s_instance = this;
+	tcb_node *current = nullptr;
+	tcb_list ready[scheduler::num_priorities];
+	tcb_list blocked;

-	process *kp = &process::kernel_process();
+	uint64_t last_promotion = 0;
+	uint64_t last_steal = 0;
+	kutil::spinlock lock;
+};

-	log::debug(logs::task, "Kernel process koid %llx", kp->koid());
+scheduler::scheduler(unsigned cpus) :
+	m_next_pid {1},
+	m_clock {0}
+{
+	kassert(!s_instance, "Created multiple schedulers!");
+	if (!s_instance)
+		s_instance = this;

-	thread *idle = thread::create_idle_thread(*kp, max_priority,
-		reinterpret_cast<uintptr_t>(&idle_stack_end));
+	m_run_queues.set_size(cpus);
+}

-	log::debug(logs::task, "Idle thread koid %llx", idle->koid());
-
-	auto *tcb = idle->tcb();
-	m_runlists[max_priority].push_back(tcb);
-	m_current = tcb;
-
-	cpu_data &cpu = current_cpu();
-	cpu.rsp0 = tcb->rsp0;
-	cpu.tcb = tcb;
-	cpu.process = kp;
-	cpu.thread = idle;
+scheduler::~scheduler()
+{
+	// Not truly necessary - if the scheduler is going away, the whole
+	// system is probably going down. But let's be clean.
+	if (s_instance == this)
+		s_instance = nullptr;
 }

 template <typename T>
@@ -72,20 +67,6 @@ inline T * push(uintptr_t &rsp, size_t size = sizeof(T)) {
 	return p;
 }

-thread *
-scheduler::create_process(bool user)
-{
-	process *p = new process;
-	thread *th = p->create_thread(default_priority, user);
-
-	TCB *tcb = th->tcb();
-	log::debug(logs::task, "Creating thread %llx, priority %d, time slice %d",
-			th->koid(), tcb->priority, tcb->time_left);
-
-	th->set_state(thread::state::ready);
-	return th;
-}
-
 void
 scheduler::create_kernel_task(void (*task)(), uint8_t priority, bool constant)
 {
@@ -115,24 +96,42 @@ scheduler::quantum(int priority)
 void
 scheduler::start()
 {
-	log::info(logs::sched, "Starting scheduler.");
-	m_apic.enable_timer(isr::isrTimer, false);
-	m_apic.reset_timer(10);
+	cpu_data &cpu = current_cpu();
+	run_queue &queue = m_run_queues[cpu.index];
+	kutil::scoped_lock lock {queue.lock};
+
+	process *kp = &process::kernel_process();
+	thread *idle = thread::create_idle_thread(*kp, max_priority, cpu.rsp0);
+	log::debug(logs::task, "CPU%02x idle thread koid %llx", cpu.index, idle->koid());
+
+	auto *tcb = idle->tcb();
+	cpu.process = kp;
+	cpu.thread = idle;
+	cpu.tcb = tcb;
+
+	queue.current = tcb;
+
+	log::info(logs::sched, "CPU%02x starting scheduler", cpu.index);
+	cpu.apic->enable_timer(isr::isrTimer, false);
+	cpu.apic->reset_timer(10);
 }

 void
 scheduler::add_thread(TCB *t)
 {
-	m_blocked.push_back(static_cast<tcb_node*>(t));
-	t->time_left = quantum(t->priority);
+	cpu_data &cpu = current_cpu();
+	run_queue &queue = m_run_queues[cpu.index];
+	kutil::scoped_lock lock {queue.lock};

+	queue.blocked.push_back(static_cast<tcb_node*>(t));
+	t->time_left = quantum(t->priority);
 }

-void scheduler::prune(uint64_t now)
+void scheduler::prune(run_queue &queue, uint64_t now)
 {
 	// Find processes that are ready or have exited and
 	// move them to the appropriate lists.
-	auto *tcb = m_blocked.front();
+	auto *tcb = queue.blocked.front();
 	while (tcb) {
 		thread *th = thread::from_tcb(tcb);
 		uint8_t priority = tcb->priority;
@@ -140,7 +139,7 @@ void scheduler::prune(uint64_t now)
 		bool ready = th->has_state(thread::state::ready);
 		bool exited = th->has_state(thread::state::exited);
 		bool constant = th->has_state(thread::state::constant);
-		bool current = tcb == m_current;
+		bool current = tcb == queue.current;

 		ready |= th->wake_on_time(now);

@@ -155,7 +154,7 @@ void scheduler::prune(uint64_t now)
 			// page tables
 			if (current) continue;

-			m_blocked.remove(remove);
+			queue.blocked.remove(remove);
 			process &p = th->parent();

 			// thread_exited deletes the thread, and returns true if the process
@@ -163,19 +162,19 @@ void scheduler::prune(uint64_t now)
 			if(!current && p.thread_exited(th))
 				delete &p;
 		} else {
-			m_blocked.remove(remove);
+			queue.blocked.remove(remove);
 			log::debug(logs::sched, "Prune: readying unblocked thread %llx", th->koid());
-			m_runlists[remove->priority].push_back(remove);
+			queue.ready[remove->priority].push_back(remove);
 		}
 	}
 }

 void
-scheduler::check_promotions(uint64_t now)
+scheduler::check_promotions(run_queue &queue, uint64_t now)
 {
-	for (auto &pri_list : m_runlists) {
+	for (auto &pri_list : queue.ready) {
 		for (auto *tcb : pri_list) {
-			const thread *th = thread::from_tcb(m_current);
+			const thread *th = thread::from_tcb(queue.current);
 			const bool constant = th->has_state(thread::state::constant);
 			if (constant)
 				continue;
@@ -190,81 +189,145 @@ scheduler::check_promotions(uint64_t now)

 			if (stale) {
 				// If the thread is stale, promote it
-				m_runlists[priority].remove(tcb);
+				queue.ready[priority].remove(tcb);
 				tcb->priority -= 1;
 				tcb->time_left = quantum(tcb->priority);
-				m_runlists[tcb->priority].push_back(tcb);
+				queue.ready[tcb->priority].push_back(tcb);
 				log::info(logs::sched, "Scheduler promoting thread %llx, priority %d",
 						th->koid(), tcb->priority);
 			}
 		}
 	}

-	m_last_promotion = now;
+	queue.last_promotion = now;
+}
+
+static size_t
+balance_lists(tcb_list &to, tcb_list &from)
+{
+	size_t to_len = to.length();
+	size_t from_len = from.length();
+
+	// Only steal from the rich, don't be Dennis Moore
+	if (from_len <= to_len)
+		return 0;
+
+	size_t steal = (from_len - to_len) / 2;
+	for (size_t i = 0; i < steal; ++i)
+		to.push_front(from.pop_front());
+	return steal;
+}
+
+void
+scheduler::steal_work(cpu_data &cpu)
+{
+	// First grab a scheduler-wide lock to avoid deadlock
+	kutil::scoped_lock steal_lock {m_steal_lock};
+
+	// Lock this cpu's queue for the whole time while we modify it
+	run_queue &my_queue = m_run_queues[cpu.index];
+	kutil::scoped_lock my_queue_lock {my_queue.lock};
+
+	const unsigned count = m_run_queues.count();
+	for (unsigned i = 0; i < count; ++i) {
+		if (i == cpu.index) continue;
+
+		run_queue &other_queue = m_run_queues[i];
+		kutil::scoped_lock other_queue_lock {other_queue.lock};
+
+		size_t stolen = 0;
+
+		// Don't steal from max_priority, that's the idle thread
+		for (unsigned pri = 0; pri < max_priority; ++pri)
+			stolen += balance_lists(my_queue.ready[pri], other_queue.ready[pri]);
+
+		stolen += balance_lists(my_queue.blocked, other_queue.blocked);
+
+		if (stolen)
+			log::debug(logs::sched, "CPU%02x stole %2d tasks from CPU%02x",
+					cpu.index, stolen, i);
+	}
 }

 void
 scheduler::schedule()
 {
-	uint8_t priority = m_current->priority;
-	uint32_t remaining = m_apic.stop_timer();
-	m_current->time_left = remaining;
-	thread *th = thread::from_tcb(m_current);
+	cpu_data &cpu = current_cpu();
+	run_queue &queue = m_run_queues[cpu.index];
+	lapic &apic = *cpu.apic;
+	uint32_t remaining = apic.stop_timer();
+
+	if (m_clock - queue.last_steal > steal_frequency) {
+		steal_work(cpu);
+		queue.last_steal = m_clock;
+	}
+
+	// We need to explicitly lock/unlock here instead of
+	// using a scoped lock, because the scope doesn't "end"
+	// for the current thread until it gets scheduled again
+	kutil::spinlock::waiter waiter;
+	queue.lock.acquire(&waiter);
+
+	queue.current->time_left = remaining;
+	thread *th = thread::from_tcb(queue.current);
+	uint8_t priority = queue.current->priority;
 	const bool constant = th->has_state(thread::state::constant);

 	if (remaining == 0) {
 		if (priority < max_priority && !constant) {
 			// Process used its whole timeslice, demote it
-			++m_current->priority;
+			++queue.current->priority;
 			log::debug(logs::sched, "Scheduler  demoting thread %llx, priority %d",
-					th->koid(), m_current->priority);
+					th->koid(), queue.current->priority);
 		}
-		m_current->time_left = quantum(m_current->priority);
+		queue.current->time_left = quantum(queue.current->priority);
 	} else if (remaining > 0) {
 		// Process gave up CPU, give it a small bonus to its
 		// remaining timeslice.
 		uint32_t bonus = quantum(priority) >> 4;
-		m_current->time_left += bonus;
+		queue.current->time_left += bonus;
 	}

-	m_runlists[priority].remove(m_current);
 	if (th->has_state(thread::state::ready)) {
-		m_runlists[m_current->priority].push_back(m_current);
+		queue.ready[queue.current->priority].push_back(queue.current);
 	} else {
-		m_blocked.push_back(m_current);
+		queue.blocked.push_back(queue.current);
 	}

 	clock::get().update();
-	prune(++m_clock);
-	if (m_clock - m_last_promotion > promote_frequency)
-		check_promotions(m_clock);
+	prune(queue, ++m_clock);
+	if (m_clock - queue.last_promotion > promote_frequency)
+		check_promotions(queue, m_clock);

 	priority = 0;
-	while (m_runlists[priority].empty()) {
+	while (queue.ready[priority].empty()) {
 		++priority;
 		kassert(priority < num_priorities, "All runlists are empty");
 	}

-	m_current->last_ran = m_clock;
+	queue.current->last_ran = m_clock;

-	auto *next = m_runlists[priority].pop_front();
+	auto *next = queue.ready[priority].pop_front();
 	next->last_ran = m_clock;
-	m_apic.reset_timer(next->time_left);
+	apic.reset_timer(next->time_left);

-	if (next != m_current) {
-		thread *next_thread = thread::from_tcb(next);
-
-		cpu_data &cpu = current_cpu();
-		cpu.thread = next_thread;
-		cpu.process = &next_thread->parent();
-		m_current = next;
-
-		log::debug(logs::sched, "Scheduler switching threads %llx->%llx",
-				th->koid(), next_thread->koid());
-		log::debug(logs::sched, "    priority %d time left %d @ %lld.",
-				m_current->priority, m_current->time_left, m_clock);
-		log::debug(logs::sched, "    PML4 %llx", m_current->pml4);
-
-		task_switch(m_current);
+	if (next == queue.current) {
+		queue.lock.release(&waiter);
+		return;
 	}
+
+	thread *next_thread = thread::from_tcb(next);
+
+	cpu.thread = next_thread;
+	cpu.process = &next_thread->parent();
+	queue.current = next;
+
+	log::debug(logs::sched, "CPU%02x switching threads %llx->%llx",
+			cpu.index, th->koid(), next_thread->koid());
+	log::debug(logs::sched, "    priority %d time left %d @ %lld.",
+			next->priority, next->time_left, m_clock);
+	log::debug(logs::sched, "    PML4 %llx", next->pml4);
+
+	queue.lock.release(&waiter);
+	task_switch(queue.current);
 }