[kernel] Schedule threads on other CPUs

Now that the other CPUs have been brought up, add support for scheduling tasks on them. The scheduler now maintains separate ready/blocked lists per CPU, and CPUs will attempt to balance load via periodic work stealing. Other changes as a result of this: - The device manager no longer creates a local APIC object, but instead just gathers relevant info from the APCI tables. Each CPU creates its own local APIC object. This also spurred the APIC timer calibration to become a static value, as all APICs are assumed to be symmetrical. - Fixed a bug where the scheduler was popping the current task off of its ready list, however the current task is never on the ready list (except the idle task was first set up as both current and ready). This was causing the lists to get into bad states. Now a task can only ever be current or in a ready or blocked list. - Got rid of the unused static process::s_processes list of all processes, instead of trying to synchronize it via locks. - Added spinlocks for synchronization to the scheduler and logger objects.
2025-12-10 00:14:32 -08:00 · 2021-02-15 12:56:22 -08:00
parent 2a347942bc
commit f0025dbc47
17 changed files with 337 additions and 220 deletions
--- a/src/kernel/ap_startup.s
+++ b/src/kernel/ap_startup.s
@@ -137,3 +137,13 @@ init_ap_trampoline:

 	pop rbp
 	ret
+
+extern long_ap_startup
+global ap_idle
+ap_idle:
+	call long_ap_startup
+	sti
+.hang:
+	hlt
+	jmp .hang
+
--- a/src/kernel/apic.cpp
+++ b/src/kernel/apic.cpp
@@ -6,6 +6,8 @@
 #include "kernel_memory.h"
 #include "log.h"

+uint64_t lapic::s_ticks_per_us = 0;
+
 static constexpr uint16_t lapic_id         = 0x0020;
 static constexpr uint16_t lapic_spurious   = 0x00f0;

@@ -54,12 +56,12 @@ apic::apic(uintptr_t base) :
 }


-lapic::lapic(uintptr_t base, isr spurious) :
+lapic::lapic(uintptr_t base) :
 	apic(base),
 	m_divisor(0)
 {
 	apic_write(m_base, lapic_lvt_error, static_cast<uint32_t>(isr::isrAPICError));
-	apic_write(m_base, lapic_spurious, static_cast<uint32_t>(spurious));
+	apic_write(m_base, lapic_spurious, static_cast<uint32_t>(isr::isrSpurious));
 	log::info(logs::apic, "LAPIC created, base %lx", m_base);
 }

@@ -122,10 +124,10 @@ lapic::calibrate_timer()
 	clock::get().spinwait(us);

 	uint32_t remaining = apic_read(m_base, lapic_timer_cur);
-	uint32_t ticks_total = initial - remaining;
-	m_ticks_per_us = ticks_total / us;
+	uint64_t ticks_total = initial - remaining;
+	s_ticks_per_us = ticks_total / us;

-	log::info(logs::apic, "APIC timer ticks %d times per microsecond.", m_ticks_per_us);
+	log::info(logs::apic, "APIC timer ticks %d times per microsecond.", s_ticks_per_us);

 	interrupts_enable();
 }
@@ -145,7 +147,7 @@ lapic::set_divisor(uint8_t divisor)
 	case  64: divbits = 0x9; break;
 	case 128: divbits = 0xa; break;
 	default:
-		kassert(0, "Invalid divisor passed to lapic::enable_timer");
+		kassert(0, "Invalid divisor passed to lapic::set_divisor");
 	}

 	apic_write(m_base, lapic_timer_div, divbits);
--- a/src/kernel/apic.h
+++ b/src/kernel/apic.h
@@ -43,8 +43,7 @@ class lapic :
 public:
 	/// Constructor
 	/// \arg base      Physicl base address of the APIC's MMIO registers
-	/// \arg spurious  Vector of the spurious interrupt handler
-	lapic(uintptr_t base, isr spurious);
+	lapic(uintptr_t base);

 	/// Get the local APIC's ID
 	uint8_t get_id();
@@ -93,19 +92,14 @@ public:
 	void calibrate_timer();

 private:
-	inline uint64_t ticks_to_us(uint32_t ticks) const {
-		return static_cast<uint64_t>(ticks) / m_ticks_per_us;
-	}
-
-	inline uint64_t us_to_ticks(uint64_t interval) const {
-		return interval * m_ticks_per_us;
-	}
+	inline static uint64_t ticks_to_us(uint64_t ticks)    { return ticks / s_ticks_per_us; }
+	inline static uint64_t us_to_ticks(uint64_t interval) { return interval * s_ticks_per_us; }

 	void set_divisor(uint8_t divisor);
 	void set_repeat(bool repeat);

 	uint32_t m_divisor;
-	uint32_t m_ticks_per_us;
+	static uint64_t s_ticks_per_us;
 };


--- a/src/kernel/cpu.cpp
+++ b/src/kernel/cpu.cpp
@@ -1,7 +1,6 @@
 #include <stdint.h>
 #include "kutil/assert.h"
 #include "kutil/memory.h"
-#include "apic.h"
 #include "cpu.h"
 #include "cpu/cpu_id.h"
 #include "device_manager.h"
--- a/src/kernel/cpu.h
+++ b/src/kernel/cpu.h
@@ -2,9 +2,8 @@

 #include <stdint.h>

-#include "kutil/spinlock.h"
-
 class GDT;
+class lapic;
 class process;
 struct TCB;
 class thread;
@@ -34,6 +33,9 @@ struct cpu_data
 	TSS *tss;
 	GDT *gdt;

+	// Members beyond this point do not appear in
+	// the assembly version
+	lapic *apic;
 };

 extern "C" cpu_data * _current_gsbase();
@@ -55,4 +57,3 @@ inline cpu_data & current_cpu() { return *_current_gsbase(); }
 /// Validate the required CPU features are present. Really, the bootloader already
 /// validated the required features, but still iterate the options and log about them.
 void cpu_validate();
-
--- a/src/kernel/debug.h
+++ b/src/kernel/debug.h
@@ -4,6 +4,8 @@

 #include <stdint.h>

+struct cpu_state;
+
 extern "C" {
 	uintptr_t get_rsp();
 	uintptr_t get_rip();
--- a/src/kernel/device_manager.cpp
+++ b/src/kernel/device_manager.cpp
@@ -63,7 +63,7 @@ void irq4_callback(void *)


 device_manager::device_manager() :
-	m_lapic(nullptr)
+	m_lapic_base(0)
 {
 	m_irqs.ensure_capacity(32);
 	m_irqs.set_size(16);
@@ -106,6 +106,26 @@ device_manager::parse_acpi(const void *root_table)
 	load_xsdt(memory::to_virtual(acpi2->xsdt_address));
 }

+const device_manager::apic_nmi *
+device_manager::get_lapic_nmi(uint8_t id) const
+{
+	for (const auto &nmi : m_nmis) {
+		if (nmi.cpu == 0xff || nmi.cpu == id)
+			return &nmi;
+	}
+
+	return nullptr;
+}
+
+const device_manager::irq_override *
+device_manager::get_irq_override(uint8_t irq) const
+{
+	for (const auto &o : m_overrides)
+		if (o.source == irq) return &o;
+
+	return nullptr;
+}
+
 ioapic *
 device_manager::get_ioapic(int i)
 {
@@ -163,38 +183,38 @@ device_manager::load_apic(const acpi_table_header *header)
 {
 	const auto *apic = check_get_table<acpi_apic>(header);

-	uintptr_t local = apic->local_address;
-	m_lapic = new lapic(local, isr::isrSpurious);
+	m_lapic_base = apic->local_address;

 	size_t count = acpi_table_entries(apic, 1);
 	uint8_t const *p = apic->controller_data;
 	uint8_t const *end = p + count;

-	// Pass one: count IOAPIC objcts
-	int num_ioapics = 0;
+	// Pass one: count objcts
+	unsigned num_lapics = 0;
+	unsigned num_ioapics = 0;
+	unsigned num_overrides = 0;
+	unsigned num_nmis = 0;
 	while (p < end) {
 		const uint8_t type = p[0];
 		const uint8_t length = p[1];
-		if (type == 1) num_ioapics++;
-		p += length;
-	}

-	m_ioapics.set_capacity(num_ioapics);
-
-	// Pass two: set up IOAPIC objcts
-	p = apic->controller_data;
-	while (p < end) {
-		const uint8_t type = p[0];
-		const uint8_t length = p[1];
-		if (type == 1) {
-			uintptr_t base = kutil::read_from<uint32_t>(p+4);
-			uint32_t base_gsr = kutil::read_from<uint32_t>(p+8);
-			m_ioapics.emplace(base, base_gsr);
+		switch (type) {
+		case 0: ++num_lapics; break;
+		case 1: ++num_ioapics; break;
+		case 2: ++num_overrides; break;
+		case 4: ++num_nmis; break;
+		default: break;
 		}
+
 		p += length;
 	}

-	// Pass three: configure APIC objects
+	m_apic_ids.set_capacity(num_lapics);
+	m_ioapics.set_capacity(num_ioapics);
+	m_overrides.set_capacity(num_overrides);
+	m_nmis.set_capacity(num_nmis);
+
+	// Pass two: configure objects
 	p = apic->controller_data;
 	while (p < end) {
 		const uint8_t type = p[0];
@@ -205,38 +225,41 @@ device_manager::load_apic(const acpi_table_header *header)
 				uint8_t uid = kutil::read_from<uint8_t>(p+2);
 				uint8_t id = kutil::read_from<uint8_t>(p+3);
 				m_apic_ids.append(id);
+
 				log::debug(logs::device, "    Local APIC uid %x id %x", uid, id);
 			}
 			break;

-		case 1: // I/O APIC
+		case 1: { // I/O APIC
+				uintptr_t base = kutil::read_from<uint32_t>(p+4);
+				uint32_t base_gsi = kutil::read_from<uint32_t>(p+8);
+				m_ioapics.emplace(base, base_gsi);
+
+				log::debug(logs::device, "    IO APIC gsi %x base %x", base_gsi, base);
+			}
 			break;

 		case 2: { // Interrupt source override
-				uint8_t source = kutil::read_from<uint8_t>(p+3);
-				isr gsi = isr::irq00 + kutil::read_from<uint32_t>(p+4);
-				uint16_t flags = kutil::read_from<uint16_t>(p+8);
+				irq_override o;
+				o.source = kutil::read_from<uint8_t>(p+3);
+				o.gsi = kutil::read_from<uint32_t>(p+4);
+				o.flags = kutil::read_from<uint16_t>(p+8);
+				m_overrides.append(o);

 				log::debug(logs::device, "    Intr source override IRQ %d -> %d Pol %d Tri %d",
-						source, gsi, (flags & 0x3), ((flags >> 2) & 0x3));
-
-				// TODO: in a multiple-IOAPIC system this might be elsewhere
-				m_ioapics[0].redirect(source, static_cast<isr>(gsi), flags, true);
+						o.source, o.gsi, (o.flags & 0x3), ((o.flags >> 2) & 0x3));
 			}
 			break;

 		case 4: {// LAPIC NMI
-			uint8_t cpu = kutil::read_from<uint8_t>(p + 2);
-			uint8_t num = kutil::read_from<uint8_t>(p + 5);
-			uint16_t flags = kutil::read_from<uint16_t>(p + 3);
+			apic_nmi nmi;
+			nmi.cpu = kutil::read_from<uint8_t>(p + 2);
+			nmi.lint = kutil::read_from<uint8_t>(p + 5);
+			nmi.flags = kutil::read_from<uint16_t>(p + 3);
+			m_nmis.append(nmi);

-			log::debug(logs::device, "    LAPIC NMI Proc %d LINT%d Pol %d Tri %d",
-					kutil::read_from<uint8_t>(p+2),
-					kutil::read_from<uint8_t>(p+5),
-					kutil::read_from<uint16_t>(p+3) & 0x3,
-					(kutil::read_from<uint16_t>(p+3) >> 2) & 0x3);
-
-			m_lapic->enable_lint(num, num == 0 ? isr::isrLINT0 : isr::isrLINT1, true, flags);
+			log::debug(logs::device, "    LAPIC NMI Proc %02x LINT%d Pol %d Tri %d",
+					nmi.cpu, nmi.lint, nmi.flags & 0x3, (nmi.flags >> 2) & 0x3);
 			}
 			break;

@@ -246,17 +269,6 @@ device_manager::load_apic(const acpi_table_header *header)

 		p += length;
 	}
-
-	/*
-	for (uint8_t i = 0; i < m_ioapics[0].get_num_gsi(); ++i) {
-		switch (i) {
-			case 2: break;
-			default: m_ioapics[0].mask(i, false);
-		}
-	}
-	*/
-
-	m_lapic->enable();
 }

 void
--- a/src/kernel/device_manager.h
+++ b/src/kernel/device_manager.h
@@ -24,10 +24,6 @@ public:
 	/// \returns  A reference to the system device manager
 	static device_manager & get() { return s_instance; }

-	/// Get the LAPIC
-	/// \returns An object representing the local APIC
-	lapic & get_lapic() { return *m_lapic; }
-
 	/// Get an IOAPIC
 	/// \arg i   Index of the requested IOAPIC
 	/// \returns An object representing the given IOAPIC if it exists,
@@ -68,6 +64,39 @@ public:
 	/// \returns  True if the interrupt was handled
 	bool dispatch_irq(unsigned irq);

+	struct apic_nmi
+	{
+		uint8_t cpu;
+		uint8_t lint;
+		uint16_t flags;
+	};
+
+	struct irq_override
+	{
+		uint8_t source;
+		uint16_t flags;
+		uint32_t gsi;
+	};
+
+	/// Get the list of APIC ids for other CPUs
+	inline const kutil::vector<uint8_t> & get_apic_ids() const { return m_apic_ids; }
+
+	/// Get the LAPIC base address
+	/// \returns The physical base address of the local apic registers
+	uintptr_t get_lapic_base() const { return m_lapic_base; }
+
+	/// Get the NMI mapping for the given local APIC
+	/// \arg id  ID of the local APIC
+	/// \returns apic_nmi structure describing the NMI configuration,
+	///          or null if no configuration was provided
+	const apic_nmi * get_lapic_nmi(uint8_t id) const;
+
+	/// Get the IRQ source override for the given IRQ
+	/// \arg irq  IRQ number (not isr vector)
+	/// \returns  irq_override structure describing that IRQ's
+	///           configuration, or null if no configuration was provided
+	const irq_override * get_irq_override(uint8_t irq) const;
+
 	/// Register the existance of a block device.
 	/// \arg blockdev  Pointer to the block device
 	void register_block_device(block_device *blockdev);
@@ -94,9 +123,6 @@ public:
 			&m_hpets[i] : nullptr;
 	}

-	/// Get the list of APIC ids for other CPUs
-	inline const kutil::vector<uint8_t> & get_apic_ids() const { return m_apic_ids; }
-
 private:
 	/// Parse the ACPI XSDT and load relevant sub-tables.
 	/// \arg xsdt  Pointer to the XSDT from the firmware
@@ -122,10 +148,13 @@ private:
 	/// that has no callback.
 	void bad_irq(uint8_t irq);

-	lapic *m_lapic;
+	uintptr_t m_lapic_base;
+
 	kutil::vector<ioapic> m_ioapics;
 	kutil::vector<hpet> m_hpets;
 	kutil::vector<uint8_t> m_apic_ids;
+	kutil::vector<apic_nmi> m_nmis;
+	kutil::vector<irq_override> m_overrides;

 	kutil::vector<pci_group> m_pci;
 	kutil::vector<pci_device> m_devices;
--- a/src/kernel/main.cpp
+++ b/src/kernel/main.cpp
@@ -39,7 +39,8 @@ extern "C" {
 	void (*__ctors_end)(void);
 	void long_ap_startup(cpu_data *cpu);
 	void ap_startup();
-	void init_ap_trampoline(void*, cpu_data *, void (*)(cpu_data *));
+	void ap_idle();
+	void init_ap_trampoline(void*, cpu_data *, void (*)());
 }

 extern void __kernel_assert(const char *, unsigned, const char *);
@@ -47,13 +48,14 @@ extern void __kernel_assert(const char *, unsigned, const char *);
 using namespace kernel;

 volatile size_t ap_startup_count;
+static bool scheduler_ready = false;

 /// Bootstrap the memory managers.
 void memory_initialize_pre_ctors(args::header &kargs);
 void memory_initialize_post_ctors(args::header &kargs);
 process * load_simple_process(args::program &program);

-void start_aps(void *kpml4);
+unsigned start_aps(lapic &apic, const kutil::vector<uint8_t> &ids, void *kpml4);

 /// TODO: not this. this is awful.
 args::framebuffer *fb = nullptr;
@@ -122,6 +124,7 @@ kernel_main(args::header *header)
 	extern TSS &g_bsp_tss;
 	extern GDT &g_bsp_gdt;
 	extern cpu_data g_bsp_cpu_data;
+	extern uintptr_t idle_stack_end;

 	IDT *idt = new (&g_idt) IDT;

@@ -131,6 +134,7 @@ kernel_main(args::header *header)
 	cpu->self = cpu;
 	cpu->tss = new (&g_bsp_tss) TSS;
 	cpu->gdt = new (&g_bsp_gdt) GDT {cpu->tss};
+	cpu->rsp0 = idle_stack_end;
 	cpu_early_init(cpu);

 	disable_legacy_pic();
@@ -160,15 +164,21 @@ kernel_main(args::header *header)
 	devices.parse_acpi(header->acpi_table);

 	// Need the local APIC to get the BSP's id
-	lapic &apic = device_manager::get().get_lapic();
-	cpu->id = apic.get_id();
+	uintptr_t apic_base = devices.get_lapic_base();
+
+	lapic *apic = new lapic(apic_base);
+	apic->enable();
+
+	cpu->id = apic->get_id();
+	cpu->apic = apic;

 	cpu_init(cpu, true);

 	devices.init_drivers();
-	devices.get_lapic().calibrate_timer();
+	apic->calibrate_timer();

-	start_aps(header->pml4);
+	const auto &apic_ids = devices.get_apic_ids();
+	unsigned num_cpus = start_aps(*apic, apic_ids, header->pml4);

 	idt->add_ist_entries();
 	interrupts_enable();
@@ -197,7 +207,8 @@ kernel_main(args::header *header)
 	}
 	*/

-	scheduler *sched = new scheduler(devices.get_lapic());
+	scheduler *sched = new scheduler {num_cpus};
+	scheduler_ready = true;

 	// Skip program 0, which is the kernel itself
 	for (unsigned i = 1; i < header->num_programs; ++i)
@@ -209,8 +220,8 @@ kernel_main(args::header *header)
 	sched->start();
 }

-void
-start_aps(void *kpml4)
+unsigned
+start_aps(lapic &apic, const kutil::vector<uint8_t> &ids, void *kpml4)
 {
 	using memory::frame_size;
 	using memory::kernel_stack_pages;
@@ -220,10 +231,8 @@ start_aps(void *kpml4)
 	extern vm_area_guarded &g_kernel_stacks;

 	clock &clk = clock::get();
-	lapic &apic = device_manager::get().get_lapic();

 	ap_startup_count = 1; // BSP processor
-	auto &ids = device_manager::get().get_apic_ids();
 	log::info(logs::boot, "Starting %d other CPUs", ids.count() - 1);

 	// Since we're using address space outside kernel space, make sure
@@ -245,7 +254,7 @@ start_aps(void *kpml4)

 	// AP idle stacks need less room than normal stacks, so pack multiple
 	// into a normal stack area
-	static constexpr size_t idle_stack_bytes = 1024; // 2KiB is generous
+	static constexpr size_t idle_stack_bytes = 2048; // 2KiB is generous
 	static constexpr size_t full_stack_bytes = kernel_stack_pages * frame_size;
 	static constexpr size_t idle_stacks_per = full_stack_bytes / idle_stack_bytes;

@@ -258,13 +267,14 @@ start_aps(void *kpml4)
 	apic.send_ipi_broadcast(mode, false, 0);

 	for (uint8_t id : ids) {
-		if (id == apic.get_id()) continue;
+		if (id == bsp.id) continue;

 		// Set up the CPU data structures
 		TSS *tss = new TSS;
 		GDT *gdt = new GDT {tss};
 		cpu_data *cpu = new cpu_data;
 		kutil::memset(cpu, 0, sizeof(cpu_data));
+
 		cpu->self = cpu;
 		cpu->id = id;
 		cpu->index = ++index;
@@ -285,7 +295,7 @@ start_aps(void *kpml4)
 		cpu->rsp0 = stack_end;

 		// Set up the trampoline with this CPU's data
-		init_ap_trampoline(kpml4, cpu, long_ap_startup);
+		init_ap_trampoline(kpml4, cpu, ap_idle);

 		// Kick it off!
 		size_t current_count = ap_startup_count;
@@ -315,6 +325,7 @@ start_aps(void *kpml4)

 	log::info(logs::boot, "%d CPUs running", ap_startup_count);
 	vm_space::kernel_space().remove(vma);
+	return ap_startup_count;
 }

 void
@@ -322,6 +333,12 @@ long_ap_startup(cpu_data *cpu)
 {
 	cpu_init(cpu, false);
 	++ap_startup_count;
+	while (!scheduler_ready) asm ("pause");

-	while(1) asm("hlt");
+	uintptr_t apic_base =
+		device_manager::get().get_lapic_base();
+	cpu->apic = new lapic(apic_base);
+	cpu->apic->enable();
+
+	scheduler::get().start();
 }
--- a/src/kernel/objects/process.cpp
+++ b/src/kernel/objects/process.cpp
@@ -13,15 +13,11 @@ static kutil::no_construct<process> __g_kernel_process_storage;
 process &g_kernel_process = __g_kernel_process_storage.value;


-kutil::vector<process*> process::s_processes;
-
 process::process() :
 	kobject {kobject::type::process},
 	m_next_handle {1},
 	m_state {state::running}
 {
-	s_processes.append(this);
-
 	j6_handle_t self = add_handle(this);
 	kassert(self == self_handle(), "Process self-handle is not 1");
 }
@@ -39,7 +35,6 @@ process::~process()
 {
 	for (auto &it : m_handles)
 		if (it.val) it.val->handle_release();
-	s_processes.remove_swap(this);
 }

 process & process::current() { return *current_cpu().process; }
--- a/src/kernel/objects/process.h
+++ b/src/kernel/objects/process.h
@@ -94,6 +94,4 @@ private:

 	enum class state : uint8_t { running, exited };
 	state m_state;
-
-	static kutil::vector<process*> s_processes;
 };
--- a/src/kernel/objects/thread.cpp
+++ b/src/kernel/objects/thread.cpp
@@ -221,7 +221,5 @@ thread::create_idle_thread(process &kernel, uint8_t pri, uintptr_t rsp0)
 	thread *idle = new thread(kernel, pri, rsp0);
 	idle->set_state(state::constant);
 	idle->set_state(state::ready);
-	log::info(logs::task, "Created idle thread as koid %llx", idle->koid());
-
 	return idle;
 }
--- a/src/kernel/scheduler.cpp
+++ b/src/kernel/scheduler.cpp
@@ -17,6 +17,7 @@
 #include "objects/channel.h"
 #include "objects/process.h"
 #include "objects/system.h"
+#include "objects/thread.h"
 #include "objects/vm_area.h"
 #include "scheduler.h"

@@ -25,43 +26,37 @@

 #include "kutil/assert.h"

-
+extern "C" void task_switch(TCB *tcb);
 scheduler *scheduler::s_instance = nullptr;

-const uint64_t rflags_noint = 0x002;
-const uint64_t rflags_int = 0x202;
-
-extern uint64_t idle_stack_end;
-
-extern "C" void task_switch(TCB *tcb);
-
-scheduler::scheduler(lapic &apic) :
-	m_apic(apic),
-	m_next_pid(1),
-	m_clock(0),
-	m_last_promotion(0)
+struct run_queue
 {
-	kassert(!s_instance, "Multiple schedulers created!");
-	s_instance = this;
+	tcb_node *current = nullptr;
+	tcb_list ready[scheduler::num_priorities];
+	tcb_list blocked;

-	process *kp = &process::kernel_process();
+	uint64_t last_promotion = 0;
+	uint64_t last_steal = 0;
+	kutil::spinlock lock;
+};

-	log::debug(logs::task, "Kernel process koid %llx", kp->koid());
+scheduler::scheduler(unsigned cpus) :
+	m_next_pid {1},
+	m_clock {0}
+{
+	kassert(!s_instance, "Created multiple schedulers!");
+	if (!s_instance)
+		s_instance = this;

-	thread *idle = thread::create_idle_thread(*kp, max_priority,
-		reinterpret_cast<uintptr_t>(&idle_stack_end));
+	m_run_queues.set_size(cpus);
+}

-	log::debug(logs::task, "Idle thread koid %llx", idle->koid());
-
-	auto *tcb = idle->tcb();
-	m_runlists[max_priority].push_back(tcb);
-	m_current = tcb;
-
-	cpu_data &cpu = current_cpu();
-	cpu.rsp0 = tcb->rsp0;
-	cpu.tcb = tcb;
-	cpu.process = kp;
-	cpu.thread = idle;
+scheduler::~scheduler()
+{
+	// Not truly necessary - if the scheduler is going away, the whole
+	// system is probably going down. But let's be clean.
+	if (s_instance == this)
+		s_instance = nullptr;
 }

 template <typename T>
@@ -72,20 +67,6 @@ inline T * push(uintptr_t &rsp, size_t size = sizeof(T)) {
 	return p;
 }

-thread *
-scheduler::create_process(bool user)
-{
-	process *p = new process;
-	thread *th = p->create_thread(default_priority, user);
-
-	TCB *tcb = th->tcb();
-	log::debug(logs::task, "Creating thread %llx, priority %d, time slice %d",
-			th->koid(), tcb->priority, tcb->time_left);
-
-	th->set_state(thread::state::ready);
-	return th;
-}
-
 void
 scheduler::create_kernel_task(void (*task)(), uint8_t priority, bool constant)
 {
@@ -115,24 +96,42 @@ scheduler::quantum(int priority)
 void
 scheduler::start()
 {
-	log::info(logs::sched, "Starting scheduler.");
-	m_apic.enable_timer(isr::isrTimer, false);
-	m_apic.reset_timer(10);
+	cpu_data &cpu = current_cpu();
+	run_queue &queue = m_run_queues[cpu.index];
+	kutil::scoped_lock lock {queue.lock};
+
+	process *kp = &process::kernel_process();
+	thread *idle = thread::create_idle_thread(*kp, max_priority, cpu.rsp0);
+	log::debug(logs::task, "CPU%02x idle thread koid %llx", cpu.index, idle->koid());
+
+	auto *tcb = idle->tcb();
+	cpu.process = kp;
+	cpu.thread = idle;
+	cpu.tcb = tcb;
+
+	queue.current = tcb;
+
+	log::info(logs::sched, "CPU%02x starting scheduler", cpu.index);
+	cpu.apic->enable_timer(isr::isrTimer, false);
+	cpu.apic->reset_timer(10);
 }

 void
 scheduler::add_thread(TCB *t)
 {
-	m_blocked.push_back(static_cast<tcb_node*>(t));
-	t->time_left = quantum(t->priority);
+	cpu_data &cpu = current_cpu();
+	run_queue &queue = m_run_queues[cpu.index];
+	kutil::scoped_lock lock {queue.lock};

+	queue.blocked.push_back(static_cast<tcb_node*>(t));
+	t->time_left = quantum(t->priority);
 }

-void scheduler::prune(uint64_t now)
+void scheduler::prune(run_queue &queue, uint64_t now)
 {
 	// Find processes that are ready or have exited and
 	// move them to the appropriate lists.
-	auto *tcb = m_blocked.front();
+	auto *tcb = queue.blocked.front();
 	while (tcb) {
 		thread *th = thread::from_tcb(tcb);
 		uint8_t priority = tcb->priority;
@@ -140,7 +139,7 @@ void scheduler::prune(uint64_t now)
 		bool ready = th->has_state(thread::state::ready);
 		bool exited = th->has_state(thread::state::exited);
 		bool constant = th->has_state(thread::state::constant);
-		bool current = tcb == m_current;
+		bool current = tcb == queue.current;

 		ready |= th->wake_on_time(now);

@@ -155,7 +154,7 @@ void scheduler::prune(uint64_t now)
 			// page tables
 			if (current) continue;

-			m_blocked.remove(remove);
+			queue.blocked.remove(remove);
 			process &p = th->parent();

 			// thread_exited deletes the thread, and returns true if the process
@@ -163,19 +162,19 @@ void scheduler::prune(uint64_t now)
 			if(!current && p.thread_exited(th))
 				delete &p;
 		} else {
-			m_blocked.remove(remove);
+			queue.blocked.remove(remove);
 			log::debug(logs::sched, "Prune: readying unblocked thread %llx", th->koid());
-			m_runlists[remove->priority].push_back(remove);
+			queue.ready[remove->priority].push_back(remove);
 		}
 	}
 }

 void
-scheduler::check_promotions(uint64_t now)
+scheduler::check_promotions(run_queue &queue, uint64_t now)
 {
-	for (auto &pri_list : m_runlists) {
+	for (auto &pri_list : queue.ready) {
 		for (auto *tcb : pri_list) {
-			const thread *th = thread::from_tcb(m_current);
+			const thread *th = thread::from_tcb(queue.current);
 			const bool constant = th->has_state(thread::state::constant);
 			if (constant)
 				continue;
@@ -190,81 +189,145 @@ scheduler::check_promotions(uint64_t now)

 			if (stale) {
 				// If the thread is stale, promote it
-				m_runlists[priority].remove(tcb);
+				queue.ready[priority].remove(tcb);
 				tcb->priority -= 1;
 				tcb->time_left = quantum(tcb->priority);
-				m_runlists[tcb->priority].push_back(tcb);
+				queue.ready[tcb->priority].push_back(tcb);
 				log::info(logs::sched, "Scheduler promoting thread %llx, priority %d",
 						th->koid(), tcb->priority);
 			}
 		}
 	}

-	m_last_promotion = now;
+	queue.last_promotion = now;
+}
+
+static size_t
+balance_lists(tcb_list &to, tcb_list &from)
+{
+	size_t to_len = to.length();
+	size_t from_len = from.length();
+
+	// Only steal from the rich, don't be Dennis Moore
+	if (from_len <= to_len)
+		return 0;
+
+	size_t steal = (from_len - to_len) / 2;
+	for (size_t i = 0; i < steal; ++i)
+		to.push_front(from.pop_front());
+	return steal;
+}
+
+void
+scheduler::steal_work(cpu_data &cpu)
+{
+	// First grab a scheduler-wide lock to avoid deadlock
+	kutil::scoped_lock steal_lock {m_steal_lock};
+
+	// Lock this cpu's queue for the whole time while we modify it
+	run_queue &my_queue = m_run_queues[cpu.index];
+	kutil::scoped_lock my_queue_lock {my_queue.lock};
+
+	const unsigned count = m_run_queues.count();
+	for (unsigned i = 0; i < count; ++i) {
+		if (i == cpu.index) continue;
+
+		run_queue &other_queue = m_run_queues[i];
+		kutil::scoped_lock other_queue_lock {other_queue.lock};
+
+		size_t stolen = 0;
+
+		// Don't steal from max_priority, that's the idle thread
+		for (unsigned pri = 0; pri < max_priority; ++pri)
+			stolen += balance_lists(my_queue.ready[pri], other_queue.ready[pri]);
+
+		stolen += balance_lists(my_queue.blocked, other_queue.blocked);
+
+		if (stolen)
+			log::debug(logs::sched, "CPU%02x stole %2d tasks from CPU%02x",
+					cpu.index, stolen, i);
+	}
 }

 void
 scheduler::schedule()
 {
-	uint8_t priority = m_current->priority;
-	uint32_t remaining = m_apic.stop_timer();
-	m_current->time_left = remaining;
-	thread *th = thread::from_tcb(m_current);
+	cpu_data &cpu = current_cpu();
+	run_queue &queue = m_run_queues[cpu.index];
+	lapic &apic = *cpu.apic;
+	uint32_t remaining = apic.stop_timer();
+
+	if (m_clock - queue.last_steal > steal_frequency) {
+		steal_work(cpu);
+		queue.last_steal = m_clock;
+	}
+
+	// We need to explicitly lock/unlock here instead of
+	// using a scoped lock, because the scope doesn't "end"
+	// for the current thread until it gets scheduled again
+	kutil::spinlock::waiter waiter;
+	queue.lock.acquire(&waiter);
+
+	queue.current->time_left = remaining;
+	thread *th = thread::from_tcb(queue.current);
+	uint8_t priority = queue.current->priority;
 	const bool constant = th->has_state(thread::state::constant);

 	if (remaining == 0) {
 		if (priority < max_priority && !constant) {
 			// Process used its whole timeslice, demote it
-			++m_current->priority;
+			++queue.current->priority;
 			log::debug(logs::sched, "Scheduler  demoting thread %llx, priority %d",
-					th->koid(), m_current->priority);
+					th->koid(), queue.current->priority);
 		}
-		m_current->time_left = quantum(m_current->priority);
+		queue.current->time_left = quantum(queue.current->priority);
 	} else if (remaining > 0) {
 		// Process gave up CPU, give it a small bonus to its
 		// remaining timeslice.
 		uint32_t bonus = quantum(priority) >> 4;
-		m_current->time_left += bonus;
+		queue.current->time_left += bonus;
 	}

-	m_runlists[priority].remove(m_current);
 	if (th->has_state(thread::state::ready)) {
-		m_runlists[m_current->priority].push_back(m_current);
+		queue.ready[queue.current->priority].push_back(queue.current);
 	} else {
-		m_blocked.push_back(m_current);
+		queue.blocked.push_back(queue.current);
 	}

 	clock::get().update();
-	prune(++m_clock);
-	if (m_clock - m_last_promotion > promote_frequency)
-		check_promotions(m_clock);
+	prune(queue, ++m_clock);
+	if (m_clock - queue.last_promotion > promote_frequency)
+		check_promotions(queue, m_clock);

 	priority = 0;
-	while (m_runlists[priority].empty()) {
+	while (queue.ready[priority].empty()) {
 		++priority;
 		kassert(priority < num_priorities, "All runlists are empty");
 	}

-	m_current->last_ran = m_clock;
+	queue.current->last_ran = m_clock;

-	auto *next = m_runlists[priority].pop_front();
+	auto *next = queue.ready[priority].pop_front();
 	next->last_ran = m_clock;
-	m_apic.reset_timer(next->time_left);
+	apic.reset_timer(next->time_left);

-	if (next != m_current) {
-		thread *next_thread = thread::from_tcb(next);
-
-		cpu_data &cpu = current_cpu();
-		cpu.thread = next_thread;
-		cpu.process = &next_thread->parent();
-		m_current = next;
-
-		log::debug(logs::sched, "Scheduler switching threads %llx->%llx",
-				th->koid(), next_thread->koid());
-		log::debug(logs::sched, "    priority %d time left %d @ %lld.",
-				m_current->priority, m_current->time_left, m_clock);
-		log::debug(logs::sched, "    PML4 %llx", m_current->pml4);
-
-		task_switch(m_current);
+	if (next == queue.current) {
+		queue.lock.release(&waiter);
+		return;
 	}
+
+	thread *next_thread = thread::from_tcb(next);
+
+	cpu.thread = next_thread;
+	cpu.process = &next_thread->parent();
+	queue.current = next;
+
+	log::debug(logs::sched, "CPU%02x switching threads %llx->%llx",
+			cpu.index, th->koid(), next_thread->koid());
+	log::debug(logs::sched, "    priority %d time left %d @ %lld.",
+			next->priority, next->time_left, m_clock);
+	log::debug(logs::sched, "    PML4 %llx", next->pml4);
+
+	queue.lock.release(&waiter);
+	task_switch(queue.current);
 }
--- a/src/kernel/scheduler.h
+++ b/src/kernel/scheduler.h
@@ -3,7 +3,8 @@
 /// The task scheduler and related definitions

 #include <stdint.h>
-#include "objects/thread.h"
+#include "kutil/spinlock.h"
+#include "kutil/vector.h"

 namespace kernel {
 namespace args {
@@ -14,6 +15,7 @@ struct cpu_data;
 class lapic;
 class process;
 struct page_table;
+struct run_queue;


 /// The task scheduler
@@ -39,8 +41,9 @@ public:
 	static const uint16_t process_quanta = 10;

 	/// Constructor.
-	/// \arg apic  The local APIC object for this CPU
-	scheduler(lapic &apic);
+	/// \arg cpus  The number of CPUs to schedule for
+	scheduler(unsigned cpus);
+	~scheduler();

 	/// Create a new process from a program image in memory.
 	/// \arg program  The descriptor of the pogram in memory
@@ -66,15 +69,11 @@ public:
 	/// Run the scheduler, possibly switching to a new task
 	void schedule();

-	/// Get the current TCB.
-	/// \returns  A pointer to the current thread's TCB
-	inline TCB * current() { return m_current; }
-
 	/// Start scheduling a new thread.
 	/// \arg t  The new thread's TCB
 	void add_thread(TCB *t);

-	/// Get a reference to the system scheduler
+	/// Get a reference to the scheduler
 	/// \returns  A reference to the global system scheduler
 	static scheduler & get() { return *s_instance; }

@@ -82,30 +81,23 @@ private:
 	friend class process;

 	static constexpr uint64_t promote_frequency = 10;
+	static constexpr uint64_t steal_frequency = 10;

-	/// Create a new process object. This process will have its pid
-	/// set but nothing else.
-	/// \arg user True if this thread will enter userspace
-	/// \returns  The new process' main thread
-	thread * create_process(bool user);
-
-	void prune(uint64_t now);
-	void check_promotions(uint64_t now);
-
-	lapic &m_apic;
+	void prune(run_queue &queue, uint64_t now);
+	void check_promotions(run_queue &queue, uint64_t now);
+	void steal_work(cpu_data &cpu);

 	uint32_t m_next_pid;
 	uint32_t m_tick_count;

 	process *m_kernel_process;
-	tcb_node *m_current;
-	tcb_list m_runlists[num_priorities];
-	tcb_list m_blocked;
+
+	kutil::vector<run_queue> m_run_queues;

 	// TODO: lol a real clock
 	uint64_t m_clock = 0;
-	uint64_t m_last_promotion;

+	kutil::spinlock m_steal_lock;
 	static scheduler *s_instance;
 };

--- a/src/kernel/syscall.cpp
+++ b/src/kernel/syscall.cpp
@@ -1,11 +1,10 @@
 #include <stddef.h>

+#include "kutil/memory.h"
+
 #include "console.h"
-#include "cpu.h"
 #include "debug.h"
 #include "log.h"
-#include "msr.h"
-#include "scheduler.h"
 #include "syscall.h"

 extern "C" {
--- a/src/libraries/kutil/include/kutil/logger.h
+++ b/src/libraries/kutil/include/kutil/logger.h
@@ -6,6 +6,7 @@
 #include <stdint.h>

 #include "kutil/bip_buffer.h"
+#include "kutil/spinlock.h"

 namespace kutil {
 namespace log {
@@ -111,6 +112,7 @@ private:
 	uint8_t m_sequence;

 	kutil::bip_buffer m_buffer;
+	kutil::spinlock m_lock;

 	static logger *s_log;
 	static const char *s_level_names[static_cast<unsigned>(level::max)];
--- a/src/libraries/kutil/logger.cpp
+++ b/src/libraries/kutil/logger.cpp
@@ -91,6 +91,8 @@ logger::output(level severity, area_t area, const char *fmt, va_list args)
 	header->bytes +=
 		vsnprintf(header->message, sizeof(buffer) - sizeof(entry), fmt, args);

+	kutil::scoped_lock lock {m_lock};
+
 	if (m_immediate) {
 		buffer[header->bytes] = 0;
 		m_immediate(area, severity, header->message);
@@ -117,6 +119,8 @@ logger::output(level severity, area_t area, const char *fmt, va_list args)
 size_t
 logger::get_entry(void *buffer, size_t size)
 {
+	kutil::scoped_lock lock {m_lock};
+
 	void *out;
 	size_t out_size = m_buffer.get_block(&out);
 	if (out_size == 0 || out == 0)