diff --git a/src/kernel/apic.cpp b/src/kernel/apic.cpp
index effa0c4..0a31cd9 100644
--- a/src/kernel/apic.cpp
+++ b/src/kernel/apic.cpp
@@ -71,7 +71,7 @@ lapic::get_id()
 }
 
 void
-lapic::send_ipi(ipi mode, uint8_t vector, uint8_t dest)
+lapic::send_ipi(ipi mode, isr vector, uint8_t dest)
 {
     // Wait until the APIC is ready to send
     ipi_wait();
@@ -85,7 +85,7 @@ lapic::send_ipi(ipi mode, uint8_t vector, uint8_t dest)
 }
 
 void
-lapic::send_ipi_broadcast(ipi mode, bool self, uint8_t vector)
+lapic::send_ipi_broadcast(ipi mode, bool self, isr vector)
 {
     // Wait until the APIC is ready to send
     ipi_wait();
diff --git a/src/kernel/apic.h b/src/kernel/apic.h
index 2a25ed2..aee838a 100644
--- a/src/kernel/apic.h
+++ b/src/kernel/apic.h
@@ -5,6 +5,8 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#include "interrupts.h"
+
 enum class isr : uint8_t;
 
 /// Base class for other APIC types
@@ -51,13 +53,13 @@ public:
     /// \arg mode    The sending mode
     /// \arg vector  The interrupt vector
     /// \arg dest    The APIC ID of the destination
-    void send_ipi(ipi mode, uint8_t vector, uint8_t dest);
+    void send_ipi(ipi mode, isr vector, uint8_t dest);
 
     /// Send an inter-processor broadcast interrupt to all other CPUs
     /// \arg mode    The sending mode
     /// \arg self    If true, include this CPU in the broadcast
     /// \arg vector  The interrupt vector
-    void send_ipi_broadcast(ipi mode, bool self, uint8_t vector);
+    void send_ipi_broadcast(ipi mode, bool self, isr vector);
 
     /// Wait for an IPI to finish sending. This is done automatically
     /// before sending another IPI with send_ipi().
diff --git a/src/kernel/interrupt_isrs.inc b/src/kernel/interrupt_isrs.inc
index 6d1e700..dbaf484 100644
--- a/src/kernel/interrupt_isrs.inc
+++ b/src/kernel/interrupt_isrs.inc
@@ -242,6 +242,8 @@ ISR (0xe1, 0, isrLINT0)
 ISR (0xe2, 0, isrLINT1)
 ISR (0xe3, 0, isrAPICError)
 
+ISR (0xe4, 0, ipiSchedule)
+
 ISR (0xef, 0, isrSpurious)
 
 ISR (0xf0, 0, isrIgnore0)
diff --git a/src/kernel/interrupts.cpp b/src/kernel/interrupts.cpp
index 61d3f90..91b3839 100644
--- a/src/kernel/interrupts.cpp
+++ b/src/kernel/interrupts.cpp
@@ -150,6 +150,10 @@ isr_handler(cpu_state *regs)
     case isr::isrLINT1:
         break;
 
+    case isr::ipiSchedule:
+        scheduler::get().schedule();
+        break;
+
     case isr::isrSpurious:
         // No EOI for the spurious interrupt
         return;
diff --git a/src/kernel/interrupts.h b/src/kernel/interrupts.h
index 6789008..b3ef75f 100644
--- a/src/kernel/interrupts.h
+++ b/src/kernel/interrupts.h
@@ -10,7 +10,7 @@ enum class isr : uint8_t
 #define ISR(i, s, name)  name = i,
 #define NISR(i, s, name) name = i,
 #define EISR(i, s, name) name = i,
-#define IRQ(i, q, name) name = i,
+#define IRQ(i, q, name)  name = i,
 #include "interrupt_isrs.inc"
 #undef IRQ
 #undef EISR
diff --git a/src/kernel/objects/thread.cpp b/src/kernel/objects/thread.cpp
index 7cf7161..7682e72 100644
--- a/src/kernel/objects/thread.cpp
+++ b/src/kernel/objects/thread.cpp
@@ -53,7 +53,17 @@ thread::block()
 void
 thread::wake(uint64_t value)
 {
+    if (has_state(state::ready))
+        return;
+
     m_wake_value = value;
+    wake_only();
+    scheduler::get().maybe_schedule(tcb());
+}
+
+void
+thread::wake_only()
+{
     m_wake_timeout = 0;
     set_state(state::ready);
 }
diff --git a/src/kernel/objects/thread.h b/src/kernel/objects/thread.h
index 98b22df..737d7ca 100644
--- a/src/kernel/objects/thread.h
+++ b/src/kernel/objects/thread.h
@@ -9,6 +9,7 @@
 
 #include "objects/kobject.h"
 
+struct cpu_data;
 struct page_table;
 
 namespace obj {
@@ -37,6 +38,7 @@ struct TCB
     uint64_t last_ran;
 
     uintptr_t kernel_stack;
+    cpu_data *cpu;
 };
 
 using tcb_list = util::linked_list<TCB>;
@@ -45,15 +47,6 @@ using tcb_node = tcb_list::item_type;
 
 namespace obj {
 
-enum class wait_type : uint8_t
-{
-    none   = 0x00,
-    signal = 0x01,
-    time   = 0x02,
-    object = 0x04,
-};
-is_bitfield(wait_type);
-
 class process;
 
 class thread :
@@ -109,6 +102,10 @@ public:
     /// \arg value  The value that block() should return
     void wake(uint64_t value = 0);
 
+    /// Set this thread as awake, but do not call the scheduler
+    /// or set the wake value.
+    void wake_only();
+
     /// Set a timeout to unblock this thread
     /// \arg time  The clock time at which to wake. 0 for no timeout.
     inline void set_wake_timeout(uint64_t time) { m_wake_timeout = time; }
diff --git a/src/kernel/scheduler.cpp b/src/kernel/scheduler.cpp
index ea80db5..34f550d 100644
--- a/src/kernel/scheduler.cpp
+++ b/src/kernel/scheduler.cpp
@@ -112,8 +112,9 @@ scheduler::add_thread(TCB *t)
     run_queue &queue = m_run_queues[cpu.index];
     util::scoped_lock lock {queue.lock};
 
-    queue.blocked.push_back(static_cast<tcb_node*>(t));
+    t->cpu = &cpu;
     t->time_left = quantum(t->priority);
+    queue.blocked.push_back(static_cast<tcb_node*>(t));
 }
 
 void
@@ -128,7 +129,7 @@ scheduler::prune(run_queue &queue, uint64_t now)
         uint64_t timeout = th->wake_timeout();
 
         if (timeout && timeout <= now)
-            th->wake();
+            th->wake_only();
 
         bool ready = th->has_state(thread::state::ready);
         bool exited = th->has_state(thread::state::exited);
@@ -167,8 +168,8 @@ scheduler::check_promotions(run_queue &queue, uint64_t now)
     for (auto &pri_list : queue.ready) {
         for (auto *tcb : pri_list) {
             const thread *th = tcb->thread;
-            const bool constant = th->has_state(thread::state::constant);
-            if (constant)
+
+            if (th->has_state(thread::state::constant))
                 continue;
 
             const uint64_t age = now - tcb->last_ran;
@@ -176,8 +177,7 @@ scheduler::check_promotions(run_queue &queue, uint64_t now)
 
             bool stale =
                 age > quantum(priority) * 2 &&
-                tcb->priority > promote_limit &&
-                !constant;
+                tcb->priority > promote_limit;
 
             if (stale) {
                 // If the thread is stale, promote it
@@ -195,7 +195,7 @@ scheduler::check_promotions(run_queue &queue, uint64_t now)
 }
 
 static size_t
-balance_lists(tcb_list &to, tcb_list &from)
+balance_lists(tcb_list &to, tcb_list &from, cpu_data &new_cpu)
 {
     size_t to_len = to.length();
     size_t from_len = from.length();
@@ -205,17 +205,18 @@ balance_lists(tcb_list &to, tcb_list &from)
         return 0;
 
     size_t steal = (from_len - to_len) / 2;
-    for (size_t i = 0; i < steal; ++i)
-        to.push_front(from.pop_front());
+    for (size_t i = 0; i < steal; ++i) {
+        tcb_node *node = from.pop_front();
+        node->cpu = &new_cpu;
+        to.push_front(node);
+    }
 
     return steal;
 }
 
 void
 scheduler::steal_work(cpu_data &cpu)
 {
-    // Lock this cpu's queue for the whole time while we modify it
     run_queue &my_queue = m_run_queues[cpu.index];
-    util::scoped_lock my_queue_lock {my_queue.lock};
     const unsigned count = m_run_queues.count();
     for (unsigned i = 0; i < count; ++i) {
@@ -228,9 +229,9 @@ scheduler::steal_work(cpu_data &cpu)
 
         // Don't steal from max_priority, that's the idle thread
         for (unsigned pri = 0; pri < max_priority; ++pri)
-            stolen += balance_lists(my_queue.ready[pri], other_queue.ready[pri]);
+            stolen += balance_lists(my_queue.ready[pri], other_queue.ready[pri], cpu);
 
-        stolen += balance_lists(my_queue.blocked, other_queue.blocked);
+        stolen += balance_lists(my_queue.blocked, other_queue.blocked, cpu);
 
         if (stolen)
             log::debug(logs::sched, "CPU%02x stole %2d tasks from CPU%02x",
@@ -244,10 +245,18 @@ scheduler::schedule()
     cpu_data &cpu = current_cpu();
     run_queue &queue = m_run_queues[cpu.index];
     lapic &apic = *cpu.apic;
 
     uint32_t remaining = apic.stop_timer();
     uint64_t now = clock::get().value();
 
+    // We need to explicitly lock/unlock here instead of
+    // using a scoped lock, because the scope doesn't "end"
+    // for the current thread until it gets scheduled again,
+    // and _new_ threads start their life at the end of this
+    // function, which screws up RAII
+    util::spinlock::waiter waiter {false, nullptr, "schedule"};
+    queue.lock.acquire(&waiter);
+
     // Only one CPU can be stealing at a time
     if (m_steal_turn == cpu.index &&
         now - queue.last_steal > steal_frequency) {
@@ -256,12 +265,6 @@ scheduler::schedule()
         m_steal_turn = (m_steal_turn + 1) % m_run_queues.count();
     }
 
-    // We need to explicitly lock/unlock here instead of
-    // using a scoped lock, because the scope doesn't "end"
-    // for the current thread until it gets scheduled again
-    util::spinlock::waiter waiter;
-    queue.lock.acquire(&waiter);
-
     queue.current->time_left = remaining;
     thread *th = queue.current->thread;
     uint8_t priority = queue.current->priority;
@@ -325,3 +328,17 @@ scheduler::schedule()
     queue.lock.release(&waiter);
     task_switch(queue.current);
 }
+
+void
+scheduler::maybe_schedule(TCB *t)
+{
+    cpu_data *cpu = t->cpu;
+
+    run_queue &queue = m_run_queues[cpu->index];
+    uint8_t current_pri = queue.current->priority;
+    if (current_pri <= t->priority)
+        return;
+
+    current_cpu().apic->send_ipi(
+        lapic::ipi::fixed, isr::ipiSchedule, cpu->id);
+}
diff --git a/src/kernel/scheduler.h b/src/kernel/scheduler.h
index eb3877d..140dc0d 100644
--- a/src/kernel/scheduler.h
+++ b/src/kernel/scheduler.h
@@ -71,6 +71,10 @@ public:
     /// Run the scheduler, possibly switching to a new task
     void schedule();
 
+    /// Check whether the given thread's CPU is running a more important
+    /// task; if not, send that CPU an IPI to run the scheduler.
+    void maybe_schedule(TCB *t);
+
     /// Start scheduling a new thread.
     /// \arg t  The new thread's TCB
     void add_thread(TCB *t);
diff --git a/src/kernel/smp.cpp b/src/kernel/smp.cpp
index 4f6f1e5..7e5612f 100644
--- a/src/kernel/smp.cpp
+++ b/src/kernel/smp.cpp
@@ -3,6 +3,7 @@
 #include "apic.h"
 #include "clock.h"
 #include "device_manager.h"
+#include "interrupts.h"
 #include "logger.h"
 #include "memory.h"
 #include "objects/vm_area.h"
@@ -51,7 +52,7 @@ start(cpu_data &bsp, void *kpml4)
     // Copy the startup code somewhere the real mode trampoline can run
     uintptr_t addr = 0x8000; // TODO: find a valid address, rewrite addresses
-    uint8_t vector = addr >> 12;
+    isr vector = static_cast<isr>(addr >> 12);
     obj::vm_area *vma = new obj::vm_area_fixed(addr, 0x1000, vm_flags::write);
     vm_space::kernel_space().add(addr, vma);
     memcpy(
@@ -70,7 +71,7 @@ start(cpu_data &bsp, void *kpml4)
     lapic &apic = *bsp.apic;
 
     lapic::ipi mode = lapic::ipi::init | lapic::ipi::level | lapic::ipi::assert;
-    apic.send_ipi_broadcast(mode, false, 0);
+    apic.send_ipi_broadcast(mode, false, static_cast<isr>(0));
 
     for (uint8_t id : ids) {
         if (id == bsp.id) continue;
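
Reviewer notes (sketches only, not part of the patch):

1. The uint8_t -> isr change on send_ipi()/send_ipi_broadcast() makes
interrupt vectors a distinct type, so a stray byte no longer compiles
silently as a vector. The two smp.cpp call sites that genuinely want raw
numbers (the SIPI trampoline page, and the INIT broadcast where the
vector field is ignored) now say so with an explicit static_cast. A
minimal standalone sketch of the pattern -- the enum value matches the
new interrupt_isrs.inc entry, but this send_ipi and main are
illustrative, not the kernel's:

    #include <cstdint>
    #include <cstdio>

    // Stand-in for the kernel's isr enum; 0xe4 mirrors the new
    // ISR (0xe4, 0, ipiSchedule) entry. The real send_ipi is a
    // lapic member function.
    enum class isr : uint8_t { ipiSchedule = 0xe4 };

    static void send_ipi(isr vector)
    {
        std::printf("IPI vector %#x\n", static_cast<unsigned>(vector));
    }

    int main()
    {
        send_ipi(isr::ipiSchedule);        // self-documenting call site
        send_ipi(static_cast<isr>(0x08));  // deliberately raw, like the
                                           // SIPI startup vector in smp.cpp
        // send_ipi(0xe4);                 // no longer compiles: enum class
    }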
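
2. The preemption test in maybe_schedule() leans on the scheduler's
ordering convention: a numerically lower priority is more important
(max_priority is reserved for the idle thread), so the IPI goes out only
when the waking thread strictly outranks what its home CPU is currently
running. A standalone sketch of just that decision -- should_preempt is
hypothetical, not a function in the patch:

    #include <cassert>
    #include <cstdint>

    // Mirrors the early return in scheduler::maybe_schedule(): lower
    // values are more important, so preempt only when the running
    // task's priority value is strictly greater than the waker's.
    static bool should_preempt(uint8_t running_pri, uint8_t waking_pri)
    {
        return running_pri > waking_pri;
    }

    int main()
    {
        assert(!should_preempt(2, 2)); // equal rank: no IPI
        assert(!should_preempt(1, 5)); // running task outranks the waker
        assert(should_preempt(5, 1));  // waker outranks: send ipiSchedule
    }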
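
3. On hoisting the queue lock in schedule() and keeping it manual: as the
patch's comment says, a scoped lock is wrong there because the function
doesn't return normally -- a newly created thread begins execution at
the tail of schedule() without ever having entered the top of that
scope. A toy illustration of the explicit acquire/release pattern
(toy_lock stands in for util::spinlock; none of these names are from the
kernel):

    #include <cassert>

    // Explicit acquire/release, so the unlock can be placed just
    // before task_switch() rather than in a destructor that a
    // brand-new thread would reach without the matching constructor
    // ever having run on its stack.
    struct toy_lock
    {
        bool held = false;
        void acquire() { assert(!held); held = true; }
        void release() { assert(held);  held = false; }
    };

    static toy_lock queue_lock;

    static void toy_schedule()
    {
        queue_lock.acquire();
        // ... account for the outgoing task, pick queue.current ...
        queue_lock.release();  // explicit: runs before the switch
        // task_switch(queue.current); // a thread starting life here
                                       // never ran the code above it
    }

    int main()
    {
        toy_schedule();
        assert(!queue_lock.held);
    }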