[kernel] Pre-allocate cpu_data and pass to APs

In order to avoid cyclic dependencies in the case of page faults while
bringing up an AP, pre-allocate the cpu_data structure and related CPU
control structures, and pass them to the AP startup code.

This also changes the following:
- cpu_early_init() was split out of cpu_init() to allow early use of
  current_cpu() on the BSP before we're ready for the rest of cpu_init().
  (These functions were also renamed to follow the preferred area_action
  naming style.)
- isr_handler now zeroes out the IST entry for its vector instead of
  trying to increment the IST stack pointer (see the second sketch after
  this list)
- the IST stacks are allocated outside of cpu_init, which also helps
  reduce stack pressure and the chance of page faults before the APs are
  ready
- stack areas are shared between AP idle threads, so we only waste 1 KiB
  per additional AP for the unused idle stack (see the first sketch after
  this list)
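
Below is a host-side sketch of the idle-stack packing described above: each
additional AP consumes only idle_stack_bytes out of a shared kernel stack
section instead of a full section of its own. It models just the arithmetic
from start_aps(); frame_size, kernel_stack_pages, the base address, and
get_section() are illustrative stand-ins for the kernel's g_kernel_stacks
allocator, not its real values.

    // Host-side sketch of the idle-stack packing in start_aps().
    // idle_stack_bytes and the carving logic mirror the diff below; the
    // other constants and get_section() are assumptions for illustration.
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    constexpr size_t frame_size = 0x1000;      // assumed 4 KiB frames
    constexpr size_t kernel_stack_pages = 4;   // assumed 16 KiB stack sections
    constexpr size_t full_stack_bytes = kernel_stack_pages * frame_size;
    constexpr size_t idle_stack_bytes = 1024;
    constexpr size_t idle_stacks_per = full_stack_bytes / idle_stack_bytes;

    // Fake allocator standing in for g_kernel_stacks.get_section()
    uintptr_t get_section()
    {
        static uintptr_t next = 0xffff800000000000; // arbitrary base address
        uintptr_t base = next;
        next += full_stack_bytes;
        return base;
    }

    int main()
    {
        size_t free_stack_count = 0;
        uintptr_t stack_area_start = 0;

        for (int ap = 1; ap <= 6; ++ap) {
            // Grab a fresh section only when the current one is used up
            if (free_stack_count == 0) {
                stack_area_start = get_section();
                free_stack_count = idle_stacks_per;
            }

            // Carve the next idle stack off the top of the section and leave
            // a two-pointer null frame, as the loop in start_aps() does
            uintptr_t stack_end = stack_area_start + free_stack_count-- * idle_stack_bytes;
            stack_end -= 2 * sizeof(void*);

            printf("AP %d: rsp0 = %#lx\n", ap, static_cast<unsigned long>(stack_end));
        }
        return 0;
    }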
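
The isr_handler change is a save/clear/restore of the vector's IST index:
while a vector is being handled its IST entry is cleared, so a nested fault
on the same vector keeps running on the current stack rather than re-entering
at the top of the IST stack, and the entry is restored on the way out. A
minimal stand-alone sketch of that pattern (FakeIDT is a stand-in for the
kernel's IDT singleton, and the vector/IST numbers are hypothetical):

    // Sketch of the save/clear/restore IST pattern used by isr_handler().
    // Only the IST bookkeeping is modeled; dispatch is elided.
    #include <cstdint>
    #include <cstdio>

    struct FakeIDT {
        uint8_t ist[256] = {};
        static FakeIDT & get() { static FakeIDT idt; return idt; }
        uint8_t get_ist(uint8_t i) const { return ist[i]; }
        void set_ist(uint8_t i, uint8_t v) { ist[i] = v; }
    };

    void isr_handler(uint8_t vector)
    {
        // Clear out the IST for this vector so a nested fault keeps this stack
        uint8_t old_ist = FakeIDT::get().get_ist(vector);
        if (old_ist)
            FakeIDT::get().set_ist(vector, 0);

        // ... dispatch on the vector here ...
        printf("handling vector %d, IST temporarily %d\n",
            vector, FakeIDT::get().get_ist(vector));

        // Return the IST for this vector to what it was
        if (old_ist)
            FakeIDT::get().set_ist(vector, old_ist);
    }

    int main()
    {
        FakeIDT::get().set_ist(14, 2); // hypothetical: vector 14 on IST 2
        isr_handler(14);
        printf("IST for vector 14 restored to %d\n", FakeIDT::get().get_ist(14));
        return 0;
    }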
Author: Justin C. Miller
Date:   2021-02-10 01:19:41 -08:00
Parent: 872f178d94
Commit: 2d4a65c654
9 changed files with 159 additions and 104 deletions

View File

@@ -38,7 +38,7 @@ ap_startup:
 align 8
 .pml4: dq 0
-.stack: dq 0
+.cpu: dq 0
 .ret: dq 0
 align 16
@@ -100,7 +100,8 @@ align 8
 mov gs, ax
 mov ss, ax
-mov rax, [BASE + (.stack - ap_startup)]
+mov rdi, [BASE + (.cpu - ap_startup)]
+mov rax, [rdi + CPU_DATA.rsp0]
 mov rsp, rax
 mov rax, [BASE + (.ret - ap_startup)]
@@ -121,8 +122,8 @@ init_ap_trampoline:
 ; rdi is the kernel pml4
 mov [BASE + (ap_startup.pml4 - ap_startup)], rdi
-; rsi is the stack for this AP
-mov [BASE + (ap_startup.stack - ap_startup)], rsi
+; rsi is the cpu data for this AP
+mov [BASE + (ap_startup.cpu - ap_startup)], rsi
 ; rdx is the address to jump to
 mov [BASE + (ap_startup.ret - ap_startup)], rdx

View File

@@ -40,65 +40,26 @@ cpu_validate()
 }
 
 void
-init_cpu(bool bsp)
+cpu_early_init(cpu_data *cpu)
 {
-    extern TSS &g_bsp_tss;
-    extern GDT &g_bsp_gdt;
-    extern vm_area_guarded &g_kernel_stacks;
-
-    uint8_t id = 0;
-    TSS *tss = nullptr;
-    GDT *gdt = nullptr;
-    cpu_data *cpu = nullptr;
-
-    if (bsp) {
-        gdt = &g_bsp_gdt;
-        tss = &g_bsp_tss;
-        cpu = &g_bsp_cpu_data;
-    } else {
-        g_idt.install();
-        tss = new TSS;
-        gdt = new GDT {tss};
-        cpu = new cpu_data;
-        gdt->install();
-
-        lapic &apic = device_manager::get().get_lapic();
-        id = apic.get_id();
-    }
-
-    kutil::memset(cpu, 0, sizeof(cpu_data));
-    cpu->self = cpu;
-    cpu->id = id;
-    cpu->gdt = gdt;
-    cpu->tss = tss;
+    IDT::get().install();
+    cpu->gdt->install();
 
     // Install the GS base pointint to the cpu_data
     wrmsr(msr::ia32_gs_base, reinterpret_cast<uintptr_t>(cpu));
+}
 
-    using memory::frame_size;
-    using memory::kernel_stack_pages;
-    constexpr size_t stack_size = kernel_stack_pages * frame_size;
-
-    uint8_t ist_entries = g_idt.used_ist_entries();
-
-    // Set up the IST stacks
-    for (unsigned ist = 1; ist < 8; ++ist) {
-        if (!(ist_entries & (1 << ist)))
-            continue;
-
-        // Two zero entries at the top for the null frame
-        uintptr_t stack_bottom = g_kernel_stacks.get_section();
-        uintptr_t stack_top = stack_bottom + stack_size - 2 * sizeof(uintptr_t);
-
-        // Pre-realize these stacks, they're no good if they page fault
-        *reinterpret_cast<uint64_t*>(stack_top) = 0;
-        tss->ist_stack(ist) = stack_top;
+void
+cpu_init(cpu_data *cpu, bool bsp)
+{
+    if (!bsp) {
+        // The BSP already called cpu_early_init
+        cpu_early_init(cpu);
     }
 
+    lapic &apic = device_manager::get().get_lapic();
+    cpu->id = apic.get_id();
+
     // Set up the syscall MSRs
     syscall_enable();

View File

@@ -38,9 +38,16 @@ struct cpu_data
 extern "C" cpu_data * _current_gsbase();
 
-/// Initialize a CPU and set up its cpu_data structure
-/// \arg bsp True if the current CPU is the BSP
-void init_cpu(bool bsp);
+/// Set up the running CPU. This sets GDT, IDT, and necessary MSRs as well as creating
+/// the cpu_data structure for this processor.
+/// \arg cpu The cpu_data structure for this CPU
+/// \arg bsp True if this CPU is the BSP
+void cpu_init(cpu_data *cpu, bool bsp);
+
+/// Do early (before cpu_init) initialization work. Only needs to be called manually for
+/// the BSP, otherwise cpu_init will call it.
+/// \arg cpu The cpu_data structure for this CPU
+void cpu_early_init(cpu_data *cpu);
 
 /// Get the cpu_data struct for the current executing CPU
 inline cpu_data & current_cpu() { return *_current_gsbase(); }
@@ -49,7 +56,3 @@ inline cpu_data & current_cpu() { return *_current_gsbase(); }
 /// validated the required features, but still iterate the options and log about them.
 void cpu_validate();
-/// Set up the running CPU. This sets GDT, IDT, and necessary MSRs as well as creating
-/// the cpu_data structure for this processor.
-/// \arg bsp True if this CPU is the BSP
-void cpu_initialize(bool bsp);

View File

@@ -37,6 +37,12 @@ IDT::IDT()
 #undef ISR
 }
 
+IDT &
+IDT::get()
+{
+    return g_idt;
+}
+
 void
 IDT::install() const
 {
@@ -85,12 +91,6 @@ IDT::set(uint8_t i, void (*handler)(), uint16_t selector, uint8_t flags)
     m_entries[i].reserved = 0;
 }
 
-void
-IDT::set_ist(uint8_t i, uint8_t ist)
-{
-    m_entries[i].ist = ist;
-}
-
 void
 IDT::dump(unsigned index) const
 {

View File

@@ -6,8 +6,6 @@
 class IDT
 {
 public:
-    static constexpr unsigned count = 256;
-
     IDT();
 
     /// Install this IDT to the current CPU
@@ -21,11 +19,15 @@ public:
     /// Get the IST entry used by an entry.
     /// \arg i Which IDT entry to look in
    /// \returns The IST index used by entry i, or 0 for none
-    inline uint8_t get_ist(unsigned i) const {
-        if (i >= count) return 0;
+    inline uint8_t get_ist(uint8_t i) const {
         return m_entries[i].ist;
     }
 
+    /// Set the IST entry used by an entry.
+    /// \arg i Which IDT entry to set
+    /// \arg ist The IST index for entry i, or 0 for none
+    void set_ist(uint8_t i, uint8_t ist) { m_entries[i].ist = ist; }
+
     /// Get the IST entries that are used by this table, as a bitmap
     uint8_t used_ist_entries() const;
@@ -33,9 +35,11 @@ public:
     /// \arg index Which entry to print, or -1 for all entries
     void dump(unsigned index = -1) const;
 
+    /// Get the global IDT
+    static IDT & get();
+
 private:
     void set(uint8_t i, void (*handler)(), uint16_t selector, uint8_t flags);
-    void set_ist(uint8_t i, uint8_t ist);
 
     struct descriptor
     {
@@ -57,5 +61,3 @@ private:
     descriptor m_entries[256];
     ptr m_ptr;
 };
-
-extern IDT &g_idt;

View File

@@ -83,10 +83,11 @@ isr_handler(cpu_state *regs)
     console *cons = console::get();
     uint8_t vector = regs->interrupt & 0xff;
 
-    TSS &tss = TSS::current();
-    uint8_t ist = g_idt.get_ist(vector);
-    if (ist)
-        tss.ist_stack(ist) -= increment_offset;
+    // Clear out the IST for this vector so we just keep using
+    // this stack
+    uint8_t old_ist = IDT::get().get_ist(vector);
+    if (old_ist)
+        IDT::get().set_ist(vector, 0);
 
     switch (static_cast<isr>(vector)) {
@@ -122,6 +123,16 @@ isr_handler(cpu_state *regs)
         }
         break;
 
+    case isr::isrDoubleFault:
+        cons->set_color(9);
+        cons->printf("\nDouble Fault:\n");
+        cons->set_color();
+
+        print_regs(*regs);
+        print_stacktrace(2);
+        _halt();
+        break;
+
     case isr::isrGPFault: {
         cons->set_color(9);
         cons->puts("\nGeneral Protection Fault:\n");
@@ -141,7 +152,7 @@ isr_handler(cpu_state *regs)
         case 1:
         case 3:
             cons->printf(" IDT[%x]\n", index);
-            g_idt.dump(index);
+            IDT::get().dump(index);
             break;
 
         default:
@@ -261,8 +272,9 @@ isr_handler(cpu_state *regs)
         _halt();
     }
 
-    if (ist)
-        tss.ist_stack(ist) += increment_offset;
+    // Return the IST for this vector to what it was
+    if (old_ist)
+        IDT::get().set_ist(vector, old_ist);
 
     *reinterpret_cast<uint32_t *>(apic_eoi_addr) = 0;
 }

View File

@@ -37,9 +37,9 @@ extern "C" {
     void kernel_main(kernel::args::header *header);
     void (*__ctors)(void);
     void (*__ctors_end)(void);
-    void long_ap_startup();
+    void long_ap_startup(cpu_data *cpu);
     void ap_startup();
-    void init_ap_trampoline(void*, uintptr_t, void (*)());
+    void init_ap_trampoline(void*, cpu_data *, void (*)(cpu_data *));
 }
 
 extern void __kernel_assert(const char *, unsigned, const char *);
@@ -118,23 +118,29 @@ kernel_main(args::header *header)
         logger_clear_immediate();
     }
 
+    extern IDT &g_idt;
     extern TSS &g_bsp_tss;
     extern GDT &g_bsp_gdt;
+    extern cpu_data g_bsp_cpu_data;
 
-    TSS *tss = new (&g_bsp_tss) TSS;
-    GDT *gdt = new (&g_bsp_gdt) GDT {tss};
-    gdt->install();
-
     IDT *idt = new (&g_idt) IDT;
-    idt->install();
+
+    cpu_data *cpu = &g_bsp_cpu_data;
+    kutil::memset(cpu, 0, sizeof(cpu_data));
+    cpu->self = cpu;
+    cpu->tss = new (&g_bsp_tss) TSS;
+    cpu->gdt = new (&g_bsp_gdt) GDT {cpu->tss};
+    cpu_early_init(cpu);
 
     disable_legacy_pic();
     memory_initialize_pre_ctors(*header);
-    init_cpu(true);
     run_constructors();
     memory_initialize_post_ctors(*header);
 
+    cpu->tss->create_ist_stacks(idt->used_ist_entries());
+
     for (size_t i = 0; i < header->num_modules; ++i) {
         args::module &mod = header->modules[i];
         void *virt = memory::to_virtual<void>(mod.location);
@@ -154,11 +160,15 @@ kernel_main(args::header *header)
     device_manager &devices = device_manager::get();
     devices.parse_acpi(header->acpi_table);
 
+    // cpu_init relies on the APIC being set up
+    cpu_init(cpu, true);
+
     devices.init_drivers();
     devices.get_lapic().calibrate_timer();
 
     start_aps(header->pml4);
+    idt->add_ist_entries();
 
     interrupts_enable();
 
     /*
@@ -216,8 +226,8 @@ start_aps(void *kpml4)
     // Since we're using address space outside kernel space, make sure
     // the kernel's vm_space is used
-    cpu_data &cpu = current_cpu();
-    cpu.process = &g_kernel_process;
+    cpu_data &bsp = current_cpu();
+    bsp.process = &g_kernel_process;
 
     // Copy the startup code somwhere the real mode trampoline can run
     uintptr_t addr = 0x8000; // TODO: find a valid address, rewrite addresses
@@ -229,36 +239,69 @@ start_aps(void *kpml4)
         reinterpret_cast<void*>(&ap_startup),
         ap_startup_code_size);
 
-    static constexpr size_t stack_bytes = kernel_stack_pages * frame_size;
+    // AP idle stacks need less room than normal stacks, so pack multiple
+    // into a normal stack area
+    static constexpr size_t idle_stack_bytes = 1024; // 2KiB is generous
+    static constexpr size_t full_stack_bytes = kernel_stack_pages * frame_size;
+    static constexpr size_t idle_stacks_per = full_stack_bytes / idle_stack_bytes;
+
+    uint8_t ist_entries = IDT::get().used_ist_entries();
+
+    size_t free_stack_count = 0;
+    uintptr_t stack_area_start = 0;
 
     for (uint8_t id : ids) {
         if (id == apic.get_id()) continue;
-        log::info(logs::boot, "Starting AP %d", id);
 
-        size_t current_count = ap_startup_count;
-        uintptr_t stack_start = g_kernel_stacks.get_section();
-        uintptr_t stack_end = stack_start + stack_bytes - 2 * sizeof(void*);
+        // Set up the CPU data structures
+        TSS *tss = new TSS;
+        GDT *gdt = new GDT {tss};
+        cpu_data *cpu = new cpu_data;
+        kutil::memset(cpu, 0, sizeof(cpu_data));
+        cpu->self = cpu;
+        cpu->gdt = gdt;
+        cpu->tss = tss;
+        tss->create_ist_stacks(ist_entries);
+
+        // Set up the CPU's idle task stack
+        if (free_stack_count == 0) {
+            stack_area_start = g_kernel_stacks.get_section();
+            free_stack_count = idle_stacks_per;
+        }
+
+        uintptr_t stack_end = stack_area_start + free_stack_count-- * idle_stack_bytes;
+        stack_end -= 2 * sizeof(void*); // Null frame
         *reinterpret_cast<uint64_t*>(stack_end) = 0; // pre-fault the page
+        cpu->rsp0 = stack_end;
 
-        init_ap_trampoline(kpml4, stack_end, long_ap_startup);
+        // Set up the trampoline with this CPU's data
+        init_ap_trampoline(kpml4, cpu, long_ap_startup);
+
+        // Kick it off!
+        size_t current_count = ap_startup_count;
+        log::debug(logs::boot, "Starting AP %d: stack %llx", id, stack_end);
 
         apic.send_ipi(lapic::ipi_mode::init, 0, id);
         clk.spinwait(1000);
         apic.send_ipi(lapic::ipi_mode::startup, vector, id);
 
         for (unsigned i = 0; i < 20; ++i) {
             if (ap_startup_count > current_count) break;
-            clk.spinwait(10);
+            clk.spinwait(20);
         }
 
+        // If the CPU already incremented ap_startup_count, it's done
         if (ap_startup_count > current_count)
             continue;
 
+        // Send the second SIPI (intel recommends this)
         apic.send_ipi(lapic::ipi_mode::startup, vector, id);
         for (unsigned i = 0; i < 100; ++i) {
             if (ap_startup_count > current_count) break;
-            clk.spinwait(10);
+            clk.spinwait(100);
        }
+
+        log::warn(logs::boot, "No response from AP %d within timeout", id);
     }
 
     log::info(logs::boot, "%d CPUs running", ap_startup_count);
@@ -266,9 +309,9 @@ start_aps(void *kpml4)
 }
 
 void
-long_ap_startup()
+long_ap_startup(cpu_data *cpu)
 {
-    init_cpu(false);
+    cpu_init(cpu, false);
     ++ap_startup_count;
     while(1) asm("hlt");

View File

@@ -1,7 +1,11 @@
 #include "kutil/assert.h"
 #include "kutil/memory.h"
 #include "kutil/no_construct.h"
 #include "cpu.h"
+#include "kernel_memory.h"
+#include "log.h"
+#include "objects/vm_area.h"
 #include "tss.h"
 
 // The BSP's TSS is initialized _before_ global constructors are called,
@@ -37,3 +41,28 @@ TSS::ist_stack(unsigned ist)
     return m_ist[ist];
 }
+
+void
+TSS::create_ist_stacks(uint8_t ist_entries)
+{
+    extern vm_area_guarded &g_kernel_stacks;
+
+    using memory::frame_size;
+    using memory::kernel_stack_pages;
+    constexpr size_t stack_bytes = kernel_stack_pages * frame_size;
+
+    for (unsigned ist = 1; ist < 8; ++ist) {
+        if (!(ist_entries & (1 << ist))) continue;
+        // Two zero entries at the top for the null frame
+        uintptr_t stack_bottom = g_kernel_stacks.get_section();
+        uintptr_t stack_top = stack_bottom + stack_bytes - 2 * sizeof(uintptr_t);
+
+        log::debug(logs::memory, "Created IST stack at %016lx size 0x%lx",
+            stack_bottom, stack_bytes);
+
+        // Pre-realize these stacks, they're no good if they page fault
+        for (unsigned i = 0; i < kernel_stack_pages; ++i)
+            *reinterpret_cast<uint64_t*>(stack_bottom + i * frame_size) = 0;
+        ist_stack(ist) = stack_top;
+    }
+}

View File

@@ -22,6 +22,10 @@ public:
     /// \returns A mutable reference to the stack pointer
     uintptr_t & ist_stack(unsigned ist);
 
+    /// Allocate new stacks for the given IST entries.
+    /// \arg ist_entries A bitmap of used IST entries
+    void create_ist_stacks(uint8_t ist_entries);
+
 private:
     uint32_t m_reserved0;