diff --git a/Kernel/Arch/i386/Boot/boot.S b/Kernel/Arch/i386/Boot/boot.S index 8c8ea06a8d8..954ce300a4d 100644 --- a/Kernel/Arch/i386/Boot/boot.S +++ b/Kernel/Arch/i386/Boot/boot.S @@ -252,15 +252,6 @@ apic_ap_start: mov %cs, %ax mov %ax, %ds - /* Generate a new processor id. This is not the APIC id. We just - need a way to find ourselves a stack without stomping on other - APs that may be doing this concurrently. */ - xor %ax, %ax - mov %ax, %bp - inc %ax - lock; xaddw %ax, %ds:(ap_cpu_id - apic_ap_start)(%bp) /* avoid relocation entries */ - mov %ax, %bx - xor %ax, %ax mov %ax, %sp @@ -281,14 +272,18 @@ apic_ap_start32: mov %ax, %es mov %ax, %fs mov %ax, %gs - + movl $0x8000, %ebp - + + /* generate a unique ap cpu id (0 means 1st ap, not bsp!) */ + xorl %eax, %eax + incl %eax + lock; xaddl %eax, (ap_cpu_id - apic_ap_start)(%ebp) /* avoid relocation entries */ + movl %eax, %esi + /* find our allocated stack based on the generated id */ - andl 0x0000FFFF, %ebx - movl %ebx, %esi - movl (ap_cpu_init_stacks - apic_ap_start)(%ebp, %ebx, 4), %esp - + movl (ap_cpu_init_stacks - apic_ap_start)(%ebp, %eax, 4), %esp + /* check if we support NX and enable it if we do */ movl $0x80000001, %eax cpuid @@ -319,8 +314,8 @@ apic_ap_start32: lgdt (ap_cpu_gdtr_initial2 - apic_ap_start + 0xc0008000) /* jump above 3GB into our identity mapped area now */ - ljmp $8, $(1f - apic_ap_start + 0xc0008000) -1: + ljmp $8, $(apic_ap_start32_2 - apic_ap_start + 0xc0008000) +apic_ap_start32_2: /* flush the TLB */ movl %cr3, %eax movl %eax, %cr3 @@ -338,13 +333,20 @@ apic_ap_start32: movl %eax, %cr0 movl (ap_cpu_init_cr4 - apic_ap_start)(%ebp), %eax movl %eax, %cr4 - + + /* push the Processor pointer this CPU is going to use */ + movl (ap_cpu_init_processor_info_array - apic_ap_start)(%ebp), %eax + addl $0xc0000000, %eax + movl 0(%eax, %esi, 4), %eax + push %eax + + /* push the cpu id, 0 representing the bsp and call into c++ */ + incl %esi + push %esi + xor %ebp, %ebp cld - /* push the arbitrary cpu id, 0 representing the bsp and call into c++ */ - inc %esi - push %esi /* We are in identity mapped P0x8000 and the BSP will unload this code once all APs are initialized, so call init_ap but return to our infinite loop */ @@ -356,7 +358,7 @@ apic_ap_start32: apic_ap_start_size: .2byte end_apic_ap_start - apic_ap_start ap_cpu_id: - .2byte 0x0 + .4byte 0x0 ap_cpu_gdt: /* null */ .8byte 0x0 @@ -388,6 +390,9 @@ ap_cpu_init_cr3: .global ap_cpu_init_cr4 ap_cpu_init_cr4: .4byte 0x0 /* will be set at runtime */ +.global ap_cpu_init_processor_info_array +ap_cpu_init_processor_info_array: + .4byte 0x0 /* will be set at runtime */ .global ap_cpu_init_stacks ap_cpu_init_stacks: /* array of allocated stack pointers */ diff --git a/Kernel/Arch/i386/CPU.cpp b/Kernel/Arch/i386/CPU.cpp index 354447d5c51..a29b4ef4afc 100644 --- a/Kernel/Arch/i386/CPU.cpp +++ b/Kernel/Arch/i386/CPU.cpp @@ -38,42 +38,28 @@ #include #include #include +#include +#include #include +#include #include #include //#define PAGE_FAULT_DEBUG +//#define CONTEXT_SWITCH_DEBUG namespace Kernel { static DescriptorTablePointer s_idtr; -static DescriptorTablePointer s_gdtr; static Descriptor s_idt[256]; -static Descriptor s_gdt[256]; static GenericInterruptHandler* s_interrupt_handler[GENERIC_INTERRUPT_HANDLERS_COUNT]; -static Vector* s_gdt_freelist; - -static u16 s_gdt_length; - -u16 gdt_alloc_entry() -{ - ASSERT(s_gdt_freelist); - ASSERT(!s_gdt_freelist->is_empty()); - return s_gdt_freelist->take_last(); -} - -void gdt_free_entry(u16 entry) -{ - 
s_gdt_freelist->append(entry); -} - -extern "C" void handle_interrupt(RegisterState); +extern "C" void handle_interrupt(TrapFrame*); #define EH_ENTRY(ec, title) \ extern "C" void title##_asm_entry(); \ - extern "C" void title##_handler(RegisterState); \ + extern "C" void title##_handler(TrapFrame*); \ asm( \ ".globl " #title "_asm_entry\n" \ "" #title "_asm_entry: \n" \ @@ -83,22 +69,21 @@ extern "C" void handle_interrupt(RegisterState); " pushl %fs\n" \ " pushl %gs\n" \ " pushl %ss\n" \ - " mov $0x10, %ax\n" \ + " mov $" __STRINGIFY(GDT_SELECTOR_DATA0) ", %ax\n" \ " mov %ax, %ds\n" \ " mov %ax, %es\n" \ + " mov $" __STRINGIFY(GDT_SELECTOR_PROC) ", %ax\n" \ + " mov %ax, %fs\n" \ + " pushl %esp \n" /* set TrapFrame::regs */ \ + " subl $" __STRINGIFY(TRAP_FRAME_SIZE - 4) ", %esp \n" \ + " pushl %esp \n" \ " cld\n" \ + " call enter_trap_no_irq \n" \ " call " #title "_handler\n" \ - " add $0x4, %esp \n" \ - " popl %gs\n" \ - " popl %fs\n" \ - " popl %es\n" \ - " popl %ds\n" \ - " popa\n" \ - " add $0x4, %esp\n" \ - " iret\n"); + " jmp common_trap_exit \n"); #define EH_ENTRY_NO_CODE(ec, title) \ - extern "C" void title##_handler(RegisterState); \ + extern "C" void title##_handler(TrapFrame*); \ extern "C" void title##_asm_entry(); \ asm( \ ".globl " #title "_asm_entry\n" \ @@ -110,19 +95,18 @@ extern "C" void handle_interrupt(RegisterState); " pushl %fs\n" \ " pushl %gs\n" \ " pushl %ss\n" \ - " mov $0x10, %ax\n" \ + " mov $" __STRINGIFY(GDT_SELECTOR_DATA0) ", %ax\n" \ " mov %ax, %ds\n" \ " mov %ax, %es\n" \ + " mov $" __STRINGIFY(GDT_SELECTOR_PROC) ", %ax\n" \ + " mov %ax, %fs\n" \ + " pushl %esp \n" /* set TrapFrame::regs */ \ + " subl $" __STRINGIFY(TRAP_FRAME_SIZE - 4) ", %esp \n" \ + " pushl %esp \n" \ " cld\n" \ + " call enter_trap_no_irq \n" \ " call " #title "_handler\n" \ - " add $0x4, %esp\n" \ - " popl %gs\n" \ - " popl %fs\n" \ - " popl %es\n" \ - " popl %ds\n" \ - " popa\n" \ - " add $0x4, %esp\n" \ - " iret\n"); + " jmp common_trap_exit \n"); static void dump(const RegisterState& regs) { @@ -172,7 +156,7 @@ void handle_crash(RegisterState& regs, const char* description, int signal, bool // make sure we switch back to the right page tables. MM.enter_process_paging_scope(*Process::current); - klog() << "CRASH: " << description << ". Ring " << (Process::current->is_ring0() ? 0 : 3) << "."; + klog() << "CRASH: CPU #" << Processor::current().id() << " " << description << ". Ring " << (Process::current->is_ring0() ? 
0 : 3) << "."; dump(regs); if (Process::current->is_ring0()) { @@ -186,29 +170,29 @@ void handle_crash(RegisterState& regs, const char* description, int signal, bool } EH_ENTRY_NO_CODE(6, illegal_instruction); -void illegal_instruction_handler(RegisterState regs) +void illegal_instruction_handler(TrapFrame* trap) { clac(); - handle_crash(regs, "Illegal instruction", SIGILL); + handle_crash(*trap->regs, "Illegal instruction", SIGILL); } EH_ENTRY_NO_CODE(0, divide_error); -void divide_error_handler(RegisterState regs) +void divide_error_handler(TrapFrame* trap) { clac(); - handle_crash(regs, "Divide error", SIGFPE); + handle_crash(*trap->regs, "Divide error", SIGFPE); } EH_ENTRY(13, general_protection_fault); -void general_protection_fault_handler(RegisterState regs) +void general_protection_fault_handler(TrapFrame* trap) { clac(); - handle_crash(regs, "General protection fault", SIGSEGV); + handle_crash(*trap->regs, "General protection fault", SIGSEGV); } // 7: FPU not available exception EH_ENTRY_NO_CODE(7, fpu_exception); -void fpu_exception_handler(RegisterState) +void fpu_exception_handler(TrapFrame*) { // Just clear the TS flag. We've already restored the FPU state eagerly. // FIXME: It would be nice if we didn't have to do this at all. @@ -217,10 +201,11 @@ void fpu_exception_handler(RegisterState) // 14: Page Fault EH_ENTRY(14, page_fault); -void page_fault_handler(RegisterState regs) +void page_fault_handler(TrapFrame* trap) { clac(); + auto& regs = *trap->regs; u32 fault_address; asm("movl %%cr2, %%eax" : "=a"(fault_address)); @@ -294,9 +279,10 @@ void page_fault_handler(RegisterState regs) } EH_ENTRY_NO_CODE(1, debug); -void debug_handler(RegisterState regs) +void debug_handler(TrapFrame* trap) { clac(); + auto& regs = *trap->regs; if (!Process::current || (regs.cs & 3) == 0) { klog() << "Debug Exception in Ring0"; hang(); @@ -314,9 +300,10 @@ void debug_handler(RegisterState regs) } EH_ENTRY_NO_CODE(3, breakpoint); -void breakpoint_handler(RegisterState regs) +void breakpoint_handler(TrapFrame* trap) { clac(); + auto& regs = *trap->regs; if (!Process::current || (regs.cs & 3) == 0) { klog() << "Breakpoint Trap in Ring0"; hang(); @@ -356,80 +343,11 @@ EH(12, "Stack exception") EH(15, "Unknown error") EH(16, "Coprocessor error") -static void write_raw_gdt_entry(u16 selector, u32 low, u32 high) -{ - u16 i = (selector & 0xfffc) >> 3; - s_gdt[i].low = low; - s_gdt[i].high = high; - - if (i > s_gdt_length) - s_gdtr.limit = (s_gdt_length + 1) * 8 - 1; -} - -void write_gdt_entry(u16 selector, Descriptor& descriptor) -{ - write_raw_gdt_entry(selector, descriptor.low, descriptor.high); -} - -Descriptor& get_gdt_entry(u16 selector) -{ - u16 i = (selector & 0xfffc) >> 3; - return *(Descriptor*)(&s_gdt[i]); -} - -void flush_gdt() -{ - s_gdtr.address = s_gdt; - s_gdtr.limit = (s_gdt_length * 8) - 1; - asm("lgdt %0" ::"m"(s_gdtr) - : "memory"); -} - -const DescriptorTablePointer& get_gdtr() -{ - return s_gdtr; -} - const DescriptorTablePointer& get_idtr() { return s_idtr; } -void gdt_init() -{ - s_gdt_length = 5; - - s_gdt_freelist = new Vector(); - s_gdt_freelist->ensure_capacity(256); - for (size_t i = s_gdt_length; i < 256; ++i) - s_gdt_freelist->append(i * 8); - - s_gdt_length = 256; - s_gdtr.address = s_gdt; - s_gdtr.limit = (s_gdt_length * 8) - 1; - - write_raw_gdt_entry(0x0000, 0x00000000, 0x00000000); - write_raw_gdt_entry(0x0008, 0x0000ffff, 0x00cf9a00); - write_raw_gdt_entry(0x0010, 0x0000ffff, 0x00cf9200); - write_raw_gdt_entry(0x0018, 0x0000ffff, 0x00cffa00); - 
write_raw_gdt_entry(0x0020, 0x0000ffff, 0x00cff200); - - flush_gdt(); - - asm volatile( - "mov %%ax, %%ds\n" - "mov %%ax, %%es\n" - "mov %%ax, %%fs\n" - "mov %%ax, %%gs\n" - "mov %%ax, %%ss\n" ::"a"(0x10) - : "memory"); - - // Make sure CS points to the kernel code descriptor. - asm volatile( - "ljmpl $0x8, $sanity\n" - "sanity:\n"); -} - static void unimp_trap() { klog() << "Unhandled IRQ."; @@ -514,7 +432,7 @@ void flush_idt() asm("lidt %0" ::"m"(s_idtr)); } -void idt_init() +static void idt_init() { s_idtr.address = s_idt; s_idtr.limit = 0x100 * 8 - 1; @@ -683,21 +601,32 @@ void load_task_register(u16 selector) asm("ltr %0" ::"r"(selector)); } -u32 g_in_irq; - -void handle_interrupt(RegisterState regs) +void handle_interrupt(TrapFrame* trap) { clac(); - ++g_in_irq; + auto& regs = *trap->regs; ASSERT(regs.isr_number >= IRQ_VECTOR_BASE && regs.isr_number <= (IRQ_VECTOR_BASE + GENERIC_INTERRUPT_HANDLERS_COUNT)); u8 irq = (u8)(regs.isr_number - 0x50); ASSERT(s_interrupt_handler[irq]); s_interrupt_handler[irq]->handle_interrupt(regs); - s_interrupt_handler[irq]->increment_invoking_counter(); - --g_in_irq; s_interrupt_handler[irq]->eoi(); } +void enter_trap_no_irq(TrapFrame* trap) +{ + Processor::current().enter_trap(*trap, false); +} + +void enter_trap(TrapFrame* trap) +{ + Processor::current().enter_trap(*trap, true); +} + +void exit_trap(TrapFrame* trap) +{ + return Processor::current().exit_trap(*trap); +} + void sse_init() { asm volatile( @@ -740,9 +669,10 @@ void cpu_detect() g_cpu_supports_rdseed = (extended_features.ebx() & (1 << 18)); } -void cpu_setup() +void cpu_setup(u32 cpu) { - cpu_detect(); + if (cpu == 0) + cpu_detect(); if (g_cpu_supports_sse) { sse_init(); @@ -863,6 +793,424 @@ u32 read_dr6() return dr6; } +FPUState Processor::s_clean_fpu_state; + +void Processor::initialize(u32 cpu) +{ + m_self = this; + + m_cpu = cpu; + m_in_irq = 0; + + gdt_init(); + if (cpu == 0) + idt_init(); + else + flush_idt(); + + ASSERT(¤t() == this); // sanity check + + if (cpu == 0) { + ASSERT((FlatPtr(&s_clean_fpu_state) & 0xF) == 0); + asm volatile("fninit"); + asm volatile("fxsave %0" + : "=m"(s_clean_fpu_state)); + } + + klog() << "CPU #" << cpu << " using Processor at " << VirtualAddress(FlatPtr(this)); +} + +void Processor::write_raw_gdt_entry(u16 selector, u32 low, u32 high) +{ + u16 i = (selector & 0xfffc) >> 3; + u32 prev_gdt_length = m_gdt_length; + + if (i > m_gdt_length) { + m_gdt_length = i + 1; + ASSERT(m_gdt_length <= sizeof(m_gdt) / sizeof(m_gdt[0])); + m_gdtr.limit = (m_gdt_length + 1) * 8 - 1; + } + m_gdt[i].low = low; + m_gdt[i].high = high; + + // clear selectors we may have skipped + while (i < prev_gdt_length) { + m_gdt[i].low = 0; + m_gdt[i].high = 0; + i++; + } +} + +void Processor::write_gdt_entry(u16 selector, Descriptor& descriptor) +{ + write_raw_gdt_entry(selector, descriptor.low, descriptor.high); +} + +Descriptor& Processor::get_gdt_entry(u16 selector) +{ + u16 i = (selector & 0xfffc) >> 3; + return *(Descriptor*)(&m_gdt[i]); +} + +void Processor::flush_gdt() +{ + m_gdtr.address = m_gdt; + m_gdtr.limit = (m_gdt_length * 8) - 1; + asm volatile("lgdt %0" ::"m"(m_gdtr) + : "memory"); +} + +const DescriptorTablePointer& Processor::get_gdtr() +{ + return m_gdtr; +} + +extern "C" void enter_thread_context(Thread* from_thread, Thread* to_thread) +{ + ASSERT(from_thread == to_thread || from_thread->state() != Thread::Running); + ASSERT(to_thread->state() == Thread::Running); + auto& from_tss = from_thread->tss(); + auto& to_tss = to_thread->tss(); + asm volatile("fxsave %0" 
+ : "=m"(from_thread->fpu_state())); + + from_tss.fs = get_fs(); + from_tss.gs = get_gs(); + set_fs(to_tss.fs); + set_gs(to_tss.gs); + + auto& tls_descriptor = Processor::current().get_gdt_entry(GDT_SELECTOR_TLS); + tls_descriptor.set_base(to_thread->thread_specific_data().as_ptr()); + tls_descriptor.set_limit(to_thread->thread_specific_region_size()); + + if (from_tss.cr3 != to_tss.cr3) + write_cr3(to_tss.cr3); + + asm volatile("fxrstor %0" + ::"m"(to_thread->fpu_state())); + + // TODO: debug registers + // TODO: ioperm? +} + +#define ENTER_THREAD_CONTEXT_ARGS_SIZE (2 * 4) // to_thread, from_thread + +void Processor::switch_context(Thread* from_thread, Thread* to_thread) +{ + ASSERT(!in_irq()); + ASSERT(is_kernel_mode()); +#ifdef CONTEXT_SWITCH_DEBUG + dbg() << "switch_context --> switching out of: " << *from_thread; +#endif + + // Switch to new thread context, passing from_thread and to_thread + // through to the new context using registers edx and eax + asm volatile( + // NOTE: changing how much we push to the stack affects + // SWITCH_CONTEXT_TO_STACK_SIZE and thread_context_first_enter()! + "pushfl \n" + "pushl %%ebx \n" + "pushl %%esi \n" + "pushl %%edi \n" + "pushl %%ebp \n" + "movl %%esp, %[from_esp] \n" + "movl $1f, %[from_eip] \n" + "movl %[to_esp0], %%ebx \n" + "movl %%ebx, %[tss_esp0] \n" + "movl %[to_esp], %%esp \n" + "pushl %[to_thread] \n" + "pushl %[from_thread] \n" + "pushl %[to_eip] \n" + "cld \n" + "jmp enter_thread_context \n" + "1: \n" + "popl %%edx \n" + "popl %%eax \n" + "popl %%ebp \n" + "popl %%edi \n" + "popl %%esi \n" + "popl %%ebx \n" + "popfl \n" + : [from_esp] "=m" (from_thread->tss().esp), + [from_eip] "=m" (from_thread->tss().eip), + [tss_esp0] "=m" (m_tss.esp0), + "=d" (from_thread), // needed so that from_thread retains the correct value + "=a" (to_thread) // needed so that to_thread retains the correct value + : [to_esp] "g" (to_thread->tss().esp), + [to_esp0] "g" (to_thread->tss().esp0), + [to_eip] "c" (to_thread->tss().eip), + [from_thread] "d" (from_thread), + [to_thread] "a" (to_thread) + ); +#ifdef CONTEXT_SWITCH_DEBUG + dbg() << "switch_context <-- from " << *from_thread << " to " << *to_thread; +#endif +} + +extern "C" void context_first_init(Thread* from_thread, Thread* to_thread, TrapFrame* trap) +{ + ASSERT(!are_interrupts_enabled()); + ASSERT(is_kernel_mode()); + (void)from_thread; + (void)to_thread; + (void)trap; +#ifdef CONTEXT_SWITCH_DEBUG + dbg() << "switch_context <-- from " << *from_thread << " to " << *to_thread << " (context_first_init)"; +#endif +} + +extern "C" void thread_context_first_enter(void); +asm( +// enter_thread_context returns to here first time a thread is executing +".globl thread_context_first_enter \n" +"thread_context_first_enter: \n" +// switch_context will have pushed from_thread and to_thread to our new +// stack prior to thread_context_first_enter() being called, and the +// pointer to TrapFrame was the top of the stack before that +" movl 8(%esp), %ebx \n" // save pointer to TrapFrame +" cld \n" +" call context_first_init \n" +" addl $" __STRINGIFY(ENTER_THREAD_CONTEXT_ARGS_SIZE) ", %esp \n" +" movl %ebx, 0(%esp) \n" // push pointer to TrapFrame +" jmp common_trap_exit \n" +); + +u32 Processor::init_context(Thread& thread) +{ + ASSERT(is_kernel_mode()); + const u32 kernel_stack_top = thread.kernel_stack_top(); + u32 stack_top = kernel_stack_top; + + // TODO: handle NT? 
+ ASSERT((cpu_flags() & 0x24000) == 0); // Assume !(NT | VM) + + auto& tss = thread.tss(); + bool return_to_user = (tss.cs & 3) != 0; + + // make room for an interrupt frame + if (!return_to_user) { + // userspace_esp and userspace_ss are not popped off by iret + // unless we're switching back to user mode + stack_top -= sizeof(RegisterState) - 2 * sizeof(u32); + } else { + stack_top -= sizeof(RegisterState); + } + + // we want to end up 16-byte aligned, %esp + 4 should be aligned + stack_top -= sizeof(u32); + *reinterpret_cast(kernel_stack_top - 4) = 0; + + // set up the stack so that after returning from thread_context_first_enter() + // we will end up either in kernel mode or user mode, depending on how the thread is set up + // However, the first step is to always start in kernel mode with thread_context_first_enter + RegisterState& iretframe = *reinterpret_cast(stack_top); + iretframe.ss = tss.ss; + iretframe.gs = tss.gs; + iretframe.fs = tss.fs; + iretframe.es = tss.es; + iretframe.ds = tss.ds; + iretframe.edi = tss.edi; + iretframe.esi = tss.esi; + iretframe.ebp = tss.ebp; + iretframe.esp = 0; + iretframe.ebx = tss.ebx; + iretframe.edx = tss.edx; + iretframe.ecx = tss.ecx; + iretframe.eax = tss.eax; + iretframe.eflags = tss.eflags; + iretframe.eip = tss.eip; + iretframe.cs = tss.cs; + if (return_to_user) { + iretframe.userspace_esp = tss.esp; + iretframe.userspace_ss = tss.ss; + } + + // make space for a trap frame + stack_top -= sizeof(TrapFrame); + TrapFrame& trap = *reinterpret_cast(stack_top); + trap.regs = &iretframe; + trap.prev_irq_level = 0; + + stack_top -= sizeof(u32); // pointer to TrapFrame + *reinterpret_cast(stack_top) = stack_top + 4; + +#ifdef CONTEXT_SWITCH_DEBUG + dbg() << "init_context " << thread << " set up to execute at eip: " << VirtualAddress(tss.eip) << " esp: " << VirtualAddress(tss.esp) << " stack top: " << VirtualAddress(stack_top); +#endif + + // make switch_context() always first return to thread_context_first_enter() + // in kernel mode, so set up these values so that we end up popping iretframe + // off the stack right after the context switch completed, at which point + // control is transferred to what iretframe is pointing to. 
+ tss.eip = FlatPtr(&thread_context_first_enter); + tss.esp0 = kernel_stack_top; + tss.esp = stack_top; + tss.cs = GDT_SELECTOR_CODE0; + tss.ds = GDT_SELECTOR_DATA0; + tss.es = GDT_SELECTOR_DATA0; + tss.gs = GDT_SELECTOR_DATA0; + tss.ss = GDT_SELECTOR_DATA0; + tss.fs = GDT_SELECTOR_PROC; + return stack_top; +} + + +extern "C" u32 do_init_context(Thread* thread) +{ + return Processor::init_context(*thread); +} + +extern "C" void do_assume_context(Thread* thread); + +asm( +".global do_assume_context \n" +"do_assume_context: \n" +" movl 4(%esp), %ebx \n" +// We're going to call Processor::init_context, so just make sure +// we have enough stack space so we don't stomp over it +" subl $(" __STRINGIFY(4 + REGISTER_STATE_SIZE + TRAP_FRAME_SIZE + 4) "), %esp \n" +" pushl %ebx \n" +" cld \n" +" call do_init_context \n" +" addl $4, %esp \n" +" movl %eax, %esp \n" // move stack pointer to what Processor::init_context set up for us +" pushl %ebx \n" // push to_thread +" pushl %ebx \n" // push from_thread +" pushl $thread_context_first_enter \n" // should be same as tss.eip +" jmp enter_thread_context \n" +); + +void Processor::assume_context(Thread& thread) +{ + do_assume_context(&thread); + ASSERT_NOT_REACHED(); +} + +void Processor::initialize_context_switching(Thread& initial_thread) +{ + ASSERT(initial_thread.process().is_ring0()); + + auto& tss = initial_thread.tss(); + m_tss = tss; + m_tss.esp0 = tss.esp0; + m_tss.ss0 = GDT_SELECTOR_DATA0; + // user mode needs to be able to switch to kernel mode: + m_tss.cs = m_tss.ds = m_tss.es = m_tss.gs = m_tss.ss = GDT_SELECTOR_CODE0 | 3; + m_tss.fs = GDT_SELECTOR_PROC | 3; + + + asm volatile( + "movl %[new_esp], %%esp \n" // swich to new stack + "pushl %[from_to_thread] \n" // to_thread + "pushl %[from_to_thread] \n" // from_thread + "pushl $" __STRINGIFY(GDT_SELECTOR_CODE0) " \n" + "pushl %[new_eip] \n" // save the entry eip to the stack + "movl %%esp, %%ebx \n" + "addl $20, %%ebx \n" // calculate pointer to TrapFrame + "pushl %%ebx \n" + "cld \n" + "call enter_trap_no_irq \n" + "addl $4, %%esp \n" + "lret \n" + :: [new_esp] "g" (tss.esp), + [new_eip] "a" (tss.eip), + [from_to_thread] "b" (&initial_thread) + ); + + ASSERT_NOT_REACHED(); +} + +void Processor::enter_trap(TrapFrame& trap, bool raise_irq) +{ + InterruptDisabler disabler; + trap.prev_irq_level = m_in_irq; + if (raise_irq) + m_in_irq++; +} + +void Processor::exit_trap(TrapFrame& trap) +{ + InterruptDisabler disabler; + ASSERT(m_in_irq >= trap.prev_irq_level); + m_in_irq = trap.prev_irq_level; + + if (m_invoke_scheduler_async && !m_in_irq) { + m_invoke_scheduler_async = false; + Scheduler::invoke_async(); + } +} + +void Processor::gdt_init() +{ + m_gdt_length = 0; + m_gdtr.address = nullptr; + m_gdtr.limit = 0; + + write_raw_gdt_entry(0x0000, 0x00000000, 0x00000000); + write_raw_gdt_entry(GDT_SELECTOR_CODE0, 0x0000ffff, 0x00cf9a00); // code0 + write_raw_gdt_entry(GDT_SELECTOR_DATA0, 0x0000ffff, 0x00cf9200); // data0 + write_raw_gdt_entry(GDT_SELECTOR_CODE3, 0x0000ffff, 0x00cffa00); // code3 + write_raw_gdt_entry(GDT_SELECTOR_DATA3, 0x0000ffff, 0x00cff200); // data3 + + Descriptor tls_descriptor; + tls_descriptor.low = tls_descriptor.high = 0; + tls_descriptor.dpl = 3; + tls_descriptor.segment_present = 1; + tls_descriptor.granularity = 0; + tls_descriptor.zero = 0; + tls_descriptor.operation_size = 1; + tls_descriptor.descriptor_type = 1; + tls_descriptor.type = 2; + write_gdt_entry(GDT_SELECTOR_TLS, tls_descriptor); // tls3 + + Descriptor fs_descriptor; + fs_descriptor.set_base(this); + 
fs_descriptor.set_limit(sizeof(Processor)); + fs_descriptor.dpl = 0; + fs_descriptor.segment_present = 1; + fs_descriptor.granularity = 0; + fs_descriptor.zero = 0; + fs_descriptor.operation_size = 1; + fs_descriptor.descriptor_type = 1; + fs_descriptor.type = 2; + write_gdt_entry(GDT_SELECTOR_PROC, fs_descriptor); // fs0 + + Descriptor tss_descriptor; + tss_descriptor.set_base(&m_tss); + tss_descriptor.set_limit(sizeof(TSS32)); + tss_descriptor.dpl = 0; + tss_descriptor.segment_present = 1; + tss_descriptor.granularity = 0; + tss_descriptor.zero = 0; + tss_descriptor.operation_size = 1; + tss_descriptor.descriptor_type = 0; + tss_descriptor.type = 9; + write_gdt_entry(GDT_SELECTOR_TSS, tss_descriptor); // tss + + flush_gdt(); + load_task_register(GDT_SELECTOR_TSS); + + asm volatile( + "mov %%ax, %%ds\n" + "mov %%ax, %%es\n" + "mov %%ax, %%gs\n" + "mov %%ax, %%ss\n" ::"a"(GDT_SELECTOR_DATA0) + : "memory"); + set_fs(GDT_SELECTOR_PROC); + + // Make sure CS points to the kernel code descriptor. + asm volatile( + "ljmpl $" __STRINGIFY(GDT_SELECTOR_CODE0) ", $sanity\n" + "sanity:\n"); +} + +void Processor::set_thread_specific(u8* data, size_t len) +{ + auto& descriptor = get_gdt_entry(GDT_SELECTOR_TLS); + descriptor.set_base(data); + descriptor.set_limit(len); +} + } #ifdef DEBUG diff --git a/Kernel/Arch/i386/CPU.h b/Kernel/Arch/i386/CPU.h index 398bbcd69d6..cc6eebcdb22 100644 --- a/Kernel/Arch/i386/CPU.h +++ b/Kernel/Arch/i386/CPU.h @@ -106,6 +106,14 @@ union [[gnu::packed]] Descriptor TrapGate_32bit = 0xf, }; + void* get_base() const + { + u32 b = base_lo; + b |= base_hi << 16; + b |= base_hi2 << 24; + return reinterpret_cast(b); + } + void set_base(void* b) { base_lo = (u32)(b)&0xffff; @@ -256,8 +264,6 @@ struct RegisterState; const DescriptorTablePointer& get_gdtr(); const DescriptorTablePointer& get_idtr(); -void gdt_init(); -void idt_init(); void sse_init(); void register_interrupt_handler(u8 number, void (*f)()); void register_user_callable_interrupt_handler(u8 number, void (*f)()); @@ -267,12 +273,7 @@ void replace_single_handler_with_shared(GenericInterruptHandler&); void replace_shared_handler_with_single(GenericInterruptHandler&); void unregister_generic_interrupt_handler(u8 number, GenericInterruptHandler&); void flush_idt(); -void flush_gdt(); void load_task_register(u16 selector); -u16 gdt_alloc_entry(); -void gdt_free_entry(u16); -Descriptor& get_gdt_entry(u16 selector); -void write_gdt_entry(u16 selector, Descriptor&); void handle_crash(RegisterState&, const char* description, int signal, bool out_of_memory = false); [[noreturn]] static inline void hang() @@ -303,6 +304,39 @@ inline u32 cpu_flags() return flags; } +inline void set_fs(u32 segment) +{ + asm volatile( + "movl %%eax, %%fs" :: "a"(segment) + : "memory" + ); +} + +inline void set_gs(u32 segment) +{ + asm volatile( + "movl %%eax, %%gs" :: "a"(segment) + : "memory" + ); +} + + +inline u32 get_fs() +{ + u32 fs; + asm("mov %%fs, %%eax" + : "=a"(fs)); + return fs; +} + +inline u32 get_gs() +{ + u32 gs; + asm("mov %%gs, %%eax" + : "=a"(gs)); + return gs; +} + inline u32 read_fs_u32(u32 offset) { u32 val; @@ -460,6 +494,9 @@ struct [[gnu::packed]] RegisterState u32 userspace_ss; }; +#define REGISTER_STATE_SIZE (19 * 4) +static_assert(REGISTER_STATE_SIZE == sizeof(RegisterState)); + struct [[gnu::aligned(16)]] FPUState { u8 buffer[512]; @@ -492,6 +529,15 @@ u32 read_cr4(); u32 read_dr6(); +static inline bool is_kernel_mode() +{ + u32 cs; + asm volatile ( + "movl %%cs, %[cs] \n" + : [cs] "=g" (cs)); + return (cs & 3) == 0; +} 
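The CPU.h additions above, together with the Processor class declared just below, form the per-CPU access path this patch relies on: each core loads GDT_SELECTOR_PROC into %fs, the base of that segment is the core's own Processor object, and that object's first member (m_self) points back to itself, so Processor::current() reduces to a single %fs:0 read. What follows is a minimal sketch of that pattern for 32-bit x86, assuming ring 0 with such a segment already installed; the PerCpu, read_fs_dword and current_cpu names are placeholders invented for this illustration, not kernel symbols.

// Sketch of segment-register-based per-CPU data (illustrative only).
struct PerCpu {
    PerCpu* self;      // must stay at offset 0, like Processor::m_self
    unsigned cpu_id;   // analogous to Processor::m_cpu
    unsigned in_irq;   // analogous to Processor::m_in_irq
};

static inline unsigned read_fs_dword(unsigned offset)
{
    unsigned value;
    // %fs-relative load, the same idea as read_fs_u32() in CPU.h
    asm volatile("movl %%fs:%a[off], %[val]"
                 : [val] "=r"(value)
                 : [off] "ir"(offset));
    return value;
}

static inline PerCpu& current_cpu()
{
    // %fs:0 holds the self pointer, so one memory read locates "this CPU"
    return *reinterpret_cast<PerCpu*>(read_fs_dword(0));
}

Because the pointer is reached through a segment base rather than a global, each core gets its own Processor (and with it its own GDT and TSS) without any lookup keyed on an APIC id.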
+ class CPUID { public: CPUID(u32 function) { asm volatile("cpuid" @@ -552,6 +598,94 @@ private: SplitQword m_start; }; +class Thread; +struct TrapFrame; + +#define GDT_SELECTOR_CODE0 0x08 +#define GDT_SELECTOR_DATA0 0x10 +#define GDT_SELECTOR_CODE3 0x18 +#define GDT_SELECTOR_DATA3 0x20 +#define GDT_SELECTOR_TLS 0x28 +#define GDT_SELECTOR_PROC 0x30 +#define GDT_SELECTOR_TSS 0x38 + +class Processor { + Processor* m_self; // must be first field (%fs offset 0x0) + + DescriptorTablePointer m_gdtr; + Descriptor m_gdt[256]; + u32 m_gdt_length; + + u32 m_cpu; + u32 m_in_irq; + + TSS32 m_tss; + static FPUState s_clean_fpu_state; + + bool m_invoke_scheduler_async; + + void gdt_init(); + void write_raw_gdt_entry(u16 selector, u32 low, u32 high); + void write_gdt_entry(u16 selector, Descriptor& descriptor); + +public: + void initialize(u32 cpu); + + Descriptor& get_gdt_entry(u16 selector); + void flush_gdt(); + const DescriptorTablePointer& get_gdtr(); + + ALWAYS_INLINE static Processor& current() + { + return *(Processor*)read_fs_u32(0); + } + + ALWAYS_INLINE static u32 id() + { + return current().m_cpu; + } + + ALWAYS_INLINE u32& in_irq() + { + return m_in_irq; + } + + ALWAYS_INLINE const FPUState& clean_fpu_state() const + { + return s_clean_fpu_state; + } + + void invoke_scheduler_async() { m_invoke_scheduler_async = true; } + + void enter_trap(TrapFrame& trap, bool raise_irq); + void exit_trap(TrapFrame& trap); + + [[noreturn]] void initialize_context_switching(Thread& initial_thread); + void switch_context(Thread* from_thread, Thread* to_thread); + [[noreturn]] static void assume_context(Thread& thread); + static u32 init_context(Thread& thread); + + void set_thread_specific(u8* data, size_t len); +}; + +struct TrapFrame { + u32 prev_irq_level; + RegisterState* regs; // must be last + + TrapFrame() = delete; + TrapFrame(const TrapFrame&) = delete; + TrapFrame(TrapFrame&&) = delete; + TrapFrame& operator=(const TrapFrame&) = delete; + TrapFrame& operator=(TrapFrame&&) = delete; +}; + +#define TRAP_FRAME_SIZE (2 * 4) +static_assert(TRAP_FRAME_SIZE == sizeof(TrapFrame)); + +extern "C" void enter_trap_no_irq(TrapFrame*); +extern "C" void enter_trap(TrapFrame*); +extern "C" void exit_trap(TrapFrame*); + class MSR { uint32_t m_msr; @@ -583,7 +717,8 @@ public: } }; -void cpu_setup(); +void cpu_setup(u32 cpu); + extern bool g_cpu_supports_nx; extern bool g_cpu_supports_pae; extern bool g_cpu_supports_pge; @@ -629,6 +764,4 @@ private: u32 m_flags; }; -extern u32 g_in_irq; - } diff --git a/Kernel/Arch/i386/Interrupts.h b/Kernel/Arch/i386/Interrupts.h index c8b83fe263a..c209f95d8f5 100644 --- a/Kernel/Arch/i386/Interrupts.h +++ b/Kernel/Arch/i386/Interrupts.h @@ -27,6 +27,8 @@ #pragma once #include +#include +#include extern "C" void interrupt_common_asm_entry(); @@ -47,16 +49,35 @@ asm( " pushl %fs\n" " pushl %gs\n" " pushl %ss\n" - " mov $0x10, %ax\n" + " mov $" __STRINGIFY(GDT_SELECTOR_DATA0) ", %ax\n" " mov %ax, %ds\n" " mov %ax, %es\n" + " mov $" __STRINGIFY(GDT_SELECTOR_PROC) ", %ax\n" + " mov %ax, %fs\n" + " pushl %esp \n" // set TrapFrame::regs + " subl $" __STRINGIFY(TRAP_FRAME_SIZE - 4) ", %esp \n" + " movl %esp, %ebx \n" // save pointer to TrapFrame + " pushl %ebx \n" " cld\n" + " call enter_trap \n" + " movl %ebx, 0(%esp) \n" // push pointer to TrapFrame " call handle_interrupt\n" - " add $0x4, %esp\n" // "popl %ss" + " movl %ebx, 0(%esp) \n" // push pointer to TrapFrame + ".globl common_trap_exit \n" + "common_trap_exit: \n" + // another thread may have handled this trap at this point, so 
don't + // make assumptions about the stack other than there's a TrapFrame + // and a pointer to it. + " call exit_trap \n" + " addl $" __STRINGIFY(TRAP_FRAME_SIZE + 4) ", %esp\n" // pop TrapFrame and pointer to it + ".globl interrupt_common_asm_exit \n" + "interrupt_common_asm_exit: \n" + " addl $4, %esp\n" // pop %ss " popl %gs\n" " popl %fs\n" " popl %es\n" " popl %ds\n" " popa\n" - " add $0x4, %esp\n" - " iret\n"); + " addl $0x4, %esp\n" // skip exception_code, isr_number + " iret\n" +); diff --git a/Kernel/Assertions.h b/Kernel/Assertions.h index b48273a3012..54b57fad3cb 100644 --- a/Kernel/Assertions.h +++ b/Kernel/Assertions.h @@ -28,6 +28,9 @@ #include +#define __STRINGIFY_HELPER(x) #x +#define __STRINGIFY(x) __STRINGIFY_HELPER(x) + #ifdef DEBUG [[noreturn]] void __assertion_failed(const char* msg, const char* file, unsigned line, const char* func); # define ASSERT(expr) (static_cast(expr) ? (void)0 : __assertion_failed(# expr, __FILE__, __LINE__, __PRETTY_FUNCTION__)) diff --git a/Kernel/Forward.h b/Kernel/Forward.h index 5644a373883..ee69b173680 100644 --- a/Kernel/Forward.h +++ b/Kernel/Forward.h @@ -59,6 +59,7 @@ class Scheduler; class SharedBuffer; class Socket; template class SpinLock; +class RecursiveSpinLock; template class ScopedSpinLock; class TCPSocket; class TTY; diff --git a/Kernel/Interrupts/APIC.cpp b/Kernel/Interrupts/APIC.cpp index 8498b20289d..71ae5a8bf62 100644 --- a/Kernel/Interrupts/APIC.cpp +++ b/Kernel/Interrupts/APIC.cpp @@ -38,6 +38,8 @@ #include #include +//#define APIC_DEBUG + #define IRQ_APIC_SPURIOUS 0x7f #define APIC_BASE_MSR 0x1b @@ -118,6 +120,7 @@ void APIC::write_icr(const ICRReg& icr) extern "C" void apic_ap_start(void); extern "C" u16 apic_ap_start_size; extern "C" u32 ap_cpu_init_stacks; +extern "C" u32 ap_cpu_init_processor_info_array; extern "C" u32 ap_cpu_init_cr0; extern "C" u32 ap_cpu_init_cr3; extern "C" u32 ap_cpu_init_cr4; @@ -151,7 +154,9 @@ bool APIC::init_bsp() return false; PhysicalAddress apic_base = get_base(); +#ifdef APIC_DEBUG klog() << "Initializing APIC, base: " << apic_base; +#endif set_base(apic_base); m_apic_base = MM.allocate_kernel_region(apic_base.page_base(), PAGE_ROUND_UP(1), {}, Region::Access::Read | Region::Access::Write); @@ -177,8 +182,10 @@ bool APIC::init_bsp() size_t entry_length = madt_entry->length; if (madt_entry->type == (u8)ACPI::Structures::MADTEntryType::LocalAPIC) { auto* plapic_entry = (const ACPI::Structures::MADTEntries::ProcessorLocalAPIC*)madt_entry; +#ifdef APIC_DEBUG klog() << "APIC: AP found @ MADT entry " << entry_index << ", Processor Id: " << String::format("%02x", plapic_entry->acpi_processor_id) << " APIC Id: " << String::format("%02x", plapic_entry->apic_id) << " Flags: " << String::format("%08x", plapic_entry->flags); +#endif processor_cnt++; if ((plapic_entry->flags & 0x1) != 0) processor_enabled_cnt++; @@ -201,7 +208,10 @@ bool APIC::init_bsp() u32 aps_to_enable = processor_enabled_cnt - 1; // Copy the APIC startup code and variables to P0x00008000 - auto apic_startup_region = MM.allocate_kernel_region_identity(PhysicalAddress(0x8000), PAGE_ROUND_UP(apic_ap_start_size), {}, Region::Access::Read | Region::Access::Write | Region::Access::Execute); + // Also account for the data appended to: + // * aps_to_enable u32 values for ap_cpu_init_stacks + // * aps_to_enable u32 values for ap_cpu_init_processor_info_array + auto apic_startup_region = MM.allocate_kernel_region_identity(PhysicalAddress(0x8000), PAGE_ROUND_UP(apic_ap_start_size + (2 * aps_to_enable * sizeof(u32))), {}, 
Region::Access::Read | Region::Access::Write | Region::Access::Execute); memcpy(apic_startup_region->vaddr().as_ptr(), reinterpret_cast(apic_ap_start), apic_ap_start_size); // Allocate enough stacks for all APs @@ -212,20 +222,35 @@ bool APIC::init_bsp() return false; } stack_region->set_stack(true); - klog() << "APIC: Allocated AP #" << i << " stack at " << stack_region->vaddr(); m_apic_ap_stacks.append(stack_region.release_nonnull()); } // Store pointers to all stacks for the APs to use auto ap_stack_array = APIC_INIT_VAR_PTR(u32, apic_startup_region->vaddr().as_ptr(), ap_cpu_init_stacks); - for (size_t i = 0; i < m_apic_ap_stacks.size(); i++) + ASSERT(aps_to_enable == m_apic_ap_stacks.size()); + for (size_t i = 0; i < aps_to_enable; i++) { ap_stack_array[i] = m_apic_ap_stacks[i].vaddr().get() + Thread::default_kernel_stack_size; +#ifdef APIC_DEBUG + klog() << "APIC: CPU[" << (i + 1) << "] stack at " << VirtualAddress(ap_stack_array[i]); +#endif + } + + // Allocate Processor structures for all APs and store the pointer to the data + m_ap_processor_info.resize(aps_to_enable); + auto ap_processor_info_array = &ap_stack_array[aps_to_enable]; + for (size_t i = 0; i < aps_to_enable; i++) { + ap_processor_info_array[i] = FlatPtr(&m_ap_processor_info.at(i)); +#ifdef APIC_DEBUG + klog() << "APIC: CPU[" << (i + 1) << "] Processor at " << VirtualAddress(ap_processor_info_array[i]); +#endif + } + *APIC_INIT_VAR_PTR(u32, apic_startup_region->vaddr().as_ptr(), ap_cpu_init_processor_info_array) = FlatPtr(&ap_processor_info_array[0]); // Store the BSP's CR3 value for the APs to use *APIC_INIT_VAR_PTR(u32, apic_startup_region->vaddr().as_ptr(), ap_cpu_init_cr3) = MM.kernel_page_directory().cr3(); // Store the BSP's GDT and IDT for the APs to use - const auto& gdtr = get_gdtr(); + const auto& gdtr = Processor::current().get_gdtr(); *APIC_INIT_VAR_PTR(u32, apic_startup_region->vaddr().as_ptr(), ap_cpu_gdtr) = FlatPtr(&gdtr); const auto& idtr = get_idtr(); *APIC_INIT_VAR_PTR(u32, apic_startup_region->vaddr().as_ptr(), ap_cpu_idtr) = FlatPtr(&idtr); @@ -233,8 +258,10 @@ bool APIC::init_bsp() // Store the BSP's CR0 and CR4 values for the APs to use *APIC_INIT_VAR_PTR(u32, apic_startup_region->vaddr().as_ptr(), ap_cpu_init_cr0) = read_cr0(); *APIC_INIT_VAR_PTR(u32, apic_startup_region->vaddr().as_ptr(), ap_cpu_init_cr4) = read_cr4(); - + +#ifdef APIC_DEBUG klog() << "APIC: Starting " << aps_to_enable << " AP(s)"; +#endif // INIT write_icr(ICRReg(0, ICRReg::INIT, ICRReg::Physical, ICRReg::Assert, ICRReg::TriggerMode::Edge, ICRReg::AllExcludingSelf)); @@ -250,14 +277,18 @@ bool APIC::init_bsp() // Now wait until the ap_cpu_init_pending variable dropped to 0, which means all APs are initialized and no longer need these special mappings if (m_apic_ap_count.load(AK::MemoryOrder::memory_order_consume) != aps_to_enable) { +#ifdef APIC_DEBUG klog() << "APIC: Waiting for " << aps_to_enable << " AP(s) to finish initialization..."; +#endif do { // Wait a little bit IO::delay(200); } while (m_apic_ap_count.load(AK::MemoryOrder::memory_order_consume) != aps_to_enable); } - + +#ifdef APIC_DEBUG klog() << "APIC: " << processor_enabled_cnt << " processors are initialized and running"; +#endif } return true; } @@ -270,8 +301,9 @@ void APIC::enable_bsp() void APIC::enable(u32 cpu) { - if (cpu == 0)// FIXME: once memory management can deal with it, re-enable for all - klog() << "Enabling local APIC for cpu #" << cpu; +#ifdef APIC_DEBUG + klog() << "Enabling local APIC for cpu #" << cpu; +#endif if (cpu == 0) { // dummy read, 
apparently to avoid a bug in old CPUs. diff --git a/Kernel/Interrupts/APIC.h b/Kernel/Interrupts/APIC.h index cce63ca57e3..17a2496fe8c 100644 --- a/Kernel/Interrupts/APIC.h +++ b/Kernel/Interrupts/APIC.h @@ -87,6 +87,7 @@ private: OwnPtr m_apic_base; NonnullOwnPtrVector m_apic_ap_stacks; + Vector m_ap_processor_info; AK::Atomic m_apic_ap_count{0}; static PhysicalAddress get_base(); diff --git a/Kernel/Lock.cpp b/Kernel/Lock.cpp index b63651b70a0..4c86164cf49 100644 --- a/Kernel/Lock.cpp +++ b/Kernel/Lock.cpp @@ -44,7 +44,6 @@ static bool modes_conflict(Lock::Mode mode1, Lock::Mode mode2) void Lock::lock(Mode mode) { ASSERT(mode != Mode::Unlocked); - ASSERT(!Scheduler::is_active()); if (!are_interrupts_enabled()) { klog() << "Interrupts disabled when trying to take Lock{" << m_name << "}"; dump_backtrace(); diff --git a/Kernel/Process.cpp b/Kernel/Process.cpp index dc773828d4a..ad85ed1c7bb 100644 --- a/Kernel/Process.cpp +++ b/Kernel/Process.cpp @@ -859,9 +859,17 @@ int Process::do_exec(NonnullRefPtr main_program_description, Ve // No other thread from this process will be scheduled to run m_exec_tid = Thread::current->tid(); - auto old_page_directory = move(m_page_directory); - auto old_regions = move(m_regions); - m_page_directory = PageDirectory::create_for_userspace(*this); + RefPtr old_page_directory; + NonnullOwnPtrVector old_regions; + + { + // Need to make sure we don't swap contexts in the middle + InterruptDisabler disabler; + old_page_directory = move(m_page_directory); + old_regions = move(m_regions); + m_page_directory = PageDirectory::create_for_userspace(*this); + } + #ifdef MM_DEBUG dbg() << "Process " << pid() << " exec: PD=" << m_page_directory.ptr() << " created"; #endif @@ -898,6 +906,8 @@ int Process::do_exec(NonnullRefPtr main_program_description, Ve { ArmedScopeGuard rollback_regions_guard([&]() { ASSERT(Process::current == this); + // Need to make sure we don't swap contexts in the middle + InterruptDisabler disabler; m_page_directory = move(old_page_directory); m_regions = move(old_regions); MM.enter_process_paging_scope(*this); @@ -1028,7 +1038,7 @@ int Process::do_exec(NonnullRefPtr main_program_description, Ve // and we don't want to deal with faults after this point. u32 new_userspace_esp = new_main_thread->make_userspace_stack_for_main_thread(move(arguments), move(environment)); - // We cli() manually here because we don't want to get interrupted between do_exec() and Schedule::yield(). + // We cli() manually here because we don't want to get interrupted between do_exec() and Processor::assume_context(). // The reason is that the task redirection we've set up above will be clobbered by the timer IRQ. // If we used an InterruptDisabler that sti()'d on exit, we might timer tick'd too soon in exec(). if (Process::current == this) @@ -1036,15 +1046,9 @@ int Process::do_exec(NonnullRefPtr main_program_description, Ve // NOTE: Be careful to not trigger any page faults below! 
- Scheduler::prepare_to_modify_tss(*new_main_thread); - m_name = parts.take_last(); new_main_thread->set_name(m_name); - auto& tss = new_main_thread->m_tss; - - u32 old_esp0 = tss.esp0; - m_master_tls_size = master_tls_size; m_master_tls_alignment = master_tls_alignment; @@ -1052,25 +1056,21 @@ int Process::do_exec(NonnullRefPtr main_program_description, Ve new_main_thread->make_thread_specific_region({}); new_main_thread->reset_fpu_state(); - memset(&tss, 0, sizeof(TSS32)); - tss.iomapbase = sizeof(TSS32); - - tss.eflags = 0x0202; + // NOTE: if a context switch were to happen, tss.eip and tss.esp would get overwritten!!! + auto& tss = new_main_thread->m_tss; + tss.cs = GDT_SELECTOR_CODE3 | 3; + tss.ds = GDT_SELECTOR_DATA3 | 3; + tss.es = GDT_SELECTOR_DATA3 | 3; + tss.ss = GDT_SELECTOR_DATA3 | 3; + tss.fs = GDT_SELECTOR_DATA3 | 3; + tss.gs = GDT_SELECTOR_TLS | 3; tss.eip = entry_eip; - tss.cs = 0x1b; - tss.ds = 0x23; - tss.es = 0x23; - tss.fs = 0x23; - tss.gs = thread_specific_selector() | 3; - tss.ss = 0x23; - tss.cr3 = page_directory().cr3(); tss.esp = new_userspace_esp; - tss.ss0 = 0x10; - tss.esp0 = old_esp0; + tss.cr3 = m_page_directory->cr3(); tss.ss2 = m_pid; #ifdef TASK_DEBUG - klog() << "Process exec'd " << path.characters() << " @ " << String::format("%p", tss.eip); + klog() << "Process exec'd " << path.characters() << " @ " << String::format("%p", entry_eip); #endif if (was_profiling) @@ -1261,7 +1261,8 @@ int Process::exec(String path, Vector arguments, Vector environm } if (Process::current == this) { - Scheduler::yield(); + Thread::current->set_state(Thread::State::Running); + Processor::assume_context(*Thread::current); ASSERT_NOT_REACHED(); } return 0; diff --git a/Kernel/Scheduler.cpp b/Kernel/Scheduler.cpp index 311a643d02a..2a943ad8fe6 100644 --- a/Kernel/Scheduler.cpp +++ b/Kernel/Scheduler.cpp @@ -76,22 +76,10 @@ timeval Scheduler::time_since_boot() Thread* g_finalizer; Thread* g_colonel; WaitQueue* g_finalizer_wait_queue; -bool g_finalizer_has_work; +Atomic g_finalizer_has_work{false}; static Process* s_colonel_process; u64 g_uptime; -struct TaskRedirectionData { - u16 selector; - TSS32 tss; -}; -static TaskRedirectionData s_redirection; -static bool s_active; - -bool Scheduler::is_active() -{ - return s_active; -} - Thread::JoinBlocker::JoinBlocker(Thread& joinee, void*& joinee_exit_value) : m_joinee(joinee) , m_joinee_exit_value(joinee_exit_value) @@ -280,6 +268,7 @@ bool Thread::WaitBlocker::should_unblock(Thread& thread, time_t, long) return IterationDecision::Continue; m_waitee_pid = child.pid(); + dbg() << "Unblocking thread " << thread << " process " << thread.process() << " child exited: " << m_waitee_pid; should_unblock = true; return IterationDecision::Break; }); @@ -325,21 +314,26 @@ void Thread::consider_unblock(time_t now_sec, long now_usec) } } +void Scheduler::start() +{ + ASSERT_INTERRUPTS_DISABLED(); + ASSERT(!Thread::current); + Thread::current = g_colonel; + Process::current = &g_colonel->process(); + g_colonel->set_ticks_left(time_slice_for(*g_colonel)); + g_colonel->did_schedule(); + g_colonel->set_initialized(true); + Processor::init_context(*g_colonel); + g_colonel->set_state(Thread::Running); + Processor::current().initialize_context_switching(*g_colonel); + ASSERT_NOT_REACHED(); +} + bool Scheduler::pick_next() { ASSERT_INTERRUPTS_DISABLED(); - ASSERT(!s_active); - - TemporaryChange change(s_active, true); - - ASSERT(s_active); - - if (!Thread::current) { - // XXX: The first ever context_switch() goes to the idle process. 
- // This to setup a reliable place we can return to. - return context_switch(*g_colonel); - } + ASSERT(Thread::current); auto now = time_since_boot(); auto now_sec = now.tv_sec; @@ -448,52 +442,48 @@ bool Scheduler::pick_next() return context_switch(*thread_to_schedule); } +bool Scheduler::yield() +{ +//#ifdef SCHEDULER_DEBUG +#if 0 + dbg() << "Scheduler: yielding thread " << *Thread::current << " in_trap: " << Processor::current().in_trap() << " in_irq: " << Processor::current().in_irq(); +#endif + InterruptDisabler disabler; + ASSERT(Thread::current); + if (Processor::current().in_irq()) { + // If we're handling an IRQ we can't switch context, delay until + // exiting the trap + Processor::current().invoke_scheduler_async(); + } else if (!Scheduler::pick_next()) + return false; +//#ifdef SCHEDULER_DEBUG +#if 0 + dbg() << "Scheduler: yield returns to thread " << *Thread::current << " in_trap: " << Processor::current().in_trap() << " in_irq: " << Processor::current().in_irq(); +#endif + return true; +} + bool Scheduler::donate_to(Thread* beneficiary, const char* reason) { InterruptDisabler disabler; + ASSERT(!Processor::current().in_irq()); if (!Thread::is_thread(beneficiary)) return false; (void)reason; unsigned ticks_left = Thread::current->ticks_left(); if (!beneficiary || beneficiary->state() != Thread::Runnable || ticks_left <= 1) - return yield(); + return Scheduler::yield(); unsigned ticks_to_donate = min(ticks_left - 1, time_slice_for(*beneficiary)); #ifdef SCHEDULER_DEBUG dbg() << "Scheduler: Donating " << ticks_to_donate << " ticks to " << *beneficiary << ", reason=" << reason; #endif - context_switch(*beneficiary); beneficiary->set_ticks_left(ticks_to_donate); - switch_now(); + Scheduler::context_switch(*beneficiary); return false; } -bool Scheduler::yield() -{ - InterruptDisabler disabler; - ASSERT(Thread::current); - if (!pick_next()) - return false; - switch_now(); - return true; -} - -void Scheduler::pick_next_and_switch_now() -{ - bool someone_wants_to_run = pick_next(); - ASSERT(someone_wants_to_run); - switch_now(); -} - -void Scheduler::switch_now() -{ - Descriptor& descriptor = get_gdt_entry(Thread::current->selector()); - descriptor.type = 9; - asm("sti\n" - "ljmp *(%%eax)\n" ::"a"(&Thread::current->far_ptr())); -} - bool Scheduler::context_switch(Thread& thread) { thread.set_ticks_left(time_slice_for(thread)); @@ -508,96 +498,47 @@ bool Scheduler::context_switch(Thread& thread) if (Thread::current->state() == Thread::Running) Thread::current->set_state(Thread::Runnable); - asm volatile("fxsave %0" - : "=m"(Thread::current->fpu_state())); - #ifdef LOG_EVERY_CONTEXT_SWITCH dbg() << "Scheduler: " << *Thread::current << " -> " << thread << " [" << thread.priority() << "] " << String::format("%w", thread.tss().cs) << ":" << String::format("%x", thread.tss().eip); #endif } + Thread* from = Thread::current; Thread::current = &thread; Process::current = &thread.process(); - + if (!thread.is_initialized()) { + Processor::init_context(thread); + thread.set_initialized(true); + } thread.set_state(Thread::Running); - asm volatile("fxrstor %0" ::"m"(Thread::current->fpu_state())); - - if (!thread.selector()) { - thread.set_selector(gdt_alloc_entry()); - auto& descriptor = get_gdt_entry(thread.selector()); - descriptor.set_base(&thread.tss()); - descriptor.set_limit(sizeof(TSS32)); - descriptor.dpl = 0; - descriptor.segment_present = 1; - descriptor.granularity = 0; - descriptor.zero = 0; - descriptor.operation_size = 1; - descriptor.descriptor_type = 0; - } - - if 
(!thread.thread_specific_data().is_null()) { - auto& descriptor = thread_specific_descriptor(); - descriptor.set_base(thread.thread_specific_data().as_ptr()); - descriptor.set_limit(sizeof(ThreadSpecificData*)); - } - - auto& descriptor = get_gdt_entry(thread.selector()); - descriptor.type = 11; // Busy TSS + Processor::current().switch_context(from, &thread); return true; } -static void initialize_redirection() -{ - auto& descriptor = get_gdt_entry(s_redirection.selector); - descriptor.set_base(&s_redirection.tss); - descriptor.set_limit(sizeof(TSS32)); - descriptor.dpl = 0; - descriptor.segment_present = 1; - descriptor.granularity = 0; - descriptor.zero = 0; - descriptor.operation_size = 1; - descriptor.descriptor_type = 0; - descriptor.type = 9; - flush_gdt(); -} - -void Scheduler::prepare_for_iret_to_new_process() -{ - auto& descriptor = get_gdt_entry(s_redirection.selector); - descriptor.type = 9; - s_redirection.tss.backlink = Thread::current->selector(); - load_task_register(s_redirection.selector); -} - -void Scheduler::prepare_to_modify_tss(Thread& thread) -{ - // This ensures that a currently running process modifying its own TSS - // in order to yield() and end up somewhere else doesn't just end up - // right after the yield(). - if (Thread::current == &thread) - load_task_register(s_redirection.selector); -} - Process* Scheduler::colonel() { return s_colonel_process; } -void Scheduler::initialize() +void Scheduler::initialize(u32 cpu) { + ASSERT(&Processor::current() != nullptr); // sanity check g_scheduler_data = new SchedulerData; g_finalizer_wait_queue = new WaitQueue; - g_finalizer_has_work = false; - s_redirection.selector = gdt_alloc_entry(); - initialize_redirection(); - s_colonel_process = Process::create_kernel_process(g_colonel, "colonel", nullptr); - g_colonel->set_priority(THREAD_PRIORITY_MIN); - load_task_register(s_redirection.selector); + + if (cpu == 0) { + g_finalizer_has_work.store(false, AK::MemoryOrder::memory_order_release); + s_colonel_process = Process::create_kernel_process(g_colonel, "colonel", idle_loop); + g_colonel->set_priority(THREAD_PRIORITY_MIN); + } } void Scheduler::timer_tick(const RegisterState& regs) { + ASSERT_INTERRUPTS_DISABLED(); + ASSERT(Processor::current().in_irq()); + if (!Thread::current) return; @@ -622,62 +563,25 @@ void Scheduler::timer_tick(const RegisterState& regs) if (Thread::current->tick()) return; - auto& outgoing_tss = Thread::current->tss(); - - if (!pick_next()) - return; - - outgoing_tss.gs = regs.gs; - outgoing_tss.fs = regs.fs; - outgoing_tss.es = regs.es; - outgoing_tss.ds = regs.ds; - outgoing_tss.edi = regs.edi; - outgoing_tss.esi = regs.esi; - outgoing_tss.ebp = regs.ebp; - outgoing_tss.ebx = regs.ebx; - outgoing_tss.edx = regs.edx; - outgoing_tss.ecx = regs.ecx; - outgoing_tss.eax = regs.eax; - outgoing_tss.eip = regs.eip; - outgoing_tss.cs = regs.cs; - outgoing_tss.eflags = regs.eflags; - - // Compute process stack pointer. - // Add 16 for CS, EIP, EFLAGS, exception code (interrupt mechanic) - outgoing_tss.esp = regs.esp + 16; - outgoing_tss.ss = regs.ss; - - if ((outgoing_tss.cs & 3) != 0) { - outgoing_tss.ss = regs.userspace_ss; - outgoing_tss.esp = regs.userspace_esp; - } - prepare_for_iret_to_new_process(); - - // Set the NT (nested task) flag. 
- asm( - "pushf\n" - "orl $0x00004000, (%esp)\n" - "popf\n"); + ASSERT_INTERRUPTS_DISABLED(); + ASSERT(Processor::current().in_irq()); + Processor::current().invoke_scheduler_async(); } -static bool s_should_stop_idling = false; - -void Scheduler::stop_idling() +void Scheduler::invoke_async() { - if (Thread::current != g_colonel) - return; - - s_should_stop_idling = true; + ASSERT_INTERRUPTS_DISABLED(); + ASSERT(!Processor::current().in_irq()); + pick_next(); } void Scheduler::idle_loop() { + dbg() << "Scheduler: idle loop on CPU #" << Processor::current().id(); + ASSERT(are_interrupts_enabled()); for (;;) { asm("hlt"); - if (s_should_stop_idling) { - s_should_stop_idling = false; - yield(); - } + yield(); } } diff --git a/Kernel/Scheduler.h b/Kernel/Scheduler.h index ea374fa946c..7ed8642caf8 100644 --- a/Kernel/Scheduler.h +++ b/Kernel/Scheduler.h @@ -43,28 +43,25 @@ struct SchedulerData; extern Thread* g_finalizer; extern Thread* g_colonel; extern WaitQueue* g_finalizer_wait_queue; -extern bool g_finalizer_has_work; +extern Atomic g_finalizer_has_work; extern u64 g_uptime; extern SchedulerData* g_scheduler_data; extern timeval g_timeofday; class Scheduler { public: - static void initialize(); + static void initialize(u32 cpu); static void timer_tick(const RegisterState&); + [[noreturn]] static void start(); static bool pick_next(); static timeval time_since_boot(); - static void pick_next_and_switch_now(); - static void switch_now(); static bool yield(); static bool donate_to(Thread*, const char* reason); static bool context_switch(Thread&); - static void prepare_to_modify_tss(Thread&); static Process* colonel(); - static bool is_active(); static void beep(); static void idle_loop(); - static void stop_idling(); + static void invoke_async(); template static inline IterationDecision for_each_runnable(Callback); @@ -74,9 +71,6 @@ public: static void init_thread(Thread& thread); static void update_state_for_thread(Thread& thread); - -private: - static void prepare_for_iret_to_new_process(); }; } diff --git a/Kernel/SpinLock.h b/Kernel/SpinLock.h index fa2c5d9dfc0..538895e658f 100644 --- a/Kernel/SpinLock.h +++ b/Kernel/SpinLock.h @@ -69,6 +69,42 @@ public: } }; +class RecursiveSpinLock +{ + AK::Atomic m_lock{0}; + u32 m_recursions{0}; + +public: + RecursiveSpinLock() = default; + RecursiveSpinLock(const RecursiveSpinLock&) = delete; + RecursiveSpinLock(RecursiveSpinLock&&) = delete; + + ALWAYS_INLINE void lock() + { + FlatPtr cpu = FlatPtr(&Processor::current()); + FlatPtr expected = 0; + while (!m_lock.compare_exchange_strong(expected, cpu, AK::memory_order_acq_rel)) { + if (expected == cpu) + break; + expected = 0; + } + m_recursions++; + } + + ALWAYS_INLINE void unlock() + { + ASSERT(m_recursions > 0); + ASSERT(m_lock.load(AK::memory_order_consume) == FlatPtr(&Processor::current())); + if (--m_recursions == 0) + m_lock.store(0, AK::memory_order_release); + } + + ALWAYS_INLINE bool is_locked() const + { + return m_lock.load(AK::memory_order_consume) != 0; + } +}; + template > class ScopedSpinLock { diff --git a/Kernel/Syscall.cpp b/Kernel/Syscall.cpp index 50a9e99c242..e26521fd0b6 100644 --- a/Kernel/Syscall.cpp +++ b/Kernel/Syscall.cpp @@ -33,7 +33,7 @@ namespace Kernel { -extern "C" void syscall_handler(RegisterState&); +extern "C" void syscall_handler(TrapFrame*); extern "C" void syscall_asm_entry(); asm( @@ -46,22 +46,23 @@ asm( " pushl %fs\n" " pushl %gs\n" " pushl %ss\n" - " mov $0x10, %ax\n" + " mov $" __STRINGIFY(GDT_SELECTOR_DATA0) ", %ax\n" " mov %ax, %ds\n" " mov %ax, %es\n" + 
" mov $" __STRINGIFY(GDT_SELECTOR_PROC) ", %ax\n" + " mov %ax, %fs\n" " cld\n" " xor %esi, %esi\n" " xor %edi, %edi\n" - " push %esp\n" - " call syscall_handler\n" - " add $0x8, %esp\n" - " popl %gs\n" - " popl %fs\n" - " popl %es\n" - " popl %ds\n" - " popa\n" - " add $0x4, %esp\n" - " iret\n"); + " pushl %esp \n" // set TrapFrame::regs + " subl $" __STRINGIFY(TRAP_FRAME_SIZE - 4) ", %esp \n" + " movl %esp, %ebx \n" + " pushl %ebx \n" // push pointer to TrapFrame + " call enter_trap_no_irq \n" + " movl %ebx, 0(%esp) \n" // push pointer to TrapFrame + " call syscall_handler \n" + " movl %ebx, 0(%esp) \n" // push pointer to TrapFrame + " jmp common_trap_exit \n"); namespace Syscall { @@ -120,8 +121,9 @@ int handle(RegisterState& regs, u32 function, u32 arg1, u32 arg2, u32 arg3) } -void syscall_handler(RegisterState& regs) +void syscall_handler(TrapFrame* trap) { + auto& regs = *trap->regs; // Special handling of the "gettid" syscall since it's extremely hot. // FIXME: Remove this hack once userspace locks stop calling it so damn much. if (regs.eax == SC_gettid) { diff --git a/Kernel/Tasks/FinalizerTask.cpp b/Kernel/Tasks/FinalizerTask.cpp index 4b1f1ef7385..554f2149c9f 100644 --- a/Kernel/Tasks/FinalizerTask.cpp +++ b/Kernel/Tasks/FinalizerTask.cpp @@ -34,14 +34,12 @@ void FinalizerTask::spawn() Process::create_kernel_process(g_finalizer, "FinalizerTask", [] { Thread::current->set_priority(THREAD_PRIORITY_LOW); for (;;) { - { - InterruptDisabler disabler; - if (!g_finalizer_has_work) - Thread::current->wait_on(*g_finalizer_wait_queue); - ASSERT(g_finalizer_has_work); - g_finalizer_has_work = false; - } - Thread::finalize_dying_threads(); + dbg() << "Finalizer task is running"; + Thread::current->wait_on(*g_finalizer_wait_queue); + + bool expected = true; + if (g_finalizer_has_work.compare_exchange_strong(expected, false, AK::MemoryOrder::memory_order_acq_rel)) + Thread::finalize_dying_threads(); } }); } diff --git a/Kernel/Tasks/SyncTask.cpp b/Kernel/Tasks/SyncTask.cpp index f4d1dffafd1..33f18c8b3ae 100644 --- a/Kernel/Tasks/SyncTask.cpp +++ b/Kernel/Tasks/SyncTask.cpp @@ -35,6 +35,7 @@ void SyncTask::spawn() { Thread* syncd_thread = nullptr; Process::create_kernel_process(syncd_thread, "SyncTask", [] { + dbg() << "SyncTask is running"; for (;;) { VFS::the().sync(); Thread::current->sleep(1 * TimeManagement::the().ticks_per_second()); diff --git a/Kernel/Thread.cpp b/Kernel/Thread.cpp index 1f7f49a94a1..13bc03a97cc 100644 --- a/Kernel/Thread.cpp +++ b/Kernel/Thread.cpp @@ -48,30 +48,6 @@ namespace Kernel { Thread* Thread::current; -static FPUState s_clean_fpu_state; - -u16 thread_specific_selector() -{ - static u16 selector; - if (!selector) { - selector = gdt_alloc_entry(); - auto& descriptor = get_gdt_entry(selector); - descriptor.dpl = 3; - descriptor.segment_present = 1; - descriptor.granularity = 0; - descriptor.zero = 0; - descriptor.operation_size = 1; - descriptor.descriptor_type = 1; - descriptor.type = 2; - } - return selector; -} - -Descriptor& thread_specific_descriptor() -{ - return get_gdt_entry(thread_specific_selector()); -} - HashTable& thread_table() { ASSERT_INTERRUPTS_DISABLED(); @@ -103,27 +79,23 @@ Thread::Thread(Process& process) // Only IF is set when a process boots. 
m_tss.eflags = 0x0202; - u16 cs, ds, ss, gs; if (m_process.is_ring0()) { - cs = 0x08; - ds = 0x10; - ss = 0x10; - gs = 0; + m_tss.cs = GDT_SELECTOR_CODE0; + m_tss.ds = GDT_SELECTOR_DATA0; + m_tss.es = GDT_SELECTOR_DATA0; + m_tss.fs = GDT_SELECTOR_PROC; + m_tss.ss = GDT_SELECTOR_DATA0; + m_tss.gs = 0; } else { - cs = 0x1b; - ds = 0x23; - ss = 0x23; - gs = thread_specific_selector() | 3; + m_tss.cs = GDT_SELECTOR_CODE3 | 3; + m_tss.ds = GDT_SELECTOR_DATA3 | 3; + m_tss.es = GDT_SELECTOR_DATA3 | 3; + m_tss.fs = GDT_SELECTOR_DATA3 | 3; + m_tss.ss = GDT_SELECTOR_DATA3 | 3; + m_tss.gs = GDT_SELECTOR_TLS | 3; } - m_tss.ds = ds; - m_tss.es = ds; - m_tss.fs = ds; - m_tss.gs = gs; - m_tss.ss = ss; - m_tss.cs = cs; - m_tss.cr3 = m_process.page_directory().cr3(); m_kernel_stack_region = MM.allocate_kernel_region(default_kernel_stack_size, String::format("Kernel Stack (Thread %d)", m_tid), Region::Access::Read | Region::Access::Write, false, true); @@ -132,11 +104,11 @@ Thread::Thread(Process& process) m_kernel_stack_top = m_kernel_stack_region->vaddr().offset(default_kernel_stack_size).get() & 0xfffffff8u; if (m_process.is_ring0()) { - m_tss.esp = m_kernel_stack_top; + m_tss.esp = m_tss.esp0 = m_kernel_stack_top; } else { // Ring 3 processes get a separate stack for ring 0. // The ring 3 stack will be assigned by exec(). - m_tss.ss0 = 0x10; + m_tss.ss0 = GDT_SELECTOR_DATA0; m_tss.esp0 = m_kernel_stack_top; } @@ -155,9 +127,6 @@ Thread::~Thread() thread_table().remove(this); } - if (selector()) - gdt_free_entry(selector()); - ASSERT(m_process.m_thread_count); m_process.m_thread_count--; } @@ -219,9 +188,7 @@ void Thread::die_if_needed() InterruptDisabler disabler; set_state(Thread::State::Dying); - - if (!Scheduler::is_active()) - Scheduler::pick_next_and_switch_now(); + Scheduler::yield(); } void Thread::yield_without_holding_big_lock() @@ -613,12 +580,11 @@ ShouldUnblockThread Thread::dispatch_signal(u8 signal) u32* stack = &m_tss.esp; setup_stack(m_tss, stack); - Scheduler::prepare_to_modify_tss(*this); - m_tss.cs = 0x1b; - m_tss.ds = 0x23; - m_tss.es = 0x23; - m_tss.fs = 0x23; - m_tss.gs = thread_specific_selector() | 3; + m_tss.cs = GDT_SELECTOR_CODE3 | 3; + m_tss.ds = GDT_SELECTOR_DATA3 | 3; + m_tss.es = GDT_SELECTOR_DATA3 | 3; + m_tss.fs = GDT_SELECTOR_DATA3 | 3; + m_tss.gs = GDT_SELECTOR_TLS | 3; m_tss.eip = g_return_to_ring3_from_signal_trampoline.get(); // FIXME: This state is such a hack. It avoids trouble if 'current' is the process receiving a signal. 
@@ -155,9 +127,6 @@ Thread::~Thread()
thread_table().remove(this);
}
- if (selector())
- gdt_free_entry(selector());
-
ASSERT(m_process.m_thread_count);
m_process.m_thread_count--;
}
@@ -219,9 +188,7 @@ void Thread::die_if_needed()
InterruptDisabler disabler;
set_state(Thread::State::Dying);
-
- if (!Scheduler::is_active())
- Scheduler::pick_next_and_switch_now();
+ Scheduler::yield();
}
void Thread::yield_without_holding_big_lock()
@@ -613,12 +580,11 @@ ShouldUnblockThread Thread::dispatch_signal(u8 signal)
u32* stack = &m_tss.esp;
setup_stack(m_tss, stack);
- Scheduler::prepare_to_modify_tss(*this);
- m_tss.cs = 0x1b;
- m_tss.ds = 0x23;
- m_tss.es = 0x23;
- m_tss.fs = 0x23;
- m_tss.gs = thread_specific_selector() | 3;
+ m_tss.cs = GDT_SELECTOR_CODE3 | 3;
+ m_tss.ds = GDT_SELECTOR_DATA3 | 3;
+ m_tss.es = GDT_SELECTOR_DATA3 | 3;
+ m_tss.fs = GDT_SELECTOR_DATA3 | 3;
+ m_tss.gs = GDT_SELECTOR_TLS | 3;
m_tss.eip = g_return_to_ring3_from_signal_trampoline.get();
// FIXME: This state is such a hack. It avoids trouble if 'current' is the process receiving a signal.
set_state(Skip1SchedulerPass);
@@ -713,17 +679,10 @@ Thread* Thread::clone(Process& process)
clone->m_signal_mask = m_signal_mask;
memcpy(clone->m_fpu_state, m_fpu_state, sizeof(FPUState));
clone->m_thread_specific_data = m_thread_specific_data;
+ clone->m_thread_specific_region_size = m_thread_specific_region_size;
return clone;
}
-void Thread::initialize()
-{
- Scheduler::initialize();
- asm volatile("fninit");
- asm volatile("fxsave %0"
- : "=m"(s_clean_fpu_state));
-}
-
Vector<Thread*> Thread::all_threads()
{
Vector<Thread*> threads;
@@ -760,10 +719,14 @@ void Thread::set_state(State new_state)
Scheduler::update_state_for_thread(*this);
}
- if (new_state == Dying) {
- g_finalizer_has_work = true;
- g_finalizer_wait_queue->wake_all();
- }
+ if (new_state == Dying)
+ notify_finalizer();
+}
+
+void Thread::notify_finalizer()
+{
+ g_finalizer_has_work.store(true, AK::MemoryOrder::memory_order_release);
+ g_finalizer_wait_queue->wake_all();
}
String Thread::backtrace(ProcessInspectionHandle&) const
@@ -786,7 +749,7 @@ static bool symbolicate(const RecognizedSymbol& symbol, const Process& process,
if (!is_user_address(VirtualAddress(symbol.address))) {
builder.append("0xdeadc0de\n");
} else {
- if (!Scheduler::is_active() && elf_bundle && elf_bundle->elf_loader->has_symbols())
+ if (elf_bundle && elf_bundle->elf_loader->has_symbols())
builder.appendf("%p %s\n", symbol.address, elf_bundle->elf_loader->symbolicate(symbol.address).characters());
else
builder.appendf("%p\n", symbol.address);
@@ -863,8 +826,8 @@ Vector Thread::raw_backtrace(FlatPtr ebp, FlatPtr eip) const
void Thread::make_thread_specific_region(Badge<Process>)
{
size_t thread_specific_region_alignment = max(process().m_master_tls_alignment, alignof(ThreadSpecificData));
- size_t thread_specific_region_size = align_up_to(process().m_master_tls_size, thread_specific_region_alignment) + sizeof(ThreadSpecificData);
- auto* region = process().allocate_region({}, thread_specific_region_size, "Thread-specific", PROT_READ | PROT_WRITE, true);
+ m_thread_specific_region_size = align_up_to(process().m_master_tls_size, thread_specific_region_alignment) + sizeof(ThreadSpecificData);
+ auto* region = process().allocate_region({}, m_thread_specific_region_size, "Thread-specific", PROT_READ | PROT_WRITE, true);
SmapDisabler disabler;
auto* thread_specific_data = (ThreadSpecificData*)region->vaddr().offset(align_up_to(process().m_master_tls_size, thread_specific_region_alignment)).as_ptr();
auto* thread_local_storage = (u8*)((u8*)thread_specific_data) - align_up_to(process().m_master_tls_size, process().m_master_tls_alignment);
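
make_thread_specific_region() now keeps the computed size in m_thread_specific_region_size so clone() can copy it along with m_thread_specific_data. A small sketch of that size computation, with made-up numbers standing in for the process's master TLS parameters:

    #include <cstddef>
    #include <cstdio>

    constexpr size_t align_up_to(size_t value, size_t alignment)
    {
        return (value + alignment - 1) & ~(alignment - 1);  // alignment must be a power of two
    }

    int main()
    {
        size_t master_tls_size = 75;  // hypothetical process-wide TLS image size
        size_t alignment = 16;        // max(master TLS alignment, alignof(ThreadSpecificData)) stand-in
        size_t header_size = 8;       // stand-in for sizeof(ThreadSpecificData)
        size_t region_size = align_up_to(master_tls_size, alignment) + header_size;
        std::printf("%zu\n", region_size);  // 80 + 8 = 88 bytes reserved per thread
    }
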
@@ -881,25 +844,34 @@ const LogStream& operator<<(const LogStream& stream, const Thread& value)
Thread::BlockResult Thread::wait_on(WaitQueue& queue, timeval* timeout, Atomic<bool>* lock, Thread* beneficiary, const char* reason)
{
- cli();
- bool did_unlock = unlock_process_if_locked();
- if (lock)
- *lock = false;
- set_state(State::Queued);
- queue.enqueue(*current);
- TimerId timer_id {};
- if (timeout) {
- timer_id = TimerQueue::the().add_timer(*timeout, [&]() {
- wake_from_queue();
- });
+ bool did_unlock;
+
+ {
+ InterruptDisabler disable;
+ did_unlock = unlock_process_if_locked();
+ if (lock)
+ *lock = false;
+ set_state(State::Queued);
+ queue.enqueue(*current);
+
+
+ if (timeout) {
+ timer_id = TimerQueue::the().add_timer(*timeout, [&]() {
+ wake_from_queue();
+ });
+ }
+
+ // Yield and wait for the queue to wake us up again.
+ if (beneficiary)
+ Scheduler::donate_to(beneficiary, reason);
+ else
+ Scheduler::yield();
}
- // Yield and wait for the queue to wake us up again.
- if (beneficiary)
- Scheduler::donate_to(beneficiary, reason);
- else
- Scheduler::yield();
+ if (!are_interrupts_enabled())
+ sti();
+
// We've unblocked, relock the process if needed and carry on.
if (did_unlock)
relock_process();
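
wait_on() now does all of its bookkeeping and the yield itself inside one InterruptDisabler scope, and only re-enables interrupts explicitly once the thread has been woken. A rough stand-in for that control flow; plain C++ with hypothetical names, printing instead of making real scheduler calls:

    #include <cstdio>

    // Stand-in for Kernel::InterruptDisabler; here it only logs.
    struct InterruptDisablerSketch {
        InterruptDisablerSketch() { std::puts("cli()"); }
        ~InterruptDisablerSketch() { std::puts("restore previous interrupt flag"); }
    };

    void wait_on_sketch(bool have_beneficiary)
    {
        bool did_unlock;
        {
            InterruptDisablerSketch disable;  // everything below runs with interrupts off
            did_unlock = true;                // unlock_process_if_locked()
            std::puts("set_state(Queued)");
            std::puts("queue.enqueue(current)");
            std::puts("arm optional timeout timer");
            // Still inside the scope: give up the CPU until a wake or the timer fires.
            if (have_beneficiary)
                std::puts("Scheduler::donate_to(beneficiary)");
            else
                std::puts("Scheduler::yield()");
        }
        std::puts("sti() if interrupts are still disabled");  // explicit re-enable after waking
        if (did_unlock)
            std::puts("relock_process()");
    }

    int main()
    {
        wait_on_sketch(false);
    }
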
@@ -916,7 +888,10 @@ Thread::BlockResult Thread::wait_on(WaitQueue& queue, timeval* timeout, Atomic
all_threads();
diff --git a/Kernel/Thread.h b/Kernel/Thread.h
--- a/Kernel/Thread.h
+++ b/Kernel/Thread.h
@@ -287,6 +286,7 @@ public:
u32 ticks() const { return m_ticks; }
VirtualAddress thread_specific_data() const { return m_thread_specific_data; }
+ size_t thread_specific_region_size() const { return m_thread_specific_region_size; }
u64 sleep(u32 ticks);
u64 sleep_until(u64 wakeup_time);
@@ -354,6 +354,9 @@ public:
void set_selector(u16 s) { m_far_ptr.selector = s; }
void set_state(State);
+ bool is_initialized() const { return m_initialized; }
+ void set_initialized(bool initialized) { m_initialized = initialized; }
+
void send_urgent_signal_to_self(u8 signal);
void send_signal(u8 signal, Process* sender);
void consider_unblock(time_t now_sec, long now_usec);
@@ -472,6 +475,7 @@ private:
u32 m_kernel_stack_top { 0 };
OwnPtr<Region> m_kernel_stack_region;
VirtualAddress m_thread_specific_data;
+ size_t m_thread_specific_region_size { 0 };
SignalActionData m_signal_action_data[32];
Blocker* m_blocker { nullptr };
@@ -506,9 +510,11 @@ private:
bool m_dump_backtrace_on_finalization { false };
bool m_should_die { false };
+ bool m_initialized {false};
OwnPtr<ThreadTracer> m_tracer;
+ void notify_finalizer();
void yield_without_holding_big_lock();
};
@@ -595,7 +601,4 @@ inline IterationDecision Scheduler::for_each_nonrunnable(Callback callback)
return IterationDecision::Continue;
}
-u16 thread_specific_selector();
-Descriptor& thread_specific_descriptor();
-
}
diff --git a/Kernel/VM/MemoryManager.cpp b/Kernel/VM/MemoryManager.cpp
index ce59995c6eb..5934ce19857 100644
--- a/Kernel/VM/MemoryManager.cpp
+++ b/Kernel/VM/MemoryManager.cpp
@@ -51,6 +51,7 @@ extern FlatPtr end_of_kernel_bss;
namespace Kernel {
static MemoryManager* s_the;
+RecursiveSpinLock MemoryManager::s_lock;
MemoryManager& MM
{
@@ -164,6 +165,7 @@ void MemoryManager::parse_memory_map()
const PageTableEntry* MemoryManager::pte(const PageDirectory& page_directory, VirtualAddress vaddr)
{
ASSERT_INTERRUPTS_DISABLED();
+ ScopedSpinLock lock(s_lock);
u32 page_directory_table_index = (vaddr.get() >> 30) & 0x3;
u32 page_directory_index = (vaddr.get() >> 21) & 0x1ff;
u32 page_table_index = (vaddr.get() >> 12) & 0x1ff;
@@ -179,6 +181,7 @@ const PageTableEntry* MemoryManager::pte(const PageDirectory& page_directory, Vi
PageTableEntry& MemoryManager::ensure_pte(PageDirectory& page_directory, VirtualAddress vaddr)
{
ASSERT_INTERRUPTS_DISABLED();
+ ScopedSpinLock lock(s_lock);
u32 page_directory_table_index = (vaddr.get() >> 30) & 0x3;
u32 page_directory_index = (vaddr.get() >> 21) & 0x1ff;
u32 page_table_index = (vaddr.get() >> 12) & 0x1ff;
@@ -211,6 +214,7 @@ void MemoryManager::initialize()
Region* MemoryManager::kernel_region_from_vaddr(VirtualAddress vaddr)
{
+ ScopedSpinLock lock(s_lock);
for (auto& region : MM.m_kernel_regions) {
if (region.contains(vaddr))
return &region;
@@ -220,6 +224,7 @@ Region* MemoryManager::kernel_region_from_vaddr(VirtualAddress vaddr)
Region* MemoryManager::user_region_from_vaddr(Process& process, VirtualAddress vaddr)
{
+ ScopedSpinLock lock(s_lock);
// FIXME: Use a binary search tree (maybe red/black?) or some other more appropriate data structure!
for (auto& region : process.m_regions) {
if (region.contains(vaddr))
@@ -233,6 +238,7 @@ Region* MemoryManager::user_region_from_vaddr(Process& process, VirtualAddress v
Region* MemoryManager::region_from_vaddr(Process& process, VirtualAddress vaddr)
{
+ ScopedSpinLock lock(s_lock);
if (auto* region = user_region_from_vaddr(process, vaddr))
return region;
return kernel_region_from_vaddr(vaddr);
@@ -240,6 +246,7 @@ Region* MemoryManager::region_from_vaddr(Process& process, VirtualAddress vaddr)
const Region* MemoryManager::region_from_vaddr(const Process& process, VirtualAddress vaddr)
{
+ ScopedSpinLock lock(s_lock);
if (auto* region = user_region_from_vaddr(const_cast<Process&>(process), vaddr))
return region;
return kernel_region_from_vaddr(vaddr);
@@ -247,6 +254,7 @@ const Region* MemoryManager::region_from_vaddr(const Process& process, VirtualAd
Region* MemoryManager::region_from_vaddr(VirtualAddress vaddr)
{
+ ScopedSpinLock lock(s_lock);
if (auto* region = kernel_region_from_vaddr(vaddr))
return region;
auto page_directory = PageDirectory::find_by_cr3(read_cr3());
@@ -260,16 +268,18 @@ PageFaultResponse MemoryManager::handle_page_fault(const PageFault& fault)
{
ASSERT_INTERRUPTS_DISABLED();
ASSERT(Thread::current);
- if (g_in_irq) {
- dbg() << "BUG! Page fault while handling IRQ! code=" << fault.code() << ", vaddr=" << fault.vaddr();
+ ScopedSpinLock lock(s_lock);
+ if (Processor::current().in_irq()) {
+ dbg() << "CPU[" << Processor::id() << "] BUG! Page fault while handling IRQ! code=" << fault.code() << ", vaddr=" << fault.vaddr() << ", irq level: " << Processor::current().in_irq();
dump_kernel_regions();
+ return PageFaultResponse::ShouldCrash;
}
#ifdef PAGE_FAULT_DEBUG
- dbg() << "MM: handle_page_fault(" << String::format("%w", fault.code()) << ") at " << fault.vaddr();
+ dbg() << "MM: CPU[" << Processor::id() << "] handle_page_fault(" << String::format("%w", fault.code()) << ") at " << fault.vaddr();
#endif
auto* region = region_from_vaddr(fault.vaddr());
if (!region) {
- klog() << "NP(error) fault at invalid address " << fault.vaddr();
+ klog() << "CPU[" << Processor::id() << "] NP(error) fault at invalid address " << fault.vaddr();
return PageFaultResponse::ShouldCrash;
}
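
Every MemoryManager entry point above now takes the new static s_lock through a ScopedSpinLock guard instead of relying on InterruptDisabler, and the lock is recursive so one locked function can safely call another (region_from_vaddr() calling user_region_from_vaddr(), for example). An illustrative user-space sketch of that shape; this is not Kernel/SpinLock.h, std::thread ids stand in for CPU identities, and interrupts are ignored:

    #include <atomic>
    #include <thread>

    class RecursiveSpinLockSketch {
    public:
        void lock()
        {
            auto me = std::this_thread::get_id();
            if (m_owner.load(std::memory_order_acquire) == me) {
                ++m_depth;  // already ours, just nest
                return;
            }
            std::thread::id unowned {};
            while (!m_owner.compare_exchange_weak(unowned, me, std::memory_order_acquire))
                unowned = {};  // reset expected value and spin
            m_depth = 1;
        }
        void unlock()
        {
            if (--m_depth == 0)
                m_owner.store(std::thread::id {}, std::memory_order_release);
        }

    private:
        std::atomic<std::thread::id> m_owner {};
        unsigned m_depth { 0 };  // only touched by the owning thread
    };

    struct ScopedLockSketch {
        explicit ScopedLockSketch(RecursiveSpinLockSketch& l) : m_lock(l) { m_lock.lock(); }
        ~ScopedLockSketch() { m_lock.unlock(); }
        RecursiveSpinLockSketch& m_lock;
    };

    int main()
    {
        RecursiveSpinLockSketch lock;
        ScopedLockSketch outer(lock);
        ScopedLockSketch inner(lock);  // recursion is fine; released in reverse order
    }
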
@@ -279,6 +289,7 @@ PageFaultResponse MemoryManager::handle_page_fault(const PageFault& fault)
OwnPtr<Region> MemoryManager::allocate_contiguous_kernel_region(size_t size, const StringView& name, u8 access, bool user_accessible, bool cacheable)
{
ASSERT(!(size % PAGE_SIZE));
+ ScopedSpinLock lock(s_lock);
auto range = kernel_page_directory().range_allocator().allocate_anywhere(size);
if (!range.is_valid())
return nullptr;
@@ -292,6 +303,7 @@ OwnPtr MemoryManager::allocate_contiguous_kernel_region(size_t size, con
OwnPtr<Region> MemoryManager::allocate_kernel_region(size_t size, const StringView& name, u8 access, bool user_accessible, bool should_commit, bool cacheable)
{
ASSERT(!(size % PAGE_SIZE));
+ ScopedSpinLock lock(s_lock);
auto range = kernel_page_directory().range_allocator().allocate_anywhere(size);
if (!range.is_valid())
return nullptr;
@@ -307,6 +319,7 @@ OwnPtr MemoryManager::allocate_kernel_region(size_t size, const StringVi
OwnPtr<Region> MemoryManager::allocate_kernel_region(PhysicalAddress paddr, size_t size, const StringView& name, u8 access, bool user_accessible, bool cacheable)
{
ASSERT(!(size % PAGE_SIZE));
+ ScopedSpinLock lock(s_lock);
auto range = kernel_page_directory().range_allocator().allocate_anywhere(size);
if (!range.is_valid())
return nullptr;
@@ -319,6 +332,7 @@ OwnPtr MemoryManager::allocate_kernel_region(PhysicalAddress paddr, size
OwnPtr<Region> MemoryManager::allocate_kernel_region_identity(PhysicalAddress paddr, size_t size, const StringView& name, u8 access, bool user_accessible, bool cacheable)
{
ASSERT(!(size % PAGE_SIZE));
+ ScopedSpinLock lock(s_lock);
auto range = kernel_page_directory().identity_range_allocator().allocate_specific(VirtualAddress(paddr.get()), size);
if (!range.is_valid())
return nullptr;
@@ -335,7 +349,7 @@ OwnPtr MemoryManager::allocate_user_accessible_kernel_region(size_t size
OwnPtr<Region> MemoryManager::allocate_kernel_region_with_vmobject(const Range& range, VMObject& vmobject, const StringView& name, u8 access, bool user_accessible, bool cacheable)
{
- InterruptDisabler disabler;
+ ScopedSpinLock lock(s_lock);
OwnPtr<Region> region;
if (user_accessible)
region = Region::create_user_accessible(range, vmobject, 0, name, access, cacheable);
@@ -349,6 +363,7 @@ OwnPtr MemoryManager::allocate_kernel_region_with_vmobject(const Range&
OwnPtr<Region> MemoryManager::allocate_kernel_region_with_vmobject(VMObject& vmobject, size_t size, const StringView& name, u8 access, bool user_accessible, bool cacheable)
{
ASSERT(!(size % PAGE_SIZE));
+ ScopedSpinLock lock(s_lock);
auto range = kernel_page_directory().range_allocator().allocate_anywhere(size);
if (!range.is_valid())
return nullptr;
@@ -357,6 +372,7 @@ OwnPtr MemoryManager::allocate_kernel_region_with_vmobject(VMObject& vmo
void MemoryManager::deallocate_user_physical_page(PhysicalPage&& page)
{
+ ScopedSpinLock lock(s_lock);
for (auto& region : m_user_physical_regions) {
if (!region.contains(page)) {
klog() << "MM: deallocate_user_physical_page: " << page.paddr() << " not in " << region.lower() << " -> " << region.upper();
@@ -375,6 +391,7 @@ void MemoryManager::deallocate_user_physical_page(PhysicalPage&& page)
RefPtr<PhysicalPage> MemoryManager::find_free_user_physical_page()
{
+ ASSERT(s_lock.is_locked());
RefPtr<PhysicalPage> page;
for (auto& region : m_user_physical_regions) {
page = region.take_free_page(false);
@@ -386,7 +403,7 @@ RefPtr MemoryManager::find_free_user_physical_page()
RefPtr<PhysicalPage> MemoryManager::allocate_user_physical_page(ShouldZeroFill should_zero_fill)
{
- InterruptDisabler disabler;
+ ScopedSpinLock lock(s_lock);
auto page = find_free_user_physical_page();
if (!page) {
@@ -425,6 +442,7 @@ RefPtr MemoryManager::allocate_user_physical_page(ShouldZeroFill s
void MemoryManager::deallocate_supervisor_physical_page(PhysicalPage&& page)
{
+ ASSERT(s_lock.is_locked());
for (auto& region : m_super_physical_regions) {
if (!region.contains(page)) {
klog() << "MM: deallocate_supervisor_physical_page: " << page.paddr() << " not in " << region.lower() << " -> " << region.upper();
@@ -443,7 +461,7 @@ void MemoryManager::deallocate_supervisor_physical_page(PhysicalPage&& page)
NonnullRefPtrVector<PhysicalPage> MemoryManager::allocate_contiguous_supervisor_physical_pages(size_t size)
{
ASSERT(!(size % PAGE_SIZE));
- InterruptDisabler disabler;
+ ScopedSpinLock lock(s_lock);
size_t count = ceil_div(size, PAGE_SIZE);
NonnullRefPtrVector<PhysicalPage> physical_pages;
@@ -471,7 +489,7 @@ NonnullRefPtrVector MemoryManager::allocate_contiguous_supervisor_
RefPtr<PhysicalPage> MemoryManager::allocate_supervisor_physical_page()
{
- InterruptDisabler disabler;
+ ScopedSpinLock lock(s_lock);
RefPtr<PhysicalPage> page;
for (auto& region : m_super_physical_regions) {
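
These hunks establish a locking convention for the physical-page paths: outer entry points such as allocate_user_physical_page() acquire s_lock themselves, while internal helpers such as find_free_user_physical_page() and deallocate_supervisor_physical_page() only assert that their caller already holds it. A toy single-threaded sketch of that convention, with hypothetical names:

    #include <cassert>

    struct LockSketch {
        int depth = 0;
        void lock() { ++depth; }
        void unlock() { --depth; }
        bool is_locked() const { return depth > 0; }
    };

    static LockSketch s_lock;

    int find_free_page_helper()
    {
        assert(s_lock.is_locked());  // caller's responsibility, like ASSERT(s_lock.is_locked())
        return 42;                   // pretend physical page index
    }

    int allocate_page_entry_point()
    {
        s_lock.lock();               // stands in for ScopedSpinLock lock(s_lock);
        int page = find_free_page_helper();
        s_lock.unlock();
        return page;
    }

    int main()
    {
        return allocate_page_entry_point() == 42 ? 0 : 1;
    }
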
@@ -502,7 +520,7 @@ RefPtr MemoryManager::allocate_supervisor_physical_page()
void MemoryManager::enter_process_paging_scope(Process& process)
{
ASSERT(Thread::current);
- InterruptDisabler disabler;
+ ScopedSpinLock lock(s_lock);
Thread::current->tss().cr3 = process.page_directory().cr3();
write_cr3(process.page_directory().cr3());
@@ -528,6 +546,7 @@ extern "C" PageTableEntry boot_pd3_pt1023[1024];
PageDirectoryEntry* MemoryManager::quickmap_pd(PageDirectory& directory, size_t pdpt_index)
{
+ ScopedSpinLock lock(s_lock);
auto& pte = boot_pd3_pt1023[4];
auto pd_paddr = directory.m_directory_pages[pdpt_index]->paddr();
if (pte.physical_page_base() != pd_paddr.as_ptr()) {
@@ -545,6 +564,7 @@ PageDirectoryEntry* MemoryManager::quickmap_pd(PageDirectory& directory, size_t
PageTableEntry* MemoryManager::quickmap_pt(PhysicalAddress pt_paddr)
{
+ ScopedSpinLock lock(s_lock);
auto& pte = boot_pd3_pt1023[8];
if (pte.physical_page_base() != pt_paddr.as_ptr()) {
#ifdef MM_DEBUG
@@ -562,6 +582,7 @@ PageTableEntry* MemoryManager::quickmap_pt(PhysicalAddress pt_paddr)
u8* MemoryManager::quickmap_page(PhysicalPage& physical_page)
{
ASSERT_INTERRUPTS_DISABLED();
+ ScopedSpinLock lock(s_lock);
ASSERT(!m_quickmap_in_use);
m_quickmap_in_use = true;
@@ -582,6 +603,7 @@ u8* MemoryManager::quickmap_page(PhysicalPage& physical_page)
void MemoryManager::unquickmap_page()
{
ASSERT_INTERRUPTS_DISABLED();
+ ScopedSpinLock lock(s_lock);
ASSERT(m_quickmap_in_use);
auto& pte = boot_pd3_pt1023[0];
pte.clear();
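
quickmap_pd(), quickmap_pt() and quickmap_page() each reuse a fixed PTE slot (boot_pd3_pt1023[...]) to map an arbitrary physical page for a moment, which is why they now have to be serialized by s_lock as well; m_quickmap_in_use still guards against nested use. A toy model of the bookkeeping only (no real paging, names are made up):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    struct QuickmapSketch {
        uint32_t slot_physical_base = 0;  // stand-in for the PTE in the reserved slot
        bool in_use = false;

        void map(uint32_t physical_page_base)
        {
            assert(!in_use);  // mirrors ASSERT(!m_quickmap_in_use)
            in_use = true;
            if (slot_physical_base != physical_page_base) {
                slot_physical_base = physical_page_base;
                std::puts("rewrite PTE and flush TLB for the fixed vaddr");  // what the real code does here
            }
        }

        void unmap()
        {
            assert(in_use);          // mirrors ASSERT(m_quickmap_in_use)
            slot_physical_base = 0;  // analogue of pte.clear()
            in_use = false;
        }
    };

    int main()
    {
        QuickmapSketch q;
        q.map(0x5000);
        q.unmap();
        q.map(0x6000);  // the single slot can be reused once unmapped
        q.unmap();
    }
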
@@ -592,6 +614,7 @@ void MemoryManager::unquickmap_page()
template
bool MemoryManager::validate_range(const Process& process, VirtualAddress base_vaddr, size_t size) const
{
+ ASSERT(s_lock.is_locked());
ASSERT(size);
if (base_vaddr > base_vaddr.offset(size)) {
dbg() << "Shenanigans! Asked to validate wrappy " << base_vaddr << " size=" << size;
@@ -627,12 +650,14 @@ bool MemoryManager::validate_user_stack(const Process& process, VirtualAddress v
{
if (!is_user_address(vaddr))
return false;
+ ScopedSpinLock lock(s_lock);
auto* region = user_region_from_vaddr(const_cast<Process&>(process), vaddr);
return region && region->is_user_accessible() && region->is_stack();
}
bool MemoryManager::validate_kernel_read(const Process& process, VirtualAddress vaddr, size_t size) const
{
+ ScopedSpinLock lock(s_lock);
return validate_range(process, vaddr, size);
}
@@ -640,6 +665,7 @@ bool MemoryManager::can_read_without_faulting(const Process& process, VirtualAdd
{
// FIXME: Use the size argument!
UNUSED_PARAM(size);
+ ScopedSpinLock lock(s_lock);
auto* pte = const_cast<MemoryManager*>(this)->pte(process.page_directory(), vaddr);
if (!pte)
return false;
@@ -650,6 +676,7 @@ bool MemoryManager::validate_user_read(const Process& process, VirtualAddress va
{
if (!is_user_address(vaddr))
return false;
+ ScopedSpinLock lock(s_lock);
return validate_range(process, vaddr, size);
}
@@ -657,24 +684,25 @@ bool MemoryManager::validate_user_write(const Process& process, VirtualAddress v
{
if (!is_user_address(vaddr))
return false;
+ ScopedSpinLock lock(s_lock);
return validate_range(process, vaddr, size);
}
void MemoryManager::register_vmobject(VMObject& vmobject)
{
- InterruptDisabler disabler;
+ ScopedSpinLock lock(s_lock);
m_vmobjects.append(&vmobject);
}
void MemoryManager::unregister_vmobject(VMObject& vmobject)
{
- InterruptDisabler disabler;
+ ScopedSpinLock lock(s_lock);
m_vmobjects.remove(&vmobject);
}
void MemoryManager::register_region(Region& region)
{
- InterruptDisabler disabler;
+ ScopedSpinLock lock(s_lock);
if (region.is_kernel())
m_kernel_regions.append(&region);
else
@@ -683,7 +711,7 @@ void MemoryManager::register_region(Region& region)
void MemoryManager::unregister_region(Region& region)
{
- InterruptDisabler disabler;
+ ScopedSpinLock lock(s_lock);
if (region.is_kernel())
m_kernel_regions.remove(&region);
else
diff --git a/Kernel/VM/MemoryManager.h b/Kernel/VM/MemoryManager.h
index d58602fa115..6c27ca766e7 100644
--- a/Kernel/VM/MemoryManager.h
+++ b/Kernel/VM/MemoryManager.h
@@ -31,6 +31,7 @@
#include
#include
#include
+#include <Kernel/SpinLock.h>
#include
#include
#include
@@ -201,6 +202,8 @@ private:
InlineLinkedList<VMObject> m_vmobjects;
+ static RecursiveSpinLock s_lock;
+
bool m_quickmap_in_use { false };
RefPtr<PhysicalPage> m_low_pseudo_identity_mapping_pages[4];
diff --git a/Kernel/WaitQueue.cpp b/Kernel/WaitQueue.cpp
index cc19acb117e..b5d5d759dd3 100644
--- a/Kernel/WaitQueue.cpp
+++ b/Kernel/WaitQueue.cpp
@@ -52,7 +52,7 @@ void WaitQueue::wake_one(Atomic* lock)
return;
if (auto* thread = m_threads.take_first())
thread->wake_from_queue();
- Scheduler::stop_idling();
+ Scheduler::yield();
}
void WaitQueue::wake_n(i32 wake_count)
@@ -67,7 +67,7 @@ void WaitQueue::wake_n(i32 wake_count)
break;
thread->wake_from_queue();
}
- Scheduler::stop_idling();
+ Scheduler::yield();
}
void WaitQueue::wake_all()
@@ -77,7 +77,7 @@ void WaitQueue::wake_all()
return;
while (!m_threads.is_empty())
m_threads.take_first()->wake_from_queue();
- Scheduler::stop_idling();
+ Scheduler::yield();
}
void WaitQueue::clear()
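
The WaitQueue wake_one()/wake_n()/wake_all() functions now end in Scheduler::yield() instead of Scheduler::stop_idling(), presumably so a freshly-woken thread can get onto a CPU right away rather than waiting for the next tick. A hedged sketch of the wake_n() shape, using standard containers instead of the kernel's intrusive thread list:

    #include <cstdio>
    #include <deque>

    struct WaiterSketch {
        const char* name;
        void wake_from_queue() const { std::printf("waking %s\n", name); }
    };

    static std::deque<WaiterSketch> s_threads;

    static void yield_sketch() { std::puts("Scheduler::yield()"); }

    void wake_n_sketch(int wake_count)
    {
        if (s_threads.empty())
            return;
        for (int i = 0; i < wake_count && !s_threads.empty(); ++i) {
            WaiterSketch thread = s_threads.front();
            s_threads.pop_front();
            thread.wake_from_queue();
        }
        yield_sketch();  // the line the diff changes from stop_idling() to yield()
    }

    int main()
    {
        s_threads = { { "a" }, { "b" }, { "c" } };
        wake_n_sketch(2);  // wakes a and b, then yields so they can run promptly
    }
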
diff --git a/Kernel/init.cpp b/Kernel/init.cpp
index 8e041bc08a7..41dfa5a5498 100644
--- a/Kernel/init.cpp
+++ b/Kernel/init.cpp
@@ -103,18 +103,19 @@ extern "C" [[noreturn]] void init()
{
setup_serial_debug();
- cpu_setup();
+ cpu_setup(0);
kmalloc_init();
slab_alloc_init();
+ {
+ static Processor s_bsp_processor_info; // global but let's keep it "private"
+ s_bsp_processor_info.initialize(0);
+ }
+
CommandLine::initialize(reinterpret_cast<const char*>(low_physical_to_virtual(multiboot_info_ptr->cmdline)));
- MemoryManager::initialize();
- gdt_init();
- idt_init();
-
// Invoke all static global constructors in the kernel.
// Note that we want to do this as early as possible.
for (ctor_func_t* ctor = &start_ctors; ctor < &end_ctors; ctor++)
(*ctor)();
@@ -148,16 +149,12 @@ extern "C" [[noreturn]] void init()
VirtualConsole::switch_to(0);
Process::initialize();
- Thread::initialize();
+ Scheduler::initialize(0);
Thread* init_stage2_thread = nullptr;
Process::create_kernel_process(init_stage2_thread, "init_stage2", init_stage2);
- Scheduler::pick_next();
-
- sti();
-
- Scheduler::idle_loop();
+ Scheduler::start();
ASSERT_NOT_REACHED();
}
@@ -166,8 +163,12 @@ extern "C" [[noreturn]] void init()
//
// The purpose of init_ap() is to initialize APs for multi-tasking.
//
-extern "C" [[noreturn]] void init_ap(u32 cpu)
+extern "C" [[noreturn]] void init_ap(u32 cpu, Processor* processor_info)
{
+ klog() << "CPU #" << cpu << " processor_info at " << VirtualAddress(FlatPtr(processor_info));
+ cpu_setup(cpu);
+ processor_info->initialize(cpu);
+
APIC::the().enable(cpu);
#if 0