/* cpu.c COPYRIGHT FUJITSU LIMITED 2018-2019 */ /** * \file cpu.c * License details are found in the file LICENSE. * \brief * Control CPU. * \author Taku Shimosawa \par * Copyright (C) 2011 - 2012 Taku Shimosawa * \author Gou Nakamura \par * Copyright (C) 2015 RIKEN AICS */ /* * HISTORY * 2015/02/26: bgerofi - set pstate, turbo mode and power/perf bias MSRs * 2015/02/12: Dave - enable AVX if supported */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define LAPIC_ID 0x020 #define LAPIC_TIMER 0x320 #define LAPIC_LVTPC 0x340 #define LAPIC_TIMER_INITIAL 0x380 #define LAPIC_TIMER_CURRENT 0x390 #define LAPIC_TIMER_DIVIDE 0x3e0 #define LAPIC_SPURIOUS 0x0f0 #define LAPIC_EOI 0x0b0 #define LAPIC_ICR0 0x300 #define LAPIC_ICR2 0x310 #define LAPIC_ESR 0x280 #define LOCAL_TIMER_VECTOR 0xef #define LOCAL_PERF_VECTOR 0xf0 #define LOCAL_SMP_FUNC_CALL_VECTOR 0xf1 #define APIC_INT_LEVELTRIG 0x08000 #define APIC_INT_ASSERT 0x04000 #define APIC_ICR_BUSY 0x01000 #define APIC_DEST_PHYSICAL 0x00000 #define APIC_DM_FIXED 0x00000 #define APIC_DM_NMI 0x00400 #define APIC_DM_INIT 0x00500 #define APIC_DM_STARTUP 0x00600 #define APIC_DIVISOR 16 #define APIC_LVT_TIMER_PERIODIC (1 << 17) #define APIC_BASE_MSR 0x800 #define IA32_X2APIC_APICID 0x802 #define IA32_X2APIC_ICR 0x830 #define X2APIC_ENABLE (1UL << 10) #define NMI_VECTOR 0x02 //#define DEBUG_PRINT_CPU #ifdef DEBUG_PRINT_CPU #undef DDEBUG_DEFAULT #define DDEBUG_DEFAULT DDEBUG_PRINT #endif static void *lapic_vp; static int x2apic; static void (*lapic_write)(int reg, unsigned int value); static unsigned int (*lapic_read)(int reg); static void (*lapic_icr_write)(unsigned int h, unsigned int l); static void (*lapic_wait_icr_idle)(void); void (*x86_issue_ipi)(unsigned int apicid, unsigned int low); int running_on_kvm(void); static void smp_func_call_handler(void); void init_processors_local(int max_id); void assign_processor_id(void); void arch_delay(int); void x86_set_warm_reset(unsigned long ip, char *first_page_va); void x86_init_perfctr(void); int gettime_local_support = 0; extern int kprintf(const char *format, ...); extern int interrupt_from_user(void *); extern void perf_start(struct mc_perf_event *event); extern void perf_reset(struct mc_perf_event *event); static struct idt_entry{ uint32_t desc[4]; } idt[256] __attribute__((aligned(16))); static struct x86_desc_ptr idt_desc, gdt_desc; static uint64_t gdt[] __attribute__((aligned(16))) = { 0, /* 0 */ 0, /* 8 */ 0, /* 16 */ 0, /* 24 */ 0x00af9b000000ffff, /* 32 : KERNEL_CS */ 0x00cf93000000ffff, /* 40 : KERNEL_DS */ 0x00affb000000ffff, /* 48 : USER_CS */ 0x00aff3000000ffff, /* 56 : USER_DS */ 0x0000890000000067, /* 64 : TSS */ 0, /* (72: TSS) */ 0, /* 80 */ 0, /* 88 */ 0, /* 96 */ 0, /* 104 */ 0, /* 112 */ 0x0000f10000000000, /* 120 : GETCPU */ }; struct tss64 tss __attribute__((aligned(16))); static void set_idt_entry(int idx, unsigned long addr) { idt[idx].desc[0] = (addr & 0xffff) | (KERNEL_CS << 16); idt[idx].desc[1] = (addr & 0xffff0000) | 0x8e00; idt[idx].desc[2] = (addr >> 32); idt[idx].desc[3] = 0; } static void set_idt_entry_trap_gate(int idx, unsigned long addr) { idt[idx].desc[0] = (addr & 0xffff) | (KERNEL_CS << 16); idt[idx].desc[1] = (addr & 0xffff0000) | 0xef00; idt[idx].desc[2] = (addr >> 32); idt[idx].desc[3] = 0; } extern uint64_t generic_common_handlers[]; void reload_idt(void) { asm volatile("lidt %0" : : "m"(idt_desc) : "memory"); } static struct list_head handlers[256 - 32]; extern 
char nmi_handler[]; extern char page_fault[], general_protection_exception[]; extern char debug_exception[], int3_exception[]; uint64_t boot_pat_state = 0; int no_turbo = 1; /* May be updated by early parsing of kargs */ extern int num_processors; /* kernel/ap.c */ struct pvclock_vsyscall_time_info *pvti = NULL; int pvti_npages; static long pvti_msr = -1; static void init_idt(void) { int i; idt_desc.size = sizeof(idt) - 1; idt_desc.address = (unsigned long)idt; for (i = 0; i < 256; i++) { if (i >= 32) { INIT_LIST_HEAD(&handlers[i - 32]); } set_idt_entry(i, generic_common_handlers[i]); } set_idt_entry(2, (uintptr_t)nmi_handler); set_idt_entry(13, (unsigned long)general_protection_exception); set_idt_entry(14, (unsigned long)page_fault); set_idt_entry_trap_gate(1, (unsigned long)debug_exception); set_idt_entry_trap_gate(3, (unsigned long)int3_exception); reload_idt(); } static int xsave_available = 0; static int xsave_size = 0; static uint64_t xsave_mask = 0x0; void init_fpu(void) { unsigned long reg; unsigned long cpuid01_ecx; asm volatile("movq %%cr0, %0" : "=r"(reg)); /* Unset EM and TS flag. */ reg &= ~((1 << 2) | (1 << 3)); /* Set MP flag */ reg |= 1 << 1; asm volatile("movq %0, %%cr0" : : "r"(reg)); #ifdef ENABLE_SSE asm volatile("cpuid" : "=c" (cpuid01_ecx) : "a" (0x1) : "%rbx", "%rdx"); asm volatile("movq %%cr4, %0" : "=r"(reg)); /* Cr4 flags: OSFXSR[b9] - enables SSE instructions OSXMMEXCPT[b10] - generate SIMD FP exception instead of invalid op OSXSAVE[b18] - enables access to xcr0 CPUID.01H:ECX flags: XSAVE[b26] - verify existence of extended crs/XSAVE AVX[b28] - verify existence of AVX instructions */ reg |= ((1 << 9) | (1 << 10)); if(cpuid01_ecx & (1 << 26)) { /* XSAVE set, enable access to xcr0 */ dkprintf("init_fpu(): XSAVE available\n"); xsave_available = 1; reg |= (1 << 18); } asm volatile("movq %0, %%cr4" : : "r"(reg)); dkprintf("init_fpu(): SSE init: CR4 = 0x%016lX\n", reg); /* Set xcr0[2:1] to enable avx ops */ if(xsave_available){ unsigned long eax; unsigned long ebx; unsigned long ecx; unsigned long edx; asm volatile("cpuid" : "=a"(eax),"=b"(ebx),"=c"(ecx),"=d"(edx) : "a" (0x0d), "c" (0x00)); xsave_size = ecx; dkprintf("init_fpu(): xsave_size = %d\n", xsave_size); if ((eax & (1 << 5)) && (eax & (1 << 6)) && (eax & (1 << 7))) { /* Set xcr0[7:5] to enable avx-512 ops */ reg = xgetbv(0); reg |= 0xe6; xsetbv(0, reg); dkprintf("init_fpu(): AVX-512 init: XCR0 = 0x%016lX\n", reg); } else { reg = xgetbv(0); reg |= 0x6; xsetbv(0, reg); dkprintf("init_fpu(): AVX init: XCR0 = 0x%016lX\n", reg); } xsave_mask = xgetbv(0); dkprintf("init_fpu(): xsave_mask = 0x%016lX\n", xsave_mask); } /* TODO: set MSR_IA32_XSS to enable xsaves/xrstors */ #else kprintf("init_fpu(): SSE not enabled\n"); #endif asm volatile("finit"); } int get_xsave_size() { return xsave_size; } uint64_t get_xsave_mask() { return xsave_mask; } void reload_gdt(struct x86_desc_ptr *gdt_ptr) { asm volatile("pushq %1\n" "leaq 1f(%%rip), %%rbx\n" "pushq %%rbx\n" "lgdt %0\n" "lretq\n" "1:\n" : : "m" (*gdt_ptr), "i" (KERNEL_CS) : "rbx"); asm volatile("movl %0, %%ds" : : "r"(KERNEL_DS)); asm volatile("movl %0, %%ss" : : "r"(KERNEL_DS)); /* And, set TSS */ asm volatile("ltr %0" : : "r"((short)GLOBAL_TSS) : "memory"); } void init_gdt(void) { register unsigned long stack_pointer asm("rsp"); unsigned long tss_addr = (unsigned long)&tss; memset(&tss, 0, sizeof(tss)); tss.rsp0 = stack_pointer; /* 0x89 = Present (8) | Type = 9 (TSS) */ gdt[GLOBAL_TSS_ENTRY] = (sizeof(tss) - 1) | ((tss_addr & 0xffffff) << 16) | (0x89UL << 40) | 
((tss_addr & 0xff000000) << 32); gdt[GLOBAL_TSS_ENTRY + 1] = (tss_addr >> 32); gdt_desc.size = sizeof(gdt) - 1; gdt_desc.address = (unsigned long)gdt; /* Load the new GDT, and set up CS, DS and SS. */ reload_gdt(&gdt_desc); } static void apic_write(int reg, unsigned int value) { *(volatile unsigned int *)((char *)lapic_vp + reg) = value; } static void x2apic_write(int reg, unsigned int value) { reg >>= 4; reg |= APIC_BASE_MSR; wrmsr(reg, value); } static unsigned int apic_read(int reg) { return *(volatile unsigned int *)((char *)lapic_vp + reg); } static unsigned int x2apic_read(int reg) { unsigned long value; reg >>= 4; reg |= APIC_BASE_MSR; value = rdmsr(reg); return (int)value; } void lapic_timer_enable(unsigned int clocks) { unsigned int lvtt_value; lapic_write(LAPIC_TIMER_INITIAL, clocks / APIC_DIVISOR); lapic_write(LAPIC_TIMER_DIVIDE, 3); /* initialize periodic timer */ lvtt_value = LOCAL_TIMER_VECTOR | APIC_LVT_TIMER_PERIODIC; lapic_write(LAPIC_TIMER, lvtt_value); } void lapic_timer_disable() { lapic_write(LAPIC_TIMER_INITIAL, 0); } void lapic_ack(void) { lapic_write(LAPIC_EOI, 0); } static void x2apic_wait_icr_idle(void) { } static void apic_wait_icr_idle(void) { while (lapic_read(LAPIC_ICR0) & APIC_ICR_BUSY) { cpu_pause(); } } static void x2apic_icr_write(unsigned int low, unsigned int apicid) { wrmsr(IA32_X2APIC_ICR, (((unsigned long)apicid) << 32) | low); } static void apic_icr_write(unsigned int h, unsigned int l) { lapic_write(LAPIC_ICR2, (unsigned int)h); lapic_write(LAPIC_ICR0, l); } static void x2apic_x86_issue_ipi(unsigned int apicid, unsigned int low) { unsigned long icr = low; unsigned long flags; ihk_mc_mb(); flags = cpu_disable_interrupt_save(); x2apic_icr_write(icr, apicid); cpu_restore_interrupt(flags); } static void apic_x86_issue_ipi(unsigned int apicid, unsigned int low) { unsigned long flags; flags = cpu_disable_interrupt_save(); apic_wait_icr_idle(); apic_icr_write(apicid << LAPIC_ICR_ID_SHIFT, low); cpu_restore_interrupt(flags); } unsigned long x2apic_is_enabled() { unsigned long msr; msr = rdmsr(MSR_IA32_APIC_BASE); return (msr & X2APIC_ENABLE); } void init_lapic_bsp(void) { if(x2apic_is_enabled()){ x2apic = 1; lapic_write = x2apic_write; lapic_read = x2apic_read; lapic_icr_write = x2apic_icr_write; lapic_wait_icr_idle = x2apic_wait_icr_idle; x86_issue_ipi = x2apic_x86_issue_ipi; } else{ x2apic = 0; lapic_write = apic_write; lapic_read = apic_read; lapic_icr_write = apic_icr_write; lapic_wait_icr_idle = apic_wait_icr_idle; x86_issue_ipi = apic_x86_issue_ipi; } } void init_lapic() { if(!x2apic){ unsigned long baseaddr; /* Enable Local APIC */ baseaddr = rdmsr(MSR_IA32_APIC_BASE); if (!lapic_vp) { lapic_vp = map_fixed_area(baseaddr & PAGE_MASK, PAGE_SIZE, 1); } baseaddr |= 0x800; wrmsr(MSR_IA32_APIC_BASE, baseaddr); } lapic_write(LAPIC_SPURIOUS, 0x1ff); lapic_write(LAPIC_LVTPC, LOCAL_PERF_VECTOR); } void print_msr(int idx) { int bit; unsigned long long val; val = rdmsr(idx); __kprintf("MSR 0x%x val (dec): %llu\n", idx, val); __kprintf("MSR 0x%x val (hex): 0x%llx\n", idx, val); __kprintf(" "); for (bit = 63; bit >= 0; --bit) { __kprintf("%3d", bit); } __kprintf("\n"); __kprintf("MSR 0x%x val (bin):", idx); for (bit = 63; bit >= 0; --bit) { __kprintf("%3d", (val & ((unsigned long)1 << bit)) ? 1 : 0); } __kprintf("\n"); } void init_pstate_and_turbo(void) { uint64_t value; uint64_t eax, ecx; if (running_on_kvm()) return; asm volatile("cpuid" : "=a" (eax), "=c" (ecx) : "a" (0x6) : "%rbx", "%rdx"); if (!(ecx & 0x01)) { /* P-states and/or Turbo Boost are not supported. 
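	 * (i.e. the !(ecx & 0x01) check above found CPUID.06H:ECX bit 0 clear,
	 * so no MSR programming is attempted)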
*/ return; } /* Query and set max pstate value: * * IA32_PERF_CTL (0x199H) bit 15:0: * Target performance State Value * * The base operating ratio can be read * from MSR_PLATFORM_INFO[15:8]. */ value = rdmsr(MSR_PLATFORM_INFO); value &= 0xFF00; /* Turbo boost setting: * Bit 1 of EAX in Leaf 06H (i.e. CPUID.06H:EAX[1]) indicates opportunistic * processor performance operation, such as IDA, has been enabled by BIOS. * * IA32_PERF_CTL (0x199H) bit 32: IDA (i.e., turbo boost) Engage. (R/W) * When set to 1: disengages IDA * When set to 0: enables IDA */ if ((eax & (1 << 1))) { if (!no_turbo) { uint64_t turbo_value; turbo_value = rdmsr(MSR_NHM_TURBO_RATIO_LIMIT); turbo_value &= 0xFF; value = turbo_value << 8; /* Enable turbo boost */ value &= ~((uint64_t)1 << 32); } /* Turbo boost feature is supported, but requested to be turned off */ else { /* Disable turbo boost */ value |= (uint64_t)1 << 32; } } wrmsr(MSR_IA32_PERF_CTL, value); /* IA32_ENERGY_PERF_BIAS (0x1B0H) bit 3:0: * (The processor supports this capability if CPUID.06H:ECX.SETBH[bit 3] is set.) * Power Policy Preference: * 0 indicates preference to highest performance. * 15 indicates preference to maximize energy saving. * * Set energy/perf bias to high performance */ if (ecx & (1 << 3)) { wrmsr(MSR_IA32_ENERGY_PERF_BIAS, 0); } //print_msr(MSR_IA32_MISC_ENABLE); //print_msr(MSR_IA32_PERF_CTL); //print_msr(MSR_IA32_ENERGY_PERF_BIAS); } enum { PAT_UC = 0, /* uncached */ PAT_WC = 1, /* Write combining */ PAT_WT = 4, /* Write Through */ PAT_WP = 5, /* Write Protected */ PAT_WB = 6, /* Write Back (default) */ PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */ }; #define PAT(x, y) ((uint64_t)PAT_ ## y << ((x)*8)) void init_pat(void) { uint64_t pat; uint64_t edx; /* * An operating system or executive can detect the availability of the * PAT by executing the CPUID instruction with a value of 1 in the EAX * register. Support for the PAT is indicated by the PAT flag (bit 16 * of the values returned to EDX register). If the PAT is supported, * the operating system or executive can use the IA32_PAT MSR to program * the PAT. When memory types have been assigned to entries in the PAT, * software can then use of the PAT-index bit (PAT) in the page-table and * page-directory entries along with the PCD and PWT bits to assign memory * types from the PAT to individual pages. */ asm volatile("cpuid" : "=d" (edx) : "a" (0x1) : "%rbx", "%rcx"); if (!(edx & ((uint64_t)1 << 16))) { kprintf("PAT not supported.\n"); return; } /* Set PWT to Write-Combining. 
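	 * As a worked example of the PAT() macro used below:
	 * PAT(1, WC) evaluates to (uint64_t)PAT_WC << (1*8) = 0x100,
	 * i.e. byte 1 of IA32_PAT is programmed as 01H (write-combining),
	 * which a PTE then selects with PCD=0, PWT=1.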
All other bits stay the same */ /* (Based on Linux' settings) * * PTE encoding used in Linux: * PAT * |PCD * ||PWT * ||| * 000 WB _PAGE_CACHE_WB * 001 WC _PAGE_CACHE_WC * 010 UC- _PAGE_CACHE_UC_MINUS * 011 UC _PAGE_CACHE_UC * PAT bit unused */ pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC); /* Boot CPU check */ if (!boot_pat_state) boot_pat_state = rdmsr(MSR_IA32_CR_PAT); wrmsr(MSR_IA32_CR_PAT, pat); dkprintf("PAT support detected and reconfigured.\n"); } static void set_kstack(unsigned long ptr) { struct x86_cpu_local_variables *v; v = get_x86_this_cpu_local(); v->kernel_stack = ptr; v->tss.rsp0 = ptr; } static void init_smp_processor(void) { struct x86_cpu_local_variables *v; unsigned long tss_addr; unsigned node_cpu; v = get_x86_this_cpu_local(); tss_addr = (unsigned long)&v->tss; if(x2apic_is_enabled()){ v->apic_id = rdmsr(IA32_X2APIC_APICID); } else{ v->apic_id = lapic_read(LAPIC_ID) >> LAPIC_ID_SHIFT; } memcpy(v->gdt, gdt, sizeof(v->gdt)); memset(&v->tss, 0, sizeof(v->tss)); v->gdt[GLOBAL_TSS_ENTRY] = (sizeof(v->tss) - 1) | ((tss_addr & 0xffffff) << 16) | (0x89UL << 40) | ((tss_addr & 0xff000000) << 32); v->gdt[GLOBAL_TSS_ENTRY + 1] = (tss_addr >> 32); node_cpu = v->processor_id; /* assumes NUMA node 0 */ v->gdt[GETCPU_ENTRY] |= node_cpu; v->gdt_ptr.size = sizeof(v->gdt) - 1; v->gdt_ptr.address = (unsigned long)v->gdt; /* Load the new GDT, and set up CS, DS and SS. */ reload_gdt(&v->gdt_ptr); set_kstack((unsigned long)get_x86_this_cpu_kstack()); /* MSR_IA32_TSC_AUX on KVM seems broken */ if (running_on_kvm()) return; #define MSR_IA32_TSC_AUX 0xc0000103 wrmsr(MSR_IA32_TSC_AUX, node_cpu); } static char *trampoline_va, *first_page_va; /*@ @ assigns torampoline_va; @ assigns first_page_va; @*/ void ihk_mc_init_ap(void) { struct ihk_mc_cpu_info *cpu_info = ihk_mc_get_cpu_info(); trampoline_va = map_fixed_area(ap_trampoline, AP_TRAMPOLINE_SIZE, 0); kprintf("Trampoline area: 0x%lx \n", ap_trampoline); first_page_va = map_fixed_area(0, PAGE_SIZE, 0); kprintf("# of cpus : %d\n", cpu_info->ncpus); init_processors_local(cpu_info->ncpus); /* Do initialization for THIS cpu (BSP) */ assign_processor_id(); init_smp_processor(); } extern void init_page_table(void); extern char x86_syscall[]; long (*__x86_syscall_handler)(int, ihk_mc_user_context_t *); void init_syscall(void) { unsigned long r; r = rdmsr(MSR_EFER); r |= 1; /* SYSCALL Enable */ wrmsr(MSR_EFER, r); r = (((unsigned long)KERNEL_CS) << 32) | (((unsigned long)USER_CS) << 48); wrmsr(MSR_STAR, r); wrmsr(MSR_LSTAR, (unsigned long)x86_syscall); } static void enable_page_protection_fault(void) { asm volatile ( "pushf ;" "cli ;" "mov %%cr0,%%rax;" "or $0x10000,%%rax;" "mov %%rax,%%cr0;" "popf" ::: "%rax"); return; } static int no_execute_available = 0; static void enable_no_execute(void) { unsigned long efer; if (!no_execute_available) { return; } efer = rdmsr(MSR_EFER); #define IA32_EFER_NXE (1UL << 11) efer |= IA32_EFER_NXE; wrmsr(MSR_EFER, efer); return; } static void check_no_execute(void) { uint32_t edx; extern void enable_ptattr_no_execute(void); /* check Execute Disable Bit available bit */ asm ("cpuid" : "=d" (edx) : "a" (0x80000001) : "%rbx", "%rcx"); no_execute_available = (edx & (1 << 20))? 1: 0; kprintf("no_execute_available: %d\n", no_execute_available); if (no_execute_available) { enable_ptattr_no_execute(); } return; } void init_gettime_support(void) { uint64_t op; uint64_t eax; uint64_t ebx; uint64_t ecx; uint64_t edx; /* Check if Invariant TSC supported. 
* Processor's support for invariant TSC is indicated by * CPUID.80000007H:EDX[8]. * See page 2498 of the Intel64 and IA-32 Architectures Software * Developer's Manual - combined */ op = 0x80000007; asm volatile("cpuid" : "=a"(eax),"=b"(ebx),"=c"(ecx),"=d"(edx) : "a" (op)); if (edx & (1 << 8)) { gettime_local_support = 1; kprintf("Invariant TSC supported.\n"); } } void init_cpu(void) { enable_page_protection_fault(); enable_no_execute(); init_fpu(); init_lapic(); init_syscall(); x86_init_perfctr(); init_pstate_and_turbo(); init_pat(); } void setup_x86_phase1(void) { cpu_disable_interrupt(); init_idt(); init_gdt(); init_page_table(); } void setup_x86_phase2(void) { check_no_execute(); init_lapic_bsp(); init_cpu(); init_gettime_support(); kprintf("setup_x86 done.\n"); } static volatile int cpu_boot_status; void call_ap_func(void (*next_func)(void)) { cpu_boot_status = 1; next_func(); } struct page_table *get_init_page_table(void); void setup_x86_ap(void (*next_func)(void)) { unsigned long rsp; cpu_disable_interrupt(); ihk_mc_load_page_table(get_init_page_table()); assign_processor_id(); init_smp_processor(); reload_idt(); init_cpu(); rsp = (unsigned long)get_x86_this_cpu_kstack(); asm volatile("movq %0, %%rdi\n" "movq %1, %%rsp\n" "call *%2" : : "r"(next_func), "r"(rsp), "r"(call_ap_func) : "rdi"); while(1); } void arch_show_interrupt_context(const void *reg); extern void tlb_flush_handler(int vector); void __show_stack(uintptr_t *sp) { while (((uintptr_t)sp >= 0xffff800000000000) && ((uintptr_t)sp < 0xffffffff80000000)) { uintptr_t fp; uintptr_t ip; fp = sp[0]; ip = sp[1]; kprintf("IP: %016lx, SP: %016lx, FP: %016lx\n", ip, (uintptr_t)sp, fp); sp = (void *)fp; } return; } void show_context_stack(uintptr_t *rbp) { __show_stack(rbp); return; } void interrupt_exit(struct x86_user_context *regs) { if (interrupt_from_user(regs)) { cpu_enable_interrupt(); check_sig_pending(); check_need_resched(); check_signal(0, regs, -1); } else { check_sig_pending(); } } void handle_interrupt(int vector, struct x86_user_context *regs) { struct ihk_mc_interrupt_handler *h; struct cpu_local_var *v = get_this_cpu_local_var(); lapic_ack(); ++v->in_interrupt; set_cputime(interrupt_from_user(regs) ? 
CPUTIME_MODE_U2K : CPUTIME_MODE_K2K_IN); dkprintf("CPU[%d] got interrupt, vector: %d, RIP: 0x%lX\n", ihk_mc_get_processor_id(), vector, regs->gpr.rip); if (vector < 0 || vector > 255) { panic("Invalid interrupt vector."); } else if (vector < 32) { struct siginfo info; switch(vector){ case 0: memset(&info, '\0', sizeof info); info.si_signo = SIGFPE; info.si_code = FPE_INTDIV; info._sifields._sigfault.si_addr = (void *)regs->gpr.rip; set_signal(SIGFPE, regs, &info); break; case 9: case 16: case 19: set_signal(SIGFPE, regs, NULL); break; case 4: case 5: set_signal(SIGSEGV, regs, NULL); break; case 6: memset(&info, '\0', sizeof info); info.si_signo = SIGILL; info.si_code = ILL_ILLOPN; info._sifields._sigfault.si_addr = (void *)regs->gpr.rip; set_signal(SIGILL, regs, &info); break; case 10: set_signal(SIGSEGV, regs, NULL); break; case 11: case 12: set_signal(SIGBUS, regs, NULL); break; case 17: memset(&info, '\0', sizeof info); info.si_signo = SIGBUS; info.si_code = BUS_ADRALN; set_signal(SIGBUS, regs, &info); break; default: kprintf("Exception %d, rflags: 0x%lX CS: 0x%lX, RIP: 0x%lX\n", vector, regs->gpr.rflags, regs->gpr.cs, regs->gpr.rip); arch_show_interrupt_context(regs); panic("Unhandled exception"); } } else if (vector == LOCAL_TIMER_VECTOR) { unsigned long irqstate; /* Timer interrupt, enabled only on oversubscribed CPU cores, * request reschedule */ irqstate = ihk_mc_spinlock_lock(&v->runq_lock); v->flags |= CPU_FLAG_NEED_RESCHED; ihk_mc_spinlock_unlock(&v->runq_lock, irqstate); dkprintf("timer[%lu]: CPU_FLAG_NEED_RESCHED \n", rdtsc()); do_backlog(); } else if (vector == LOCAL_PERF_VECTOR) { struct siginfo info; unsigned long value; struct thread *thread = cpu_local_var(current); struct process *proc = thread->proc; long irqstate; struct mckfd *fdp; lapic_write(LAPIC_LVTPC, LOCAL_PERF_VECTOR); value = rdmsr(MSR_PERF_GLOBAL_STATUS); wrmsr(MSR_PERF_GLOBAL_OVF_CTRL, value); wrmsr(MSR_PERF_GLOBAL_OVF_CTRL, 0); irqstate = ihk_mc_spinlock_lock(&proc->mckfd_lock); for(fdp = proc->mckfd; fdp; fdp = fdp->next) { if(fdp->sig_no > 0) break; } ihk_mc_spinlock_unlock(&proc->mckfd_lock, irqstate); if(fdp) { memset(&info, '\0', sizeof info); info.si_signo = fdp->sig_no; info._sifields._sigfault.si_addr = (void *)regs->gpr.rip; info._sifields._sigpoll.si_fd = fdp->fd; set_signal(fdp->sig_no, regs, &info); } else { set_signal(SIGIO, regs, NULL); } } else if (vector >= IHK_TLB_FLUSH_IRQ_VECTOR_START && vector < IHK_TLB_FLUSH_IRQ_VECTOR_END) { tlb_flush_handler(vector); } else if (vector == LOCAL_SMP_FUNC_CALL_VECTOR) { smp_func_call_handler(); } else if (vector == 133) { show_context_stack((uintptr_t *)regs->gpr.rbp); } else { list_for_each_entry(h, &handlers[vector - 32], list) { if (h->func) { h->func(h->priv); } } } interrupt_exit(regs); set_cputime(interrupt_from_user(regs) ? CPUTIME_MODE_K2U : CPUTIME_MODE_K2K_OUT); --v->in_interrupt; /* for migration by IPI */ if (v->flags & CPU_FLAG_NEED_MIGRATE) { schedule(); check_signal(0, regs, 0); } } void gpe_handler(struct x86_user_context *regs) { set_cputime(interrupt_from_user(regs) ? CPUTIME_MODE_U2K : CPUTIME_MODE_K2K_IN); kprintf("General protection fault (err: %lx, %lx:%lx)\n", regs->gpr.error, regs->gpr.cs, regs->gpr.rip); arch_show_interrupt_context(regs); if ((regs->gpr.cs & 3) == 0) { panic("gpe_handler"); } set_signal(SIGSEGV, regs, NULL); interrupt_exit(regs); set_cputime(interrupt_from_user(regs) ? 
CPUTIME_MODE_K2U : CPUTIME_MODE_K2K_OUT); panic("GPF"); } void debug_handler(struct x86_user_context *regs) { unsigned long db6; int si_code = 0; struct siginfo info; set_cputime(interrupt_from_user(regs) ? CPUTIME_MODE_U2K : CPUTIME_MODE_K2K_IN); #ifdef DEBUG_PRINT_CPU kprintf("debug exception (err: %lx, %lx:%lx)\n", regs->gpr.error, regs->gpr.cs, regs->gpr.rip); arch_show_interrupt_context(regs); #endif asm("mov %%db6, %0" :"=r" (db6)); if (db6 & DB6_BS) { regs->gpr.rflags &= ~RFLAGS_TF; si_code = TRAP_TRACE; } else if (db6 & (DB6_B3|DB6_B2|DB6_B1|DB6_B0)) { si_code = TRAP_HWBKPT; } memset(&info, '\0', sizeof info); info.si_code = si_code; set_signal(SIGTRAP, regs, &info); interrupt_exit(regs); set_cputime(interrupt_from_user(regs) ? CPUTIME_MODE_K2U : CPUTIME_MODE_K2K_OUT); } void int3_handler(struct x86_user_context *regs) { struct siginfo info; set_cputime(interrupt_from_user(regs) ? CPUTIME_MODE_U2K : CPUTIME_MODE_K2K_IN); #ifdef DEBUG_PRINT_CPU kprintf("int3 exception (err: %lx, %lx:%lx)\n", regs->gpr.error, regs->gpr.cs, regs->gpr.rip); arch_show_interrupt_context(regs); #endif memset(&info, '\0', sizeof info); info.si_code = TRAP_BRKPT; set_signal(SIGTRAP, regs, &info); interrupt_exit(regs); set_cputime(interrupt_from_user(regs) ? CPUTIME_MODE_K2U : CPUTIME_MODE_K2K_OUT); } static void outb(uint8_t v, uint16_t port) { asm volatile("outb %0, %1" : : "a" (v), "d" (port)); } static void set_warm_reset_vector(unsigned long ip) { x86_set_warm_reset(ip, first_page_va); } static void __x86_wakeup(int apicid, unsigned long ip) { int retry = 3; set_warm_reset_vector(ip); /* Clear the error */ lapic_write(LAPIC_ESR, 0); lapic_read(LAPIC_ESR); /* INIT */ x86_issue_ipi(apicid, APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT); x86_issue_ipi(apicid, APIC_INT_LEVELTRIG | APIC_DM_INIT); lapic_wait_icr_idle(); while (retry--) { lapic_read(LAPIC_ESR); x86_issue_ipi(apicid, APIC_DM_STARTUP | (ip >> 12)); lapic_wait_icr_idle(); arch_delay(200); if (cpu_boot_status) break; } } /** IHK Functions **/ /*@ @ assigns \nothing; @ ensures \interrupt_disabled == 0; @*/ void cpu_halt(void) { asm volatile("hlt"); } /*@ @ assigns \nothing; @ ensures \interrupt_disabled == 0; @*/ void cpu_safe_halt(void) { asm volatile("sti; hlt"); } /*@ @ assigns \nothing; @ ensures \interrupt_disabled == 0; @*/ void cpu_enable_interrupt(void) { asm volatile("sti"); } /*@ @ assigns \nothing; @ ensures \interrupt_disabled > 0; @*/ void cpu_disable_interrupt(void) { asm volatile("cli"); } /*@ @ assigns \nothing; @ behavior to_enabled: @ assumes flags & RFLAGS_IF; @ ensures \interrupt_disabled == 0; @ behavior to_disabled: @ assumes !(flags & RFLAGS_IF); @ ensures \interrupt_disabled > 0; @*/ void cpu_restore_interrupt(unsigned long flags) { asm volatile("push %0; popf" : : "g"(flags) : "memory", "cc"); } /*@ @ assigns \nothing; @*/ void cpu_pause(void) { asm volatile("pause" ::: "memory"); } /*@ @ assigns \nothing; @ ensures \interrupt_disabled > 0; @ behavior from_enabled: @ assumes \interrupt_disabled == 0; @ ensures \result & RFLAGS_IF; @ behavior from_disabled: @ assumes \interrupt_disabled > 0; @ ensures !(\result & RFLAGS_IF); @*/ unsigned long cpu_disable_interrupt_save(void) { unsigned long flags; asm volatile("pushf; pop %0; cli" : "=r"(flags) : : "memory", "cc"); return flags; } unsigned long cpu_enable_interrupt_save(void) { unsigned long flags; asm volatile("pushf; pop %0; sti" : "=r"(flags) : : "memory", "cc"); return flags; } /*@ @ behavior valid_vector: @ assumes 32 <= vector <= 255; @ requires \valid(h); @ 
assigns handlers[vector-32]; @ ensures \result == 0; @ behavior invalid_vector: @ assumes (vector < 32) || (255 < vector); @ assigns \nothing; @ ensures \result == -EINVAL; @*/ int ihk_mc_register_interrupt_handler(int vector, struct ihk_mc_interrupt_handler *h) { if (vector < 32 || vector > 255) { return -EINVAL; } list_add_tail(&h->list, &handlers[vector - 32]); return 0; } int ihk_mc_unregister_interrupt_handler(int vector, struct ihk_mc_interrupt_handler *h) { list_del(&h->list); return 0; } extern unsigned long __page_fault_handler_address; /*@ @ requires \valid(h); @ assigns __page_fault_handler_address; @ ensures __page_fault_handler_address == h; @*/ void ihk_mc_set_page_fault_handler(void (*h)(void *, uint64_t, void *)) { __page_fault_handler_address = (unsigned long)h; } extern char trampoline_code_data[], trampoline_code_data_end[]; struct page_table *get_boot_page_table(void); unsigned long get_transit_page_table(void); /* reusable, but not reentrant */ /*@ @ requires \valid_apicid(cpuid); // valid APIC ID or not @ requires \valid(pc); @ requires \valid(trampoline_va); @ requires \valid(trampoline_code_data @ +(0..(trampoline_code_data_end - trampoline_code_data))); @ requires \valid_physical(ap_trampoline); // valid physical address or not @ assigns (char *)trampoline_va+(0..trampoline_code_data_end - trampoline_code_data); @ assigns cpu_boot_status; @ ensures cpu_boot_status != 0; @*/ void ihk_mc_boot_cpu(int cpuid, unsigned long pc) { unsigned long *p; p = (unsigned long *)trampoline_va; memcpy(p, trampoline_code_data, trampoline_code_data_end - trampoline_code_data); p[1] = (unsigned long)virt_to_phys(get_boot_page_table()); p[2] = (unsigned long)setup_x86_ap; p[3] = pc; p[4] = (unsigned long)get_x86_cpu_local_kstack(cpuid); p[6] = (unsigned long)get_transit_page_table(); if (!p[6]) { p[6] = p[1]; } cpu_boot_status = 0; __x86_wakeup(cpuid, ap_trampoline); /* XXX: Time out */ while (!cpu_boot_status) { cpu_pause(); } } /*@ @ requires \valid(new_ctx); @ requires (stack_pointer == NULL) || \valid((unsigned long *)stack_pointer-1); @ requires \valid(next_function); @*/ void ihk_mc_init_context(ihk_mc_kernel_context_t *new_ctx, void *stack_pointer, void (*next_function)(void)) { unsigned long *sp; if (!stack_pointer) { stack_pointer = get_x86_this_cpu_kstack(); } sp = stack_pointer; memset(new_ctx, 0, sizeof(ihk_mc_kernel_context_t)); /* Set the return address */ new_ctx->rsp = (unsigned long)(sp - 1); sp[-1] = (unsigned long)next_function; } extern char enter_user_mode[]; /* * Release runq_lock before entering user space. * This is needed because schedule() holds the runq lock throughout * the context switch and when a new process is created it starts * execution in enter_user_mode, which in turn calls this function. 
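 * The IRQ state saved when the lock was taken is kept in
 * cpu_local_var(runq_irqstate) and is restored together with the
 * unlock below.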
*/ void release_runq_lock(void) { ihk_mc_spinlock_unlock(&(cpu_local_var(runq_lock)), cpu_local_var(runq_irqstate)); } /*@ @ requires \valid(ctx); @ requires \valid(puctx); @ requires \valid((ihk_mc_user_context_t *)stack_pointer-1); @ requires \valid_user(new_pc); // valid user space address or not @ requires \valid_user(user_sp-1); @ assigns *((ihk_mc_user_context_t *)stack_pointer-1); @ assigns ctx->rsp0; @*/ void ihk_mc_init_user_process(ihk_mc_kernel_context_t *ctx, ihk_mc_user_context_t **puctx, void *stack_pointer, unsigned long new_pc, unsigned long user_sp) { char *sp; ihk_mc_user_context_t *uctx; sp = stack_pointer; sp -= sizeof(ihk_mc_user_context_t); uctx = (ihk_mc_user_context_t *)sp; *puctx = uctx; memset(uctx, 0, sizeof(ihk_mc_user_context_t)); uctx->gpr.cs = USER_CS; uctx->gpr.rip = new_pc; uctx->gpr.ss = USER_DS; uctx->gpr.rsp = user_sp; uctx->gpr.rflags = RFLAGS_IF; uctx->is_gpr_valid = 1; ihk_mc_init_context(ctx, sp, (void (*)(void))enter_user_mode); ctx->rsp0 = (unsigned long)stack_pointer; } /*@ @ behavior rsp: @ assumes reg == IHK_UCR_STACK_POINTER; @ requires \valid(uctx); @ assigns uctx->gpr.rsp; @ ensures uctx->gpr.rsp == value; @ behavior rip: @ assumes reg == IHK_UCR_PROGRAM_COUNTER; @ requires \valid(uctx); @ assigns uctx->gpr.rip; @ ensures uctx->gpr.rip == value; @*/ void ihk_mc_modify_user_context(ihk_mc_user_context_t *uctx, enum ihk_mc_user_context_regtype reg, unsigned long value) { if (reg == IHK_UCR_STACK_POINTER) { uctx->gpr.rsp = value; } else if (reg == IHK_UCR_PROGRAM_COUNTER) { uctx->gpr.rip = value; } } #ifdef POSTK_DEBUG_ARCH_DEP_42 /* /proc/cpuinfo support added. */ long ihk_mc_show_cpuinfo(char *buf, size_t buf_size, unsigned long read_off, int *eofp) { *eofp = 1; return -ENOMEM; } #endif /* POSTK_DEBUG_ARCH_DEP_42 */ void arch_clone_thread(struct thread *othread, unsigned long pc, unsigned long sp, struct thread *nthread) { return; } void ihk_mc_print_user_context(ihk_mc_user_context_t *uctx) { kprintf("CS:RIP = %04lx:%16lx\n", uctx->gpr.cs, uctx->gpr.rip); kprintf("%16lx %16lx %16lx %16lx\n%16lx %16lx %16lx\n", uctx->gpr.rax, uctx->gpr.rbx, uctx->gpr.rcx, uctx->gpr.rdx, uctx->gpr.rsi, uctx->gpr.rdi, uctx->gpr.rsp); } /*@ @ requires \valid(handler); @ assigns __x86_syscall_handler; @ ensures __x86_syscall_handler == handler; @*/ void ihk_mc_set_syscall_handler(long (*handler)(int, ihk_mc_user_context_t *)) { __x86_syscall_handler = handler; } /*@ @ assigns \nothing; @*/ void ihk_mc_delay_us(int us) { arch_delay(us); } void arch_show_extended_context(void) { unsigned long cr0, cr4, msr, xcr0 = 0; /* Read and print CRs, MSR_EFER, XCR0 */ asm volatile("movq %%cr0, %0" : "=r"(cr0)); asm volatile("movq %%cr4, %0" : "=r"(cr4)); msr = rdmsr(MSR_EFER); if (xsave_available) { xcr0 = xgetbv(0); } __kprintf("\n CR0 CR4\n"); __kprintf("%016lX %016lX\n", cr0, cr4); __kprintf(" MSR_EFER\n"); __kprintf("%016lX\n", msr); if (xsave_available) { __kprintf(" XCR0\n"); __kprintf("%016lX\n", xcr0); } } struct stack { struct stack *rbp; unsigned long eip; }; /* KPRINTF_LOCAL_BUF_LEN is 1024, useless to go further */ #define STACK_BUF_LEN (1024-sizeof("[ 0]: ")) static void __print_stack(struct stack *rbp, unsigned long first) { char buf[STACK_BUF_LEN]; size_t len; /* Build string in buffer to output a single line */ len = snprintf(buf, STACK_BUF_LEN, "addr2line -e smp-x86/kernel/mckernel.img -fpia"); if (first) len += snprintf(buf + len, STACK_BUF_LEN - len, " %#16lx", first); while ((unsigned long)rbp > 0xffff880000000000 && STACK_BUF_LEN - len > sizeof(" 
0x0123456789abcdef")) { len += snprintf(buf + len, STACK_BUF_LEN - len, " %#16lx", rbp->eip); rbp = rbp->rbp; } __kprintf("%s\n", buf); } void arch_print_pre_interrupt_stack(const struct x86_basic_regs *regs) { struct stack *rbp; /* only for kernel stack */ if (regs->error & PF_USER) return; __kprintf("Pre-interrupt stack trace:\n"); /* interrupt stack heuristics: * - the first entry looks like it is always garbage, so skip. * (that is done by taking regs->rsp instead of ®s->rsp) * - that still looks sometimes wrong. For now, if it is not * within 64k of itself, look for the next entry that matches. */ rbp = (struct stack*)regs->rsp; while ((uintptr_t)rbp > (uintptr_t)rbp->rbp || (uintptr_t)rbp + 0x10000 < (uintptr_t)rbp->rbp) rbp = (struct stack *)(((uintptr_t *)rbp) + 1); __print_stack(rbp, regs->rip); } void arch_print_stack(void) { struct stack *rbp; __kprintf("Approximative stack trace:\n"); asm("mov %%rbp, %0" : "=r"(rbp) ); __print_stack(rbp, 0); } /*@ @ requires \valid(reg); @ assigns \nothing; @*/ void arch_show_interrupt_context(const void *reg) { const struct x86_user_context *uctx = reg; const struct x86_basic_regs *regs = &uctx->gpr; unsigned long irqflags; irqflags = kprintf_lock(); __kprintf("CS:RIP = %4lx:%16lx\n", regs->cs, regs->rip); __kprintf(" RAX RBX RCX RDX\n"); __kprintf("%16lx %16lx %16lx %16lx\n", regs->rax, regs->rbx, regs->rcx, regs->rdx); __kprintf(" RSI RDI RSP RBP\n"); __kprintf("%16lx %16lx %16lx %16lx\n", regs->rsi, regs->rdi, regs->rsp, regs->rbp); __kprintf(" R8 R9 R10 R11\n"); __kprintf("%16lx %16lx %16lx %16lx\n", regs->r8, regs->r9, regs->r10, regs->r11); __kprintf(" R12 R13 R14 R15\n"); __kprintf("%16lx %16lx %16lx %16lx\n", regs->r12, regs->r13, regs->r14, regs->r15); __kprintf(" CS SS RFLAGS ERROR\n"); __kprintf("%16lx %16lx %16lx %16lx\n", regs->cs, regs->ss, regs->rflags, regs->error); kprintf_unlock(irqflags); return; arch_show_extended_context(); arch_print_pre_interrupt_stack(regs); kprintf_unlock(irqflags); } void arch_cpu_stop(void) { while (1) { cpu_halt(); } } /*@ @ behavior fs_base: @ assumes type == IHK_ASR_X86_FS; @ ensures \result == 0; @ behavior invaiid_type: @ assumes type != IHK_ASR_X86_FS; @ ensures \result == -EINVAL; @*/ int ihk_mc_arch_set_special_register(enum ihk_asr_type type, unsigned long value) { /* GS modification is not permitted */ switch (type) { case IHK_ASR_X86_FS: wrmsr(MSR_FS_BASE, value); return 0; default: return -EINVAL; } } /*@ @ behavior fs_base: @ assumes type == IHK_ASR_X86_FS; @ requires \valid(value); @ ensures \result == 0; @ behavior invalid_type: @ assumes type != IHK_ASR_X86_FS; @ ensures \result == -EINVAL; @*/ int ihk_mc_arch_get_special_register(enum ihk_asr_type type, unsigned long *value) { /* GS modification is not permitted */ switch (type) { case IHK_ASR_X86_FS: *value = rdmsr(MSR_FS_BASE); return 0; default: return -EINVAL; } } /*@ @ requires \valid_cpuid(cpu); // valid CPU logical ID @ ensures \result == 0 @*/ int ihk_mc_interrupt_cpu(int cpu, int vector) { if (cpu < 0 || cpu >= num_processors) { kprintf("%s: invalid CPU id: %d\n", __func__, cpu); return -1; } dkprintf("[%d] ihk_mc_interrupt_cpu: %d\n", ihk_mc_get_processor_id(), cpu); x86_issue_ipi(get_x86_cpu_local_variable(cpu)->apic_id, vector); return 0; } struct thread *arch_switch_context(struct thread *prev, struct thread *next) { struct thread *last; struct mcs_rwlock_node_irqsave lock; dkprintf("[%d] schedule: tlsblock_base: 0x%lX\n", ihk_mc_get_processor_id(), next->tlsblock_base); /* Set up new TLS.. 
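	 * ihk_mc_init_user_tlsbase() below ends up in
	 * do_arch_prctl(ARCH_SET_FS, ...), i.e. the FS base is switched to
	 * the next thread's tlsblock_base before the context switch.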
*/ ihk_mc_init_user_tlsbase(next->uctx, next->tlsblock_base); #ifdef ENABLE_PERF /* Performance monitoring inherit */ if(next->proc->monitoring_event) { if(next->proc->perf_status == PP_RESET) perf_reset(next->proc->monitoring_event); if(next->proc->perf_status != PP_COUNT) { perf_reset(next->proc->monitoring_event); perf_start(next->proc->monitoring_event); } } #endif #ifdef PROFILE_ENABLE if (prev && prev->profile && prev->profile_start_ts != 0) { prev->profile_elapsed_ts += (rdtsc() - prev->profile_start_ts); prev->profile_start_ts = 0; } if (next->profile && next->profile_start_ts == 0) { next->profile_start_ts = rdtsc(); } #endif if (prev) { mcs_rwlock_writer_lock(&prev->proc->update_lock, &lock); if (prev->proc->status & (PS_DELAY_STOPPED | PS_DELAY_TRACED)) { switch (prev->proc->status) { case PS_DELAY_STOPPED: prev->proc->status = PS_STOPPED; break; case PS_DELAY_TRACED: prev->proc->status = PS_TRACED; break; default: break; } mcs_rwlock_writer_unlock(&prev->proc->update_lock, &lock); /* Wake up the parent who tried wait4 and sleeping */ waitq_wakeup(&prev->proc->parent->waitpid_q); } else { mcs_rwlock_writer_unlock(&prev->proc->update_lock, &lock); } last = ihk_mc_switch_context(&prev->ctx, &next->ctx, prev); } else { last = ihk_mc_switch_context(NULL, &next->ctx, prev); } return last; } /*@ @ requires \valid(thread); @ ensures thread->fp_regs == NULL; @*/ void release_fp_regs(struct thread *thread) { int pages; if (thread && !thread->fp_regs) return; pages = (xsave_size + (PAGE_SIZE - 1)) >> PAGE_SHIFT; dkprintf("release_fp_regs: pages=%d\n", pages); ihk_mc_free_pages(thread->fp_regs, pages); thread->fp_regs = NULL; } static int check_and_allocate_fp_regs(struct thread *thread) { int pages; int result = 0; if (!thread->fp_regs) { pages = (xsave_size + (PAGE_SIZE - 1)) >> PAGE_SHIFT; dkprintf("save_fp_regs: pages=%d\n", pages); thread->fp_regs = ihk_mc_alloc_pages(pages, IHK_MC_AP_NOWAIT); if (!thread->fp_regs) { kprintf("error: allocating fp_regs pages\n"); result = -ENOMEM; goto out; } memset(thread->fp_regs, 0, pages * PAGE_SIZE); } out: return result; } /*@ @ requires \valid(thread); @*/ int save_fp_regs(struct thread *thread) { int ret = 0; ret = check_and_allocate_fp_regs(thread); if (ret) { goto out; } if (xsave_available) { unsigned int low, high; /* Request full save of x87, SSE, AVX and AVX-512 states */ low = (unsigned int)xsave_mask; high = (unsigned int)(xsave_mask >> 32); asm volatile("xsave %0" : : "m" (*thread->fp_regs), "a" (low), "d" (high) : "memory"); dkprintf("fp_regs for TID %d saved\n", thread->tid); } out: return ret; } int copy_fp_regs(struct thread *from, struct thread *to) { int ret = 0; if (from->fp_regs != NULL) { ret = check_and_allocate_fp_regs(to); if (!ret) { memcpy(to->fp_regs, from->fp_regs, sizeof(fp_regs_struct)); } } return ret; } /*@ @ requires \valid(thread); @ assigns thread->fp_regs; @*/ void restore_fp_regs(struct thread *thread) { if (!thread->fp_regs) { // only clear fpregs. clear_fp_regs(); return; } if (xsave_available) { unsigned int low, high; /* Request full restore of x87, SSE, AVX and AVX-512 states */ low = (unsigned int)xsave_mask; high = (unsigned int)(xsave_mask >> 32); asm volatile("xrstor %0" : : "m" (*thread->fp_regs), "a" (low), "d" (high)); dkprintf("fp_regs for TID %d restored\n", thread->tid); } // XXX: why release?? 
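	// (Keeping fp_regs allocated here means check_and_allocate_fp_regs()
	// does not have to re-allocate the buffer on the next save.)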
//release_fp_regs(thread); } void clear_fp_regs(void) { struct cpu_local_var *v = get_this_cpu_local_var(); restore_fp_regs(&v->idle); } ihk_mc_user_context_t *lookup_user_context(struct thread *thread) { ihk_mc_user_context_t *uctx = thread->uctx; if ((!(thread->status & (PS_INTERRUPTIBLE | PS_UNINTERRUPTIBLE | PS_STOPPED | PS_TRACED)) && (thread != cpu_local_var(current))) || !uctx->is_gpr_valid) { return NULL; } if (!uctx->is_sr_valid) { uctx->sr.fs_base = thread->tlsblock_base; uctx->sr.gs_base = 0; uctx->sr.ds = 0; uctx->sr.es = 0; uctx->sr.fs = 0; uctx->sr.gs = 0; uctx->is_sr_valid = 1; } return uctx; } /* lookup_user_context() */ extern long do_arch_prctl(unsigned long code, unsigned long address); void ihk_mc_init_user_tlsbase(ihk_mc_user_context_t *ctx, unsigned long tls_base_addr) { do_arch_prctl(ARCH_SET_FS, tls_base_addr); } void arch_flush_icache_all(void) { return; } /*@ @ assigns \nothing; @*/ void init_tick(void) { dkprintf("init_tick():\n"); return; } /*@ @ assigns \nothing; @*/ void init_delay(void) { dkprintf("init_delay():\n"); return; } /*@ @ assigns \nothing; @*/ void sync_tick(void) { dkprintf("sync_tick():\n"); return; } static int is_pvclock_available(void) { uint32_t eax; uint32_t ebx; uint32_t ecx; uint32_t edx; dkprintf("is_pvclock_available()\n"); #define KVM_CPUID_SIGNATURE 0x40000000 asm ("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) : "a" (KVM_CPUID_SIGNATURE)); if ((eax && (eax < 0x40000001)) || (ebx != 0x4b4d564b) || (ecx != 0x564b4d56) || (edx != 0x0000004d)) { dkprintf("is_pvclock_available(): false (not kvm)\n"); return 0; } #define KVM_CPUID_FEATURES 0x40000001 asm ("cpuid" : "=a"(eax) : "a"(KVM_CPUID_FEATURES) : "%ebx", "%ecx", "%edx"); #define KVM_FEATURE_CLOCKSOURCE2 3 if (eax & (1 << KVM_FEATURE_CLOCKSOURCE2)) { #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 pvti_msr = MSR_KVM_SYSTEM_TIME_NEW; dkprintf("is_pvclock_available(): true (new)\n"); return 1; } #define KVM_FEATURE_CLOCKSOURCE 0 else if (eax & (1 << KVM_FEATURE_CLOCKSOURCE)) { #define MSR_KVM_SYSTEM_TIME 0x12 pvti_msr = MSR_KVM_SYSTEM_TIME; dkprintf("is_pvclock_available(): true (old)\n"); return 1; } dkprintf("is_pvclock_available(): false (not supported)\n"); return 0; } /* is_pvclock_available() */ int arch_setup_pvclock(void) { size_t size; int npages; dkprintf("arch_setup_pvclock()\n"); if (!is_pvclock_available()) { dkprintf("arch_setup_pvclock(): not supported\n"); return 0; } size = num_processors * sizeof(*pvti); npages = (size + PAGE_SIZE - 1) / PAGE_SIZE; pvti_npages = npages; pvti = ihk_mc_alloc_pages(npages, IHK_MC_AP_NOWAIT); if (!pvti) { ekprintf("arch_setup_pvclock: allocate_pages failed.\n"); return -ENOMEM; } memset(pvti, 0, PAGE_SIZE*npages); dkprintf("arch_setup_pvclock(): ok\n"); return 0; } /* arch_setup_pvclock() */ void arch_start_pvclock(void) { int cpu; intptr_t phys; dkprintf("arch_start_pvclock()\n"); if (!pvti) { dkprintf("arch_start_pvclock(): not supported\n"); return; } cpu = ihk_mc_get_processor_id(); phys = virt_to_phys(&pvti[cpu]); #define KVM_SYSTEM_TIME_ENABLE 0x1 wrmsr(pvti_msr, phys|KVM_SYSTEM_TIME_ENABLE); dkprintf("arch_start_pvclock(): ok\n"); return; } /* arch_start_pvclock() */ #define KVM_CPUID_SIGNATURE 0x40000000 int running_on_kvm(void) { static const char signature[12] = "KVMKVMKVM\0\0"; const uint32_t *sigptr = (const uint32_t *)signature; uint64_t op; uint64_t eax; uint64_t ebx; uint64_t ecx; uint64_t edx; op = KVM_CPUID_SIGNATURE; asm volatile("cpuid" : "=a"(eax),"=b"(ebx),"=c"(ecx),"=d"(edx) : "a" (op)); if (ebx == sigptr[0] && ecx == 
sigptr[1] && edx == sigptr[2]) { return 1; } return 0; } void mod_nmi_ctx(void *nmi_ctx, void (*func)()) { unsigned long *l = nmi_ctx; int i; unsigned long flags; asm volatile("pushf; pop %0" : "=r"(flags) : : "memory", "cc"); for (i = 0; i < 22; i++) l[i] = l[i + 5]; l[i++] = (unsigned long)func; // return address l[i++] = 0x20; // KERNEL CS l[i++] = flags & ~RFLAGS_IF; // rflags (disable interrupt) l[i++] = (unsigned long)(l + 27); // ols rsp l[i++] = 0x28; // KERNEL DS } void arch_save_panic_regs(void *irq_regs) { struct thread *current = cpu_local_var(current); struct x86_user_context *regs = (struct x86_user_context *)irq_regs; struct x86_cpu_local_variables *x86v = get_x86_cpu_local_variable(ihk_mc_get_processor_id()); struct segment_regs { uint32_t rflags; uint32_t cs; uint32_t ss; uint32_t ds; uint32_t es; uint32_t fs; uint32_t gs; } *sregs; /* Kernel space? */ if (regs->gpr.rip > USER_END) { x86v->panic_regs[0] = regs->gpr.rax; x86v->panic_regs[1] = regs->gpr.rbx; x86v->panic_regs[2] = regs->gpr.rcx; x86v->panic_regs[3] = regs->gpr.rdx; x86v->panic_regs[4] = regs->gpr.rsi; x86v->panic_regs[5] = regs->gpr.rdi; x86v->panic_regs[6] = regs->gpr.rbp; x86v->panic_regs[7] = regs->gpr.rsp; x86v->panic_regs[8] = regs->gpr.r8; x86v->panic_regs[9] = regs->gpr.r9; x86v->panic_regs[10] = regs->gpr.r10; x86v->panic_regs[11] = regs->gpr.r11; x86v->panic_regs[12] = regs->gpr.r12; x86v->panic_regs[13] = regs->gpr.r13; x86v->panic_regs[14] = regs->gpr.r14; x86v->panic_regs[15] = regs->gpr.r15; x86v->panic_regs[16] = regs->gpr.rip; sregs = (struct segment_regs *)&x86v->panic_regs[17]; sregs->rflags = regs->gpr.rflags; sregs->cs = regs->gpr.cs; sregs->ss = regs->gpr.ss; sregs->ds = regs->sr.ds; sregs->es = regs->sr.es; sregs->fs = regs->sr.fs; sregs->gs = regs->sr.gs; } /* User-space, show kernel context */ else { kprintf("%s: in user-space: %p\n", __func__, regs->gpr.rip); x86v->panic_regs[0] = 0; x86v->panic_regs[1] = current->ctx.rbx; x86v->panic_regs[2] = 0; x86v->panic_regs[3] = 0; x86v->panic_regs[4] = current->ctx.rsi; x86v->panic_regs[5] = current->ctx.rdi; x86v->panic_regs[6] = current->ctx.rbp; x86v->panic_regs[7] = current->ctx.rsp; x86v->panic_regs[8] = 0; x86v->panic_regs[9] = 0; x86v->panic_regs[10] = 0; x86v->panic_regs[11] = 0; x86v->panic_regs[12] = regs->gpr.r12; x86v->panic_regs[13] = regs->gpr.r13; x86v->panic_regs[14] = regs->gpr.r14; x86v->panic_regs[15] = regs->gpr.r15; x86v->panic_regs[16] = (unsigned long)enter_user_mode; sregs = (struct segment_regs *)&x86v->panic_regs[17]; sregs->rflags = regs->gpr.rflags; sregs->cs = regs->gpr.cs; sregs->ss = regs->gpr.ss; sregs->ds = regs->sr.ds; sregs->es = regs->sr.es; sregs->fs = regs->sr.fs; sregs->gs = regs->sr.gs; } x86v->paniced = 1; } void arch_clear_panic(void) { struct x86_cpu_local_variables *x86v = get_x86_cpu_local_variable(ihk_mc_get_processor_id()); x86v->paniced = 0; } int arch_cpu_read_write_register( struct ihk_os_cpu_register *desc, enum mcctrl_os_cpu_operation op) { if (op == MCCTRL_OS_CPU_READ_REGISTER) { desc->val = rdmsr(desc->addr); } else if (op == MCCTRL_OS_CPU_WRITE_REGISTER) { wrmsr(desc->addr, desc->val); } else { return -1; } return 0; } /* * Generic remote CPU function invocation facility. 
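 *
 * Illustrative usage sketch (the callback name and CPU mask below are
 * hypothetical, for documentation only):
 *
 *	static int do_work(int cpu_index, int nr_cpus, void *arg)
 *	{
 *		kprintf("running as %d of %d\n", cpu_index, nr_cpus);
 *		return 0;
 *	}
 *
 *	cpu_set_t set = ...;	// target CPUs, possibly including this one
 *	int ret = smp_call_func(&set, do_work, NULL);
 *
 * smp_call_func() queues one smp_func_call_request per remote CPU,
 * kicks them with LOCAL_SMP_FUNC_CALL_VECTOR IPIs, runs the callback
 * locally if this CPU is in the mask, waits for cpus_left to reach
 * zero, and returns the first non-zero callback result (or 0).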
*/ static void smp_func_call_handler(void) { int irq_flags; struct smp_func_call_request *req; int reqs_left; reiterate: req = NULL; reqs_left = 0; irq_flags = ihk_mc_spinlock_lock( &cpu_local_var(smp_func_req_lock)); /* Take requests one-by-one */ if (!list_empty(&cpu_local_var(smp_func_req_list))) { req = list_first_entry(&cpu_local_var(smp_func_req_list), struct smp_func_call_request, list); list_del(&req->list); reqs_left = !list_empty(&cpu_local_var(smp_func_req_list)); } ihk_mc_spinlock_unlock(&cpu_local_var(smp_func_req_lock), irq_flags); if (req) { req->ret = req->sfcd->func(req->cpu_index, req->sfcd->nr_cpus, req->sfcd->arg); ihk_atomic_dec(&req->sfcd->cpus_left); } if (reqs_left) goto reiterate; } int smp_call_func(cpu_set_t *__cpu_set, smp_func_t __func, void *__arg) { int cpu, nr_cpus = 0; int cpu_index = 0; int this_cpu_index = 0; struct smp_func_call_data sfcd; struct smp_func_call_request *reqs; int ret = 0; int call_on_this_cpu = 0; cpu_set_t cpu_set; /* Sanity checks */ if (!__cpu_set || !__func) { return -EINVAL; } /* Make sure it won't change in between */ cpu_set = *__cpu_set; for_each_set_bit(cpu, (unsigned long *)&cpu_set, sizeof(cpu_set) * BITS_PER_BYTE) { if (cpu == ihk_mc_get_processor_id()) { call_on_this_cpu = 1; } ++nr_cpus; } if (!nr_cpus) { return -EINVAL; } reqs = kmalloc(sizeof(*reqs) * nr_cpus, IHK_MC_AP_NOWAIT); if (!reqs) { ret = -ENOMEM; goto free_out; } sfcd.nr_cpus = nr_cpus; sfcd.func = __func; sfcd.arg = __arg; ihk_atomic_set(&sfcd.cpus_left, call_on_this_cpu ? nr_cpus - 1 : nr_cpus); /* Add requests and send IPIs */ cpu_index = 0; for_each_set_bit(cpu, (unsigned long *)&cpu_set, sizeof(cpu_set) * BITS_PER_BYTE) { unsigned long irq_flags; reqs[cpu_index].cpu_index = cpu_index; reqs[cpu_index].ret = 0; if (cpu == ihk_mc_get_processor_id()) { this_cpu_index = cpu_index; ++cpu_index; continue; } reqs[cpu_index].sfcd = &sfcd; irq_flags = ihk_mc_spinlock_lock(&get_cpu_local_var(cpu)->smp_func_req_lock); list_add_tail(&reqs[cpu_index].list, &get_cpu_local_var(cpu)->smp_func_req_list); ihk_mc_spinlock_unlock(&get_cpu_local_var(cpu)->smp_func_req_lock, irq_flags); ihk_mc_interrupt_cpu(cpu, LOCAL_SMP_FUNC_CALL_VECTOR); ++cpu_index; } /* Is this CPU involved? */ if (call_on_this_cpu) { reqs[this_cpu_index].ret = __func(this_cpu_index, nr_cpus, __arg); } /* Wait for the rest of the CPUs */ while (ihk_atomic_read(&sfcd.cpus_left) > 0) { cpu_pause(); } /* Check return values, if error, report the first non-zero */ for (cpu_index = 0; cpu_index < nr_cpus; ++cpu_index) { if (reqs[cpu_index].ret != 0) { ret = reqs[cpu_index].ret; goto free_out; } } ret = 0; free_out: kfree(reqs); return ret; } extern int nmi_mode; extern long freeze_thaw(void *nmi_ctx); void multi_nm_interrupt_handler(void *irq_regs) { dkprintf("%s: ...\n", __func__); switch (nmi_mode) { case 1: case 2: /* mode == 1 or 2, for FREEZER NMI */ dkprintf("%s: freeze mode NMI catch. (nmi_mode=%d)\n", __func__, nmi_mode); freeze_thaw(NULL); break; case 0: /* mode == 0, for MEMDUMP NMI */ arch_save_panic_regs(irq_regs); ihk_mc_query_mem_areas(); /* memdump-nmi is halted McKernel, break is unnecessary. 
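		 * After ihk_mc_query_mem_areas() this CPU deliberately falls
		 * through to the case-3 loop below and halts until nmi_mode
		 * becomes 4.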
		 */
		/* fall through */
	case 3: /* mode == 3, for SHUTDOWN-WAIT NMI */
		kprintf("%s: STOP\n", __func__);
		while (nmi_mode != 4)
			cpu_halt();
		break;
	case 4: /* mode == 4, continue NMI */
		arch_clear_panic();
		if (!ihk_mc_get_processor_id()) {
			ihk_mc_clear_dump_page_completion();
		}
		kprintf("%s: RESUME, nmi_mode: %d\n", __func__, nmi_mode);
		break;
	default:
		ekprintf("%s: Unknown nmi-mode(%d) detected.\n", __func__, nmi_mode);
		break;
	}
}

/*** end of file ***/