/* archdeps.c COPYRIGHT FUJITSU LIMITED 2016-2018 */
#include <linux/version.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
#include <asm/vdso.h>
#include <asm/vvar.h>
#include <asm/vgtod.h>
#include "config.h"
#include "../../mcctrl.h"
#include "../../kallsyms_compat.h"

#if LINUX_VERSION_CODE < KERNEL_VERSION(5,8,0) && \
	defined(CONFIG_X86_VSYSCALL_EMULATION)
#define gtod (&VVAR(vsyscall_gtod_data))
#else
#define gtod NULL
#endif

//#define SC_DEBUG

#ifdef SC_DEBUG
#define dprintk(...) printk(__VA_ARGS__)
#else
#define dprintk(...)
#endif

#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0)
static struct vdso_image *_vdso_image_64;
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 23)
static void *vdso_start;
static void *vdso_end;
static struct page **vdso_pages;
#endif
static void *__vvar_page_ptr;
static long *hpet_address;
static void **hv_clock;

int arch_symbols_init(void)
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 16, 0)
	_vdso_image_64 = (void *) mcctrl_lookup_name("vdso_image_64");
	if (WARN_ON(!_vdso_image_64))
		return -EFAULT;
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 23)
	vdso_start = (void *) mcctrl_lookup_name("vdso_start");
	if (WARN_ON(!vdso_start))
		return -EFAULT;

	vdso_end = (void *) mcctrl_lookup_name("vdso_end");
	if (WARN_ON(!vdso_end))
		return -EFAULT;

	vdso_pages = (void *) mcctrl_lookup_name("vdso_pages");
	if (WARN_ON(!vdso_pages))
		return -EFAULT;
#endif

	__vvar_page_ptr = (void *) &__vvar_page;
	if (WARN_ON(!__vvar_page_ptr))
		return -EFAULT;

	hpet_address = (void *) mcctrl_lookup_name("hpet_address");
	hv_clock = (void *) mcctrl_lookup_name("hv_clock");

	return 0;
}

#define VDSO_MAXPAGES 2
struct vdso {
	long busy;
	int vdso_npages;
	char vvar_is_global;
	char hpet_is_global;
	char pvti_is_global;
	char padding;
	long vdso_physlist[VDSO_MAXPAGES];
	void *vvar_virt;
	long vvar_phys;
	void *hpet_virt;
	long hpet_phys;
	void *pvti_virt;
	long pvti_phys;
	void *vgtod_virt;
};

unsigned long reserve_user_space_common(struct mcctrl_usrdata *usrdata,
		unsigned long start, unsigned long end);

int reserve_user_space(struct mcctrl_usrdata *usrdata,
		unsigned long *startp, unsigned long *endp)
{
	struct vm_area_struct *vma;
	unsigned long start = 0L;
	unsigned long end;

	if (mutex_lock_killable(&usrdata->reserve_lock) < 0) {
		return -1;
	}

#define DESIRED_USER_END	0x800000000000
#define GAP_FOR_MCEXEC		0x008000000000UL
	end = DESIRED_USER_END;
	mmap_write_lock(current->mm);
	vma = find_vma(current->mm, 0);
	if (vma) {
		end = (vma->vm_start - GAP_FOR_MCEXEC) & ~(GAP_FOR_MCEXEC - 1);
	}
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,5,0)
	mmap_write_unlock(current->mm);
#endif

	start = reserve_user_space_common(usrdata, start, end);

#if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0)
	mmap_write_unlock(current->mm);
#endif

	mutex_unlock(&usrdata->reserve_lock);

	if (IS_ERR_VALUE(start)) {
		return start;
	}
	*startp = start;
	*endp = end;
	return 0;
}
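
/*
 * Worked example for the end-address computation above, with a
 * hypothetical mapping address (not taken from the source): if mcexec's
 * lowest VMA starts at 0x7f1234567000, then
 *
 *   end = (0x7f1234567000 - 0x008000000000) & ~(0x008000000000 - 1)
 *       = 0x771234567000 & ~0x007fffffffff
 *       = 0x770000000000
 *
 * i.e. the reserved range ends at the next GAP_FOR_MCEXEC-aligned
 * (512 GiB) boundary below mcexec's own mappings, keeping at least
 * GAP_FOR_MCEXEC of separation between McKernel user space and mcexec.
 */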

void get_vdso_info(ihk_os_t os, long vdso_rpa)
{
	ihk_device_t dev = ihk_os_to_dev(os);
	long vdso_pa;
	struct vdso *vdso;
	size_t size;
	int i;

	vdso_pa = ihk_device_map_memory(dev, vdso_rpa, sizeof(*vdso));
	vdso = ihk_device_map_virtual(dev, vdso_pa, sizeof(*vdso), NULL, 0);

	/* VDSO pages */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
	size = _vdso_image_64->size;
	vdso->vdso_npages = size >> PAGE_SHIFT;
	if (vdso->vdso_npages > VDSO_MAXPAGES) {
		vdso->vdso_npages = 0;
		goto out;
	}
	for (i = 0; i < vdso->vdso_npages; ++i) {
		vdso->vdso_physlist[i] = virt_to_phys(
				_vdso_image_64->data + (i * PAGE_SIZE));
	}
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23)
	size = vdso_end - vdso_start;
	size = (size + PAGE_SIZE - 1) & PAGE_MASK;
	vdso->vdso_npages = size >> PAGE_SHIFT;
	if (vdso->vdso_npages > VDSO_MAXPAGES) {
		vdso->vdso_npages = 0;
		goto out;
	}
	for (i = 0; i < vdso->vdso_npages; ++i) {
		vdso->vdso_physlist[i] = page_to_phys(vdso_pages[i]);
	}
#endif

	/* VVAR page */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,5,0)
	vdso->vvar_is_global = 0;
	vdso->vvar_virt = (void *)(-3 * PAGE_SIZE);
	vdso->vvar_phys = virt_to_phys(__vvar_page_ptr);
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0)
	vdso->vvar_is_global = 0;
	vdso->vvar_virt = (void *)(-2 * PAGE_SIZE);
	vdso->vvar_phys = virt_to_phys(__vvar_page_ptr);
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
	vdso->vvar_is_global = 0;
	vdso->vvar_virt = (void *)(vdso->vdso_npages * PAGE_SIZE);
	vdso->vvar_phys = virt_to_phys(__vvar_page_ptr);
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,1,0)
	vdso->vvar_is_global = 1;
	vdso->vvar_virt = (void *)fix_to_virt(VVAR_PAGE);
	vdso->vvar_phys = virt_to_phys(__vvar_page_ptr);
#endif

	/* HPET page */
	if (hpet_address && *hpet_address) {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,5,0)
		vdso->hpet_is_global = 0;
		vdso->hpet_virt = (void *)(-2 * PAGE_SIZE);
		vdso->hpet_phys = *hpet_address;
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0)
		vdso->hpet_is_global = 0;
		vdso->hpet_virt = (void *)(-1 * PAGE_SIZE);
		vdso->hpet_phys = *hpet_address;
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
		vdso->hpet_is_global = 0;
		vdso->hpet_virt =
			(void *)((vdso->vdso_npages + 1) * PAGE_SIZE);
		vdso->hpet_phys = *hpet_address;
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23)
		vdso->hpet_is_global = 1;
		vdso->hpet_virt = (void *)fix_to_virt(VSYSCALL_HPET);
		vdso->hpet_phys = *hpet_address;
#endif
	}

	/* struct pvclock_vcpu_time_info table */
	if (hv_clock && *hv_clock) {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,5,0)
		vdso->pvti_is_global = 0;
		vdso->pvti_virt = (void *)(-1 * PAGE_SIZE);
		vdso->pvti_phys = virt_to_phys(*hv_clock);
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0)
		vdso->pvti_is_global = 1;
		vdso->pvti_virt = (void *)fix_to_virt(PVCLOCK_FIXMAP_BEGIN);
		vdso->pvti_phys = virt_to_phys(*hv_clock);
#endif
	}

	vdso->vgtod_virt = (void *)gtod;

out:
	wmb();
	vdso->busy = 0;

	ihk_device_unmap_virtual(dev, vdso, sizeof(*vdso));
	ihk_device_unmap_memory(dev, vdso_pa, sizeof(*vdso));
	return;
} /* get_vdso_info() */

/*
 * Note: 0xaf80 is a hard-coded per-CPU offset of the kernel's saved user
 * stack pointer slot; it is specific to the kernel build this module
 * targets.
 */
void *get_user_sp(void)
{
	unsigned long usp;

	asm volatile("movq %%gs:0xaf80, %0" : "=r" (usp));
	return (void *)usp;
}

void set_user_sp(void *usp)
{
	asm volatile("movq %0, %%gs:0xaf80" :: "r" (usp));
}

struct trans_uctx {
	volatile int cond;
	int fregsize;
	unsigned long rax;
	unsigned long rbx;
	unsigned long rcx;
	unsigned long rdx;
	unsigned long rsi;
	unsigned long rdi;
	unsigned long rbp;
	unsigned long r8;
	unsigned long r9;
	unsigned long r10;
	unsigned long r11;
	unsigned long r12;
	unsigned long r13;
	unsigned long r14;
	unsigned long r15;
	unsigned long rflags;
	unsigned long rip;
	unsigned long rsp;
	unsigned long fs;
};

void restore_tls(unsigned long addr)
{
	wrmsrl(MSR_FS_BASE, addr);
}

void save_tls_ctx(void __user *ctx)
{
	struct trans_uctx __user *tctx = ctx;
	struct trans_uctx kctx;

	if (copy_from_user(&kctx, tctx, sizeof(struct trans_uctx))) {
		pr_err("%s: copy_from_user failed.\n", __func__);
		return;
	}

	rdmsrl(MSR_FS_BASE, kctx.fs);

	/* Write the captured FS base back to the user-space context;
	 * without this the value read above would be discarded. */
	if (copy_to_user(tctx, &kctx, sizeof(struct trans_uctx))) {
		pr_err("%s: copy_to_user failed.\n", __func__);
		return;
	}
}

unsigned long get_tls_ctx(void __user *ctx)
{
	struct trans_uctx __user *tctx = ctx;
	struct trans_uctx kctx;

	if (copy_from_user(&kctx, tctx, sizeof(struct trans_uctx))) {
		pr_err("%s: copy_from_user failed.\n", __func__);
		return 0;
	}
	return kctx.fs;
}

unsigned long get_rsp_ctx(void *ctx)
{
	struct trans_uctx *tctx = ctx;

	return tctx->rsp;
}
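
/*
 * Illustration of the page-table walk below with a hypothetical remote
 * virtual address (not taken from the source). The walk peels off 9 bits
 * per level, starting at bit 39:
 *
 *   rva = 0x00007f8000201000
 *   PML4 index = (rva >> 39) & 0x1FF = 255
 *   PDPT index = (rva >> 30) & 0x1FF = 0
 *   PD   index = (rva >> 21) & 0x1FF = 1
 *   PT   index = (rva >> 12) & 0x1FF = 1
 *
 * An entry with PTE_PS set at the PDPT level (offsh == 30) or PD level
 * (offsh == 21) terminates the walk early with a 1 GiB or 2 MiB page.
 */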
int translate_rva_to_rpa(ihk_os_t os, unsigned long rpt, unsigned long rva,
		unsigned long *rpap, unsigned long *pgsizep)
{
	unsigned long rpa;
	int offsh;
	int i;
	int ix;
	unsigned long phys;
	unsigned long *pt;
	int error;
	unsigned long pgsize;

	rpa = rpt;
	offsh = 39;
	pgsize = 0;
	/* i = 0: PML4, 1: PDPT, 2: PDT, 3: PT */
	for (i = 0; i < 4; ++i) {
		ix = (rva >> offsh) & 0x1FF;
		phys = ihk_device_map_memory(ihk_os_to_dev(os), rpa,
				PAGE_SIZE);
		pt = ihk_device_map_virtual(ihk_os_to_dev(os), phys,
				PAGE_SIZE, NULL, 0);
		dprintk("rpa %#lx offsh %d ix %#x phys %#lx pt %p pt[ix] %#lx\n",
				rpa, offsh, ix, phys, pt, pt[ix]);

#define PTE_P	0x001
		if (!(pt[ix] & PTE_P)) {
			ihk_device_unmap_virtual(ihk_os_to_dev(os), pt,
					PAGE_SIZE);
			ihk_device_unmap_memory(ihk_os_to_dev(os), phys,
					PAGE_SIZE);
			error = -EFAULT;
			dprintk("Remote PTE is not present for 0x%lx (rpt: %lx) ?\n",
					rva, rpt);
			goto out;
		}

#define PTE_PS	0x080
		if (pt[ix] & PTE_PS) {
			pgsize = 1UL << offsh;
			rpa = pt[ix] & ((1UL << 52) - 1) & ~(pgsize - 1);
			rpa |= rva & (pgsize - 1);

			/* For GB pages, just report regular 2MB page */
			if (offsh == 30) {
				pgsize = 1UL << 21;
				dprintk("%s: GB page translated 0x%lx -> 0x%lx, pgsize: %lu\n",
						__FUNCTION__, rva, rpa,
						pgsize);
			}

			ihk_device_unmap_virtual(ihk_os_to_dev(os), pt,
					PAGE_SIZE);
			ihk_device_unmap_memory(ihk_os_to_dev(os), phys,
					PAGE_SIZE);
			error = 0;
			goto found;
		}

		rpa = pt[ix] & ((1UL << 52) - 1) & ~((1UL << 12) - 1);
		offsh -= 9;
		ihk_device_unmap_virtual(ihk_os_to_dev(os), pt, PAGE_SIZE);
		ihk_device_unmap_memory(ihk_os_to_dev(os), phys, PAGE_SIZE);
	}
	pgsize = 1UL << 12;
	rpa |= rva & (pgsize - 1);

found:
	error = 0;
	*rpap = rpa;
	*pgsizep = pgsize;

out:
	dprintk("translate_rva_to_rpa: %d rva %#lx --> rpa %#lx (%lx)\n",
			error, rva, rpa, pgsize);
	return error;
}

#define PFN_WRITE_COMBINED	_PAGE_PWT
/*
 * Under Linux's PAT programming, PWT=1 with PCD=0 selects the
 * write-combining memory type.
 */
static inline bool pte_is_write_combined(pte_t pte)
{
	return ((pte_flags(pte) & _PAGE_PWT) &&
		!(pte_flags(pte) & _PAGE_PCD));
}

/*
 * The assembly-level switch_ctx saves/loads the registers in the context;
 * TLS save/load and host_thread registration are done via ioctl, so
 * nothing remains to do here on x86_64.
 */
long arch_switch_ctx(struct uti_switch_ctx_desc *desc)
{
	return 0;
}
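
#ifdef SC_DEBUG
/*
 * Minimal usage sketch for translate_rva_to_rpa(), assuming the caller
 * already holds the remote page-table base (rpt) of the target McKernel
 * process. debug_translate() is illustrative only and not part of the
 * original mcctrl interface.
 */
static void __attribute__((unused))
debug_translate(ihk_os_t os, unsigned long rpt, unsigned long rva)
{
	unsigned long rpa = 0;
	unsigned long pgsize = 0;

	if (translate_rva_to_rpa(os, rpt, rva, &rpa, &pgsize)) {
		printk("debug_translate: rva %#lx not mapped\n", rva);
	} else {
		printk("debug_translate: rva %#lx -> rpa %#lx (pgsize %#lx)\n",
				rva, rpa, pgsize);
	}
}
#endif /* SC_DEBUG */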