/**
 * \file memory.c
 *  License details are found in the file LICENSE.
 * \brief
 *  Acquire physical pages and manipulate page table entries.
 * \author Taku Shimosawa \par
 *  Copyright (C) 2011 - 2012 Taku Shimosawa
 * \author Gou Nakamura \par
 *  Copyright (C) 2015 RIKEN AICS
 */
/*
 * HISTORY
 */

#include <ihk/cpu.h>
#include <ihk/debug.h>
#include <ihk/lock.h>
#include <ihk/mm.h>
#include <ihk/page_alloc.h>
#include <types.h>
#include <memory.h>
#include <string.h>
#include <errno.h>
#include <list.h>
#include <process.h>
#include <cls.h>

#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0)
#define ekprintf(...) kprintf(__VA_ARGS__)

static char *last_page;
extern char _head[], _end[];
extern unsigned long x86_kernel_phys_base;

/* Arch specific early allocation routine */
void *early_alloc_pages(int nr_pages)
{
	void *p;

	if (!last_page) {
		last_page = (char *)(((unsigned long)_end + PAGE_SIZE - 1)
				& PAGE_MASK);
		/* Convert the virtual address from text's to straight maps */
		last_page = phys_to_virt(virt_to_phys(last_page));
	} else if (last_page == (void *)-1) {
		panic("Early allocator is already finalized. Do not use it.\n");
	}
	p = last_page;
	last_page += (nr_pages * PAGE_SIZE);

	return p;
}

void early_alloc_invalidate(void)
{
	last_page = (void *)-1;
}

void *ihk_mc_allocate(int size, int flag)
{
	if (!cpu_local_var(kmalloc_initialized)) {
		kprintf("%s: error, kmalloc not yet initialized\n",
			__FUNCTION__);
		return NULL;
	}
	return kmalloc(size, IHK_MC_AP_NOWAIT);
}

void ihk_mc_free(void *p)
{
	if (!cpu_local_var(kmalloc_initialized)) {
		kprintf("%s: error, kmalloc not yet initialized\n",
			__FUNCTION__);
		return;
	}
	kfree(p);
}

void *get_last_early_heap(void)
{
	return last_page;
}

void flush_tlb(void)
{
	unsigned long cr3;

	asm volatile("movq %%cr3, %0; movq %0, %%cr3" : "=r"(cr3)
			: : "memory");
}

void flush_tlb_single(unsigned long addr)
{
	asm volatile("invlpg (%0)" :: "r" (addr) : "memory");
}

struct page_table {
	pte_t entry[PT_ENTRIES];
};

static struct page_table *init_pt;
static ihk_spinlock_t init_pt_lock;
static int use_1gb_page = 0;

static void check_available_page_size(void)
{
	uint32_t edx;

	asm ("cpuid" : "=d" (edx) : "a" (0x80000001) : "%rbx", "%rcx");
	use_1gb_page = (edx & (1 << 26)) ? 1 : 0;
	kprintf("use_1gb_page: %d\n", use_1gb_page);
	return;
}
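/*
 * Disabled usage sketch (not part of the build): the intended life cycle
 * of the early allocator above. Pages are handed out bump-pointer style
 * from just past the kernel image until the real page allocator is up,
 * at which point early_alloc_invalidate() poisons last_page so any later
 * call panics. The function names are the real ones from this file; the
 * call site itself is hypothetical.
 */
#if 0
static void early_alloc_example(void)
{
	/* Grab two pages for a temporary boot-time buffer. */
	void *buf = early_alloc_pages(2);

	memset(buf, 0, 2 * PAGE_SIZE);

	/* Once the page allocator takes over, seal the early allocator. */
	early_alloc_invalidate();
}
#endif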
static unsigned long setup_l2(struct page_table *pt,
		unsigned long page_head, unsigned long start,
		unsigned long end)
{
	int i;
	unsigned long phys;

	for (i = 0; i < PT_ENTRIES; i++) {
		phys = page_head + ((unsigned long)i << PTL2_SHIFT);

		if (phys + PTL2_SIZE <= start || phys >= end) {
			pt->entry[i] = 0;
			continue;
		}

		pt->entry[i] = phys | PFL2_KERN_ATTR | PFL2_SIZE;
	}

	return virt_to_phys(pt);
}

static unsigned long setup_l3(struct page_table *pt,
		unsigned long page_head, unsigned long start,
		unsigned long end)
{
	int i;
	unsigned long phys, pt_phys;

	for (i = 0; i < PT_ENTRIES; i++) {
		phys = page_head + ((unsigned long)i << PTL3_SHIFT);

		if (phys + PTL3_SIZE <= start || phys >= end) {
			pt->entry[i] = 0;
			continue;
		}

		pt_phys = setup_l2(ihk_mc_alloc_pages(1, IHK_MC_AP_CRITICAL),
				phys, start, end);
		pt->entry[i] = pt_phys | PFL3_PDIR_ATTR;
	}

	return virt_to_phys(pt);
}

static void init_normal_area(struct page_table *pt)
{
	unsigned long map_start, map_end, phys, pt_phys;
	int ident_index, virt_index;

	map_start = ihk_mc_get_memory_address(IHK_MC_GMA_MAP_START, 0);
	map_end = ihk_mc_get_memory_address(IHK_MC_GMA_MAP_END, 0);
	kprintf("map_start = %lx, map_end = %lx\n", map_start, map_end);
	ident_index = map_start >> PTL4_SHIFT;
	virt_index = (MAP_ST_START >> PTL4_SHIFT) & (PT_ENTRIES - 1);

	memset(pt, 0, sizeof(struct page_table));

	for (phys = (map_start & ~(PTL4_SIZE - 1)); phys < map_end;
			phys += PTL4_SIZE) {
		pt_phys = setup_l3(ihk_mc_alloc_pages(1, IHK_MC_AP_CRITICAL),
				phys, map_start, map_end);
		pt->entry[ident_index++] = pt_phys | PFL4_PDIR_ATTR;
		pt->entry[virt_index++] = pt_phys | PFL4_PDIR_ATTR;
	}
}

static struct page_table *__alloc_new_pt(enum ihk_mc_ap_flag ap_flag)
{
	struct page_table *newpt = ihk_mc_alloc_pages(1, ap_flag);

	if (newpt)
		memset(newpt, 0, sizeof(struct page_table));

	return newpt;
}
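/*
 * Worked example for init_normal_area() above, assuming the usual x86_64
 * 4-level shifts (PTL4_SHIFT == 39) and 512-entry tables: each newly
 * built L3 table is installed twice in the PML4, once at the identity
 * slot and once at the straight-map slot, so one physical range is
 * reachable through both windows. For a map starting at 4 GiB:
 *
 *   map_start   = 0x100000000
 *   ident_index = 0x100000000 >> 39              = 0
 *   virt_index  = (MAP_ST_START >> 39) & 511     (kernel-half slot)
 *
 * Both PML4 entries then point at the same L3 page, so the boot-time
 * identity access resolves through ident_index while
 * phys_to_virt(p) == p + MAP_ST_START resolves through virt_index.
 */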
/*
 * XXX: Confusingly, L4 and L3 automatically add PRESENT,
 *      but L2 and L1 do not!
 */

enum ihk_mc_pt_attribute attr_mask = 0
	| PTATTR_FILEOFF
	| PTATTR_WRITABLE
	| PTATTR_USER
	| PTATTR_ACTIVE
	| 0;
#define	ATTR_MASK	attr_mask

void enable_ptattr_no_execute(void)
{
	attr_mask |= PTATTR_NO_EXECUTE;
	return;
}

#if 0
static unsigned long attr_to_l4attr(enum ihk_mc_pt_attribute attr)
{
	return (attr & ATTR_MASK) | PFL4_PRESENT;
}
#endif

static unsigned long attr_to_l3attr(enum ihk_mc_pt_attribute attr)
{
	unsigned long r = (attr & (ATTR_MASK | PTATTR_LARGEPAGE));

	if ((attr & PTATTR_UNCACHABLE) && (attr & PTATTR_LARGEPAGE)) {
		return r | PFL3_PCD | PFL3_PWT;
	}
	return r;
}

static unsigned long attr_to_l2attr(enum ihk_mc_pt_attribute attr)
{
	unsigned long r = (attr & (ATTR_MASK | PTATTR_LARGEPAGE));

	if ((attr & PTATTR_UNCACHABLE) && (attr & PTATTR_LARGEPAGE)) {
		return r | PFL2_PCD | PFL2_PWT;
	}
	return r;
}

static unsigned long attr_to_l1attr(enum ihk_mc_pt_attribute attr)
{
	if (attr & PTATTR_UNCACHABLE) {
		return (attr & ATTR_MASK) | PFL1_PCD | PFL1_PWT;
	} else if (attr & PTATTR_WRITE_COMBINED) {
		return (attr & ATTR_MASK) | PFL1_PWT;
	} else {
		return (attr & ATTR_MASK);
	}
}

#define GET_VIRT_INDICES(virt, l4i, l3i, l2i, l1i) \
	l4i = ((virt) >> PTL4_SHIFT) & (PT_ENTRIES - 1); \
	l3i = ((virt) >> PTL3_SHIFT) & (PT_ENTRIES - 1); \
	l2i = ((virt) >> PTL2_SHIFT) & (PT_ENTRIES - 1); \
	l1i = ((virt) >> PTL1_SHIFT) & (PT_ENTRIES - 1)

#define GET_INDICES_VIRT(l4i, l3i, l2i, l1i) \
	( ((uint64_t)(l4i) << PTL4_SHIFT) \
	| ((uint64_t)(l3i) << PTL3_SHIFT) \
	| ((uint64_t)(l2i) << PTL2_SHIFT) \
	| ((uint64_t)(l1i) << PTL1_SHIFT) \
	)

void set_pte(pte_t *ppte, unsigned long phys, enum ihk_mc_pt_attribute attr)
{
	if (attr & PTATTR_LARGEPAGE) {
		*ppte = phys | attr_to_l2attr(attr) | PFL2_SIZE;
	} else {
		*ppte = phys | attr_to_l1attr(attr);
	}
}
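/*
 * Worked example of the index macros above, assuming the standard x86_64
 * 4-level shifts (PTL1_SHIFT 12, PTL2_SHIFT 21, PTL3_SHIFT 30,
 * PTL4_SHIFT 39) and 512-entry tables; the address is arbitrary:
 *
 *   virt = 0x00007f8000201000
 *   l4i = (virt >> 39) & 511 = 255
 *   l3i = (virt >> 30) & 511 = 0
 *   l2i = (virt >> 21) & 511 = 1
 *   l1i = (virt >> 12) & 511 = 1
 *
 * GET_INDICES_VIRT(255, 0, 1, 1) rebuilds 0x7f8000201000. Note that the
 * macro does not sign-extend, so kernel-half addresses (l4i >= 256) need
 * the upper bits filled in by the caller to be canonical.
 */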
#if 0
/*
 * get_pte()
 *
 * Description: walks the page tables (creates tables if not existing)
 *              and returns a pointer to the PTE corresponding to the
 *              virtual address.
 */
pte_t *get_pte(struct page_table *pt, void *virt,
		enum ihk_mc_pt_attribute attr, enum ihk_mc_ap_flag ap_flag)
{
	int l4idx, l3idx, l2idx, l1idx;
	unsigned long v = (unsigned long)virt;
	struct page_table *newpt;

	if (!pt) {
		pt = init_pt;
	}
	GET_VIRT_INDICES(v, l4idx, l3idx, l2idx, l1idx);

	/* TODO: more detailed attribute check */
	if (pt->entry[l4idx] & PFL4_PRESENT) {
		pt = phys_to_virt(pt->entry[l4idx] & PAGE_MASK);
	} else {
		if ((newpt = __alloc_new_pt(ap_flag)) == NULL)
			return NULL;
		pt->entry[l4idx] = virt_to_phys(newpt) | attr_to_l4attr(attr);
		pt = newpt;
	}

	if (pt->entry[l3idx] & PFL3_PRESENT) {
		pt = phys_to_virt(pt->entry[l3idx] & PAGE_MASK);
	} else {
		if ((newpt = __alloc_new_pt(ap_flag)) == NULL)
			return NULL;
		pt->entry[l3idx] = virt_to_phys(newpt) | attr_to_l3attr(attr);
		pt = newpt;
	}

	/* PTATTR_LARGEPAGE */
	if (attr & PTATTR_LARGEPAGE) {
		return &(pt->entry[l2idx]);
	}

	/* Requested regular page, but large is allocated? */
	if (pt->entry[l2idx] & PFL2_SIZE) {
		return NULL;
	}

	if (pt->entry[l2idx] & PFL2_PRESENT) {
		pt = phys_to_virt(pt->entry[l2idx] & PAGE_MASK);
	} else {
		if ((newpt = __alloc_new_pt(ap_flag)) == NULL)
			return NULL;
		pt->entry[l2idx] = virt_to_phys(newpt) | attr_to_l2attr(attr)
			| PFL2_PRESENT;
		pt = newpt;
	}

	return &(pt->entry[l1idx]);
}
#endif

static int __set_pt_page(struct page_table *pt, void *virt,
		unsigned long phys, enum ihk_mc_pt_attribute attr)
{
	int l4idx, l3idx, l2idx, l1idx;
	unsigned long v = (unsigned long)virt;
	struct page_table *newpt;
	enum ihk_mc_ap_flag ap_flag;
	int in_kernel =
		(((unsigned long long)virt) >= 0xffff000000000000ULL);
	unsigned long init_pt_lock_flags;
	int ret = -ENOMEM;

	init_pt_lock_flags = 0;	/* for avoidance of warning */
	if (in_kernel) {
		init_pt_lock_flags = ihk_mc_spinlock_lock(&init_pt_lock);
	}

	ap_flag = (attr & PTATTR_FOR_USER) ?
		IHK_MC_AP_NOWAIT : IHK_MC_AP_CRITICAL;

	if (!pt) {
		pt = init_pt;
	}

	if (attr & PTATTR_LARGEPAGE) {
		phys &= LARGE_PAGE_MASK;
	} else {
		phys &= PAGE_MASK;
	}

	GET_VIRT_INDICES(v, l4idx, l3idx, l2idx, l1idx);

	/* TODO: more detailed attribute check */
	if (pt->entry[l4idx] & PFL4_PRESENT) {
		pt = phys_to_virt(pt->entry[l4idx] & PAGE_MASK);
	} else {
		if ((newpt = __alloc_new_pt(ap_flag)) == NULL)
			goto out;
		pt->entry[l4idx] = virt_to_phys(newpt) | PFL4_PDIR_ATTR;
		pt = newpt;
	}

	if (pt->entry[l3idx] & PFL3_PRESENT) {
		pt = phys_to_virt(pt->entry[l3idx] & PAGE_MASK);
	} else {
		if ((newpt = __alloc_new_pt(ap_flag)) == NULL)
			goto out;
		pt->entry[l3idx] = virt_to_phys(newpt) | PFL3_PDIR_ATTR;
		pt = newpt;
	}

	if (attr & PTATTR_LARGEPAGE) {
		if (pt->entry[l2idx] & PFL2_PRESENT) {
			if ((pt->entry[l2idx] & PAGE_MASK) != phys) {
				goto out;
			} else {
				ret = 0;
				goto out;
			}
		} else {
			pt->entry[l2idx] = phys | attr_to_l2attr(attr)
				| PFL2_SIZE;
			ret = 0;
			goto out;
		}
	}

	if (pt->entry[l2idx] & PFL2_PRESENT) {
		pt = phys_to_virt(pt->entry[l2idx] & PAGE_MASK);
	} else {
		if ((newpt = __alloc_new_pt(ap_flag)) == NULL)
			goto out;
		pt->entry[l2idx] = virt_to_phys(newpt) | PFL2_PDIR_ATTR;
		pt = newpt;
	}

	if (pt->entry[l1idx] & PFL1_PRESENT) {
		if ((pt->entry[l1idx] & PT_PHYSMASK) != phys) {
			kprintf("EBUSY: page table for 0x%lX is already set\n",
					virt);
			ret = -EBUSY;
			goto out;
		} else {
			ret = 0;
			goto out;
		}
	}

	pt->entry[l1idx] = phys | attr_to_l1attr(attr);
	ret = 0;
out:
	if (in_kernel) {
		ihk_mc_spinlock_unlock(&init_pt_lock, init_pt_lock_flags);
	}
	return ret;
}

static int __clear_pt_page(struct page_table *pt, void *virt, int largepage)
{
	int l4idx, l3idx, l2idx, l1idx;
	unsigned long v = (unsigned long)virt;

	if (!pt) {
		pt = init_pt;
	}

	if (largepage) {
		v &= LARGE_PAGE_MASK;
	} else {
		v &= PAGE_MASK;
	}

	GET_VIRT_INDICES(v, l4idx, l3idx, l2idx, l1idx);

	if (!(pt->entry[l4idx] & PFL4_PRESENT)) {
		return -EINVAL;
	}
	pt = phys_to_virt(pt->entry[l4idx] & PAGE_MASK);

	if (!(pt->entry[l3idx] & PFL3_PRESENT)) {
		return -EINVAL;
	}
	pt = phys_to_virt(pt->entry[l3idx] & PAGE_MASK);

	if (largepage) {
		if (!(pt->entry[l2idx] & PFL2_PRESENT)) {
			return -EINVAL;
		} else {
			pt->entry[l2idx] = 0;
			return 0;
		}
	}

	if (!(pt->entry[l2idx] & PFL2_PRESENT)) {
		return -EINVAL;
	}
	pt = phys_to_virt(pt->entry[l2idx] & PAGE_MASK);

	pt->entry[l1idx] = 0;

	return 0;
}

uint64_t ihk_mc_pt_virt_to_pagemap(struct page_table *pt, unsigned long virt)
{
	int error;
	unsigned long phys;
	uint64_t pagemap;

	error = ihk_mc_pt_virt_to_phys(pt, (void *)virt, &phys);
	if (error) {
		return 0;
	}

	pagemap = PM_PFRAME(phys >> PAGE_SHIFT);
	pagemap |= PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT;

	return pagemap;
}
int ihk_mc_pt_virt_to_phys_size(struct page_table *pt, const void *virt,
		unsigned long *phys, unsigned long *size)
{
	int l4idx, l3idx, l2idx, l1idx;
	unsigned long v = (unsigned long)virt;

	if (!pt) {
		pt = init_pt;
	}

	GET_VIRT_INDICES(v, l4idx, l3idx, l2idx, l1idx);

	if (!(pt->entry[l4idx] & PFL4_PRESENT)) {
		return -EFAULT;
	}
	pt = phys_to_virt(pte_get_phys(&pt->entry[l4idx]));

	if (!(pt->entry[l3idx] & PFL3_PRESENT)) {
		return -EFAULT;
	}
	if ((pt->entry[l3idx] & PFL3_SIZE)) {
		*phys = pte_get_phys(&pt->entry[l3idx])
			| (v & (PTL3_SIZE - 1));
		if (size) *size = PTL3_SIZE;
		return 0;
	}
	pt = phys_to_virt(pte_get_phys(&pt->entry[l3idx]));

	if (!(pt->entry[l2idx] & PFL2_PRESENT)) {
		return -EFAULT;
	}
	if ((pt->entry[l2idx] & PFL2_SIZE)) {
		*phys = pte_get_phys(&pt->entry[l2idx])
			| (v & (PTL2_SIZE - 1));
		if (size) *size = PTL2_SIZE;
		return 0;
	}
	pt = phys_to_virt(pte_get_phys(&pt->entry[l2idx]));

	if (!(pt->entry[l1idx] & PFL1_PRESENT)) {
		return -EFAULT;
	}

	*phys = pte_get_phys(&pt->entry[l1idx]) | (v & (PTL1_SIZE - 1));
	if (size) *size = PTL1_SIZE;
	return 0;
}

int ihk_mc_pt_virt_to_phys(struct page_table *pt,
		const void *virt, unsigned long *phys)
{
	return ihk_mc_pt_virt_to_phys_size(pt, virt, phys, NULL);
}
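/*
 * Disabled usage sketch: resolving a virtual address through the walker
 * above. Passing pt == NULL walks init_pt; -EFAULT means some level was
 * not present. The caller and its variables are hypothetical.
 */
#if 0
static void virt_to_phys_example(void *some_addr)
{
	unsigned long phys;
	unsigned long size;

	if (ihk_mc_pt_virt_to_phys_size(NULL, some_addr, &phys, &size) == 0) {
		/* size tells which level hit: 4 KiB, 2 MiB or 1 GiB */
		kprintf("%p -> %lx (page size %lx)\n", some_addr, phys, size);
	}
}
#endif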
int ihk_mc_pt_print_pte(struct page_table *pt, void *virt)
{
	int l4idx, l3idx, l2idx, l1idx;
	unsigned long v = (unsigned long)virt;

	if (!pt) {
		pt = init_pt;
	}

	GET_VIRT_INDICES(v, l4idx, l3idx, l2idx, l1idx);

	if (!(pt->entry[l4idx] & PFL4_PRESENT)) {
		__kprintf("0x%lX l4idx not present! \n", (unsigned long)virt);
		__kprintf("l4 entry: 0x%lX\n", pt->entry[l4idx]);
		return -EFAULT;
	}
	pt = phys_to_virt(pt->entry[l4idx] & PAGE_MASK);

	__kprintf("l3 table: 0x%lX l3idx: %d \n", virt_to_phys(pt), l3idx);
	if (!(pt->entry[l3idx] & PFL3_PRESENT)) {
		__kprintf("0x%lX l3idx not present! \n", (unsigned long)virt);
		__kprintf("l3 entry: 0x%lX\n", pt->entry[l3idx]);
		return -EFAULT;
	}
	pt = phys_to_virt(pt->entry[l3idx] & PAGE_MASK);

	__kprintf("l2 table: 0x%lX l2idx: %d \n", virt_to_phys(pt), l2idx);
	if (!(pt->entry[l2idx] & PFL2_PRESENT)) {
		__kprintf("0x%lX l2idx not present! \n", (unsigned long)virt);
		__kprintf("l2 entry: 0x%lX\n", pt->entry[l2idx]);
		return -EFAULT;
	}
	if ((pt->entry[l2idx] & PFL2_SIZE)) {
		return 0;
	}
	pt = phys_to_virt(pt->entry[l2idx] & PAGE_MASK);

	__kprintf("l1 table: 0x%lX l1idx: %d \n", virt_to_phys(pt), l1idx);
	if (!(pt->entry[l1idx] & PFL1_PRESENT)) {
		__kprintf("0x%lX l1idx not present! \n", (unsigned long)virt);
		__kprintf("l1 entry: 0x%lX\n", pt->entry[l1idx]);
		return -EFAULT;
	}
	__kprintf("l1 entry: 0x%lX\n", pt->entry[l1idx]);

	return 0;
}

int set_pt_large_page(struct page_table *pt, void *virt,
		unsigned long phys, enum ihk_mc_pt_attribute attr)
{
	return __set_pt_page(pt, virt, phys,
			attr | PTATTR_LARGEPAGE | PTATTR_ACTIVE);
}

int ihk_mc_pt_set_large_page(page_table_t pt, void *virt,
		unsigned long phys, enum ihk_mc_pt_attribute attr)
{
	return __set_pt_page(pt, virt, phys,
			attr | PTATTR_LARGEPAGE | PTATTR_ACTIVE);
}

int ihk_mc_pt_set_page(page_table_t pt, void *virt,
		unsigned long phys, enum ihk_mc_pt_attribute attr)
{
	return __set_pt_page(pt, virt, phys, attr | PTATTR_ACTIVE);
}

int ihk_mc_pt_prepare_map(page_table_t p, void *virt, unsigned long size,
		enum ihk_mc_pt_prepare_flag flag)
{
	int l4idx, l4e, ret = 0;
	unsigned long v = (unsigned long)virt;
	struct page_table *pt = p, *newpt;
	unsigned long l;
	enum ihk_mc_pt_attribute attr = PTATTR_WRITABLE;

	if (!pt) {
		pt = init_pt;
	}

	l4idx = ((v) >> PTL4_SHIFT) & (PT_ENTRIES - 1);

	if (flag == IHK_MC_PT_FIRST_LEVEL) {
		l4e = ((v + size) >> PTL4_SHIFT) & (PT_ENTRIES - 1);

		for (; l4idx <= l4e; l4idx++) {
			if (pt->entry[l4idx] & PFL4_PRESENT) {
				return 0;
			} else {
				newpt = __alloc_new_pt(IHK_MC_AP_CRITICAL);
				if (!newpt) {
					ret = -ENOMEM;
				} else {
					pt->entry[l4idx] = virt_to_phys(newpt)
						| PFL4_PDIR_ATTR;
				}
			}
		}
	} else {
		/* Call without ACTIVE flag */
		l = v + size;
		for (; v < l; v += PAGE_SIZE) {
			if ((ret = __set_pt_page(pt, (void *)v, 0, attr))) {
				break;
			}
		}
	}

	return ret;
}

struct page_table *ihk_mc_pt_create(enum ihk_mc_ap_flag ap_flag)
{
	struct page_table *pt = ihk_mc_alloc_pages(1, ap_flag);

	if (pt == NULL)
		return NULL;

	memset(pt->entry, 0, PAGE_SIZE);
	/* Copy the kernel space */
	memcpy(pt->entry + PT_ENTRIES / 2, init_pt->entry + PT_ENTRIES / 2,
			sizeof(pt->entry[0]) * PT_ENTRIES / 2);

	return pt;
}

static void destroy_page_table(int level, struct page_table *pt)
{
	int ix;
	unsigned long entry;
	struct page_table *lower;

	if ((level < 1) || (4 < level)) {
		panic("destroy_page_table: level is out of range");
	}
	if (pt == NULL) {
		panic("destroy_page_table: pt is NULL");
	}

	if (level > 1) {
		for (ix = 0; ix < PT_ENTRIES; ++ix) {
			entry = pt->entry[ix];
			if (!(entry & PF_PRESENT)) {
				/* entry is not valid */
				continue;
			}
			if (entry & PF_SIZE) {
				/* not a page table */
				continue;
			}
			lower = (struct page_table *)
				phys_to_virt(entry & PT_PHYSMASK);
			destroy_page_table(level-1, lower);
		}
	}

	ihk_mc_free_pages(pt, 1);
	return;
}

void ihk_mc_pt_destroy(struct page_table *pt)
{
	const int level = 4;	/* PML4 */

	/* clear shared entry */
	memset(pt->entry + PT_ENTRIES / 2, 0,
			sizeof(pt->entry[0]) * PT_ENTRIES / 2);

	destroy_page_table(level, pt);
	return;
}

int ihk_mc_pt_clear_page(page_table_t pt, void *virt)
{
	return __clear_pt_page(pt, virt, 0);
}

int ihk_mc_pt_clear_large_page(page_table_t pt, void *virt)
{
	return __clear_pt_page(pt, virt, 1);
}

typedef int walk_pte_fn_t(void *args, pte_t *ptep, uint64_t base,
		uint64_t start, uint64_t end);
static int walk_pte_l1(struct page_table *pt, uint64_t base, uint64_t start,
		uint64_t end, walk_pte_fn_t *funcp, void *args)
{
	int six;
	int eix;
	int ret;
	int i;
	int error;
	uint64_t off;

	six = (start <= base) ? 0 : ((start - base) >> PTL1_SHIFT);
	eix = ((end == 0) || ((base + PTL2_SIZE) <= end)) ? PT_ENTRIES
		: (((end - base) + (PTL1_SIZE - 1)) >> PTL1_SHIFT);

	ret = -ENOENT;
	for (i = six; i < eix; ++i) {
		off = i * PTL1_SIZE;
		error = (*funcp)(args, &pt->entry[i], base+off, start, end);
		if (!error) {
			ret = 0;
		} else if (error != -ENOENT) {
			ret = error;
			break;
		}
	}

	return ret;
}

static int walk_pte_l2(struct page_table *pt, uint64_t base, uint64_t start,
		uint64_t end, walk_pte_fn_t *funcp, void *args)
{
	int six;
	int eix;
	int ret;
	int i;
	int error;
	uint64_t off;

	six = (start <= base) ? 0 : ((start - base) >> PTL2_SHIFT);
	eix = ((end == 0) || ((base + PTL3_SIZE) <= end)) ? PT_ENTRIES
		: (((end - base) + (PTL2_SIZE - 1)) >> PTL2_SHIFT);

	ret = -ENOENT;
	for (i = six; i < eix; ++i) {
		off = i * PTL2_SIZE;
		error = (*funcp)(args, &pt->entry[i], base+off, start, end);
		if (!error) {
			ret = 0;
		} else if (error != -ENOENT) {
			ret = error;
			break;
		}
	}

	return ret;
}

static int walk_pte_l3(struct page_table *pt, uint64_t base, uint64_t start,
		uint64_t end, walk_pte_fn_t *funcp, void *args)
{
	int six;
	int eix;
	int ret;
	int i;
	int error;
	uint64_t off;

	six = (start <= base) ? 0 : ((start - base) >> PTL3_SHIFT);
	eix = ((end == 0) || ((base + PTL4_SIZE) <= end)) ? PT_ENTRIES
		: (((end - base) + (PTL3_SIZE - 1)) >> PTL3_SHIFT);

	ret = -ENOENT;
	for (i = six; i < eix; ++i) {
		off = i * PTL3_SIZE;
		error = (*funcp)(args, &pt->entry[i], base+off, start, end);
		if (!error) {
			ret = 0;
		} else if (error != -ENOENT) {
			ret = error;
			break;
		}
	}

	return ret;
}

static int walk_pte_l4(struct page_table *pt, uint64_t base, uint64_t start,
		uint64_t end, walk_pte_fn_t *funcp, void *args)
{
	int six;
	int eix;
	int ret;
	int i;
	int error;
	uint64_t off;

	six = (start <= base) ? 0 : ((start - base) >> PTL4_SHIFT);
	eix = (end == 0) ? PT_ENTRIES
		: (((end - base) + (PTL4_SIZE - 1)) >> PTL4_SHIFT);

	ret = -ENOENT;
	for (i = six; i < eix; ++i) {
		off = i * PTL4_SIZE;
		error = (*funcp)(args, &pt->entry[i], base+off, start, end);
		if (!error) {
			ret = 0;
		} else if (error != -ENOENT) {
			ret = error;
			break;
		}
	}

	return ret;
}

static int split_large_page(pte_t *ptep, size_t pgsize)
{
	struct page_table *pt;
	uintptr_t phys_base;
	int i;
	uintptr_t phys;
	struct page *page;
	pte_t pte;

	if ((pgsize != PTL3_SIZE) && (pgsize != PTL2_SIZE)) {
		ekprintf("split_large_page:invalid pgsize %#lx\n", pgsize);
		return -EINVAL;
	}

	pt = __alloc_new_pt(IHK_MC_AP_NOWAIT);
	if (pt == NULL) {
		ekprintf("split_large_page:__alloc_new_pt failed\n");
		return -ENOMEM;
	}

	pte = *ptep;
	if (pgsize == PTL2_SIZE) {
		/* break down to basic page size */
		pte &= ~PFL2_SIZE;
	}

	if (pte_is_fileoff(ptep, pgsize)) {
		phys_base = NOPHYS;
	} else {
		phys_base = pte_get_phys(ptep);
	}

	for (i = 0; i < PT_ENTRIES; ++i) {
		if (phys_base != NOPHYS) {
			phys = phys_base + (i * pgsize / PT_ENTRIES);
			page = phys_to_page(phys);
			if (page) {
				page_map(page);
			}
		}
		pt->entry[i] = pte;
		pte += pgsize / PT_ENTRIES;
	}

	*ptep = (virt_to_phys(pt) & PT_PHYSMASK) | PFL2_PDIR_ATTR;

	/* Do not do this check for large pages as they don't come from the
	 * zeroobj and are not actually mapped.
	 * TODO: clean up zeroobj as we don't really need it, anonymous
	 * mappings should be allocated for real */
	if (pgsize != PTL2_SIZE) {
		if (phys_base != NOPHYS) {
			page = phys_to_page(phys_base);
			if (pgsize != PTL2_SIZE && page && page_unmap(page)) {
				kprintf("split_large_page:page_unmap:%p\n",
						page);
				panic("split_large_page:page_unmap\n");
			}
		}
	}

	return 0;
}
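/*
 * Note on the split loop above: the child PTEs are derived by stepping
 * the parent's pte value by pgsize / PT_ENTRIES (1 GiB parent -> 2 MiB
 * steps, 2 MiB parent -> 4 KiB steps), so every attribute bit is
 * inherited verbatim and only the physical-address field advances. A
 * 1 GiB parent keeps the PS (size) bit, so its children are valid 2 MiB
 * L2 entries; a 2 MiB parent has PFL2_SIZE cleared first, so its
 * children are valid 4 KiB L1 entries.
 */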
struct visit_pte_args {
	page_table_t pt;
	enum visit_pte_flag flags;
	int pgshift;
	pte_visitor_t *funcp;
	void *arg;
};

static int visit_pte_l1(void *arg0, pte_t *ptep, uintptr_t base,
		uintptr_t start, uintptr_t end)
{
	struct visit_pte_args *args = arg0;

	if ((*ptep == PTE_NULL) && (args->flags & VPTEF_SKIP_NULL)) {
		return 0;
	}

	return (*args->funcp)(args->arg, args->pt, ptep, (void *)base,
			PTL1_SHIFT);
}

static int visit_pte_l2(void *arg0, pte_t *ptep, uintptr_t base,
		uintptr_t start, uintptr_t end)
{
	int error;
	struct visit_pte_args *args = arg0;
	struct page_table *pt;

	if ((*ptep == PTE_NULL) && (args->flags & VPTEF_SKIP_NULL)) {
		return 0;
	}

	if (((*ptep == PTE_NULL) || (*ptep & PFL2_SIZE))
			&& (start <= base)
			&& (((base + PTL2_SIZE) <= end) || (end == 0))
			&& (!args->pgshift || (args->pgshift == PTL2_SHIFT))) {
		error = (*args->funcp)(args->arg, args->pt, ptep,
				(void *)base, PTL2_SHIFT);
		if (error != -E2BIG) {
			return error;
		}
	}

	if (*ptep & PFL2_SIZE) {
		ekprintf("visit_pte_l2:split large page\n");
		return -ENOMEM;
	}

	if (*ptep == PTE_NULL) {
		pt = __alloc_new_pt(IHK_MC_AP_NOWAIT);
		if (!pt) {
			return -ENOMEM;
		}
		*ptep = virt_to_phys(pt) | PFL2_PDIR_ATTR;
	} else {
		pt = phys_to_virt(*ptep & PT_PHYSMASK);
	}

	error = walk_pte_l1(pt, base, start, end, &visit_pte_l1, arg0);
	return error;
}

static int visit_pte_l3(void *arg0, pte_t *ptep, uintptr_t base,
		uintptr_t start, uintptr_t end)
{
	int error;
	struct visit_pte_args *args = arg0;
	struct page_table *pt;

	if ((*ptep == PTE_NULL) && (args->flags & VPTEF_SKIP_NULL)) {
		return 0;
	}

	if (((*ptep == PTE_NULL) || (*ptep & PFL3_SIZE))
			&& (start <= base)
			&& (((base + PTL3_SIZE) <= end) || (end == 0))
			&& (!args->pgshift || (args->pgshift == PTL3_SHIFT))
			&& use_1gb_page) {
		error = (*args->funcp)(args->arg, args->pt, ptep,
				(void *)base, PTL3_SHIFT);
		if (error != -E2BIG) {
			return error;
		}
	}

	if (*ptep & PFL3_SIZE) {
		ekprintf("visit_pte_l3:split large page\n");
		return -ENOMEM;
	}

	if (*ptep == PTE_NULL) {
		pt = __alloc_new_pt(IHK_MC_AP_NOWAIT);
		if (!pt) {
			return -ENOMEM;
		}
		*ptep = virt_to_phys(pt) | PFL3_PDIR_ATTR;
	} else {
		pt = phys_to_virt(*ptep & PT_PHYSMASK);
	}

	error = walk_pte_l2(pt, base, start, end, &visit_pte_l2, arg0);
	return error;
}

static int visit_pte_l4(void *arg0, pte_t *ptep, uintptr_t base,
		uintptr_t start, uintptr_t end)
{
	int error;
	struct visit_pte_args *args = arg0;
	struct page_table *pt;

	if ((*ptep == PTE_NULL) && (args->flags & VPTEF_SKIP_NULL)) {
		return 0;
	}

	if (*ptep == PTE_NULL) {
		pt = __alloc_new_pt(IHK_MC_AP_NOWAIT);
		if (!pt) {
			return -ENOMEM;
		}
		*ptep = virt_to_phys(pt) | PFL4_PDIR_ATTR;
	} else {
		pt = phys_to_virt(*ptep & PT_PHYSMASK);
	}

	error = walk_pte_l3(pt, base, start, end, &visit_pte_l3, arg0);
	return error;
}

int visit_pte_range(page_table_t pt, void *start0, void *end0, int pgshift,
		enum visit_pte_flag flags, pte_visitor_t *funcp, void *arg)
{
	const uintptr_t start = (uintptr_t)start0;
	const uintptr_t end = (uintptr_t)end0;
	struct visit_pte_args args;

	args.pt = pt;
	args.flags = flags;
	args.funcp = funcp;
	args.arg = arg;
	args.pgshift = pgshift;

	return walk_pte_l4(pt, 0, start, end, &visit_pte_l4, &args);
}
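/*
 * Disabled sketch of a pte_visitor_t callback for visit_pte_range()
 * above. The walker invokes the visitor once per PTE at the largest
 * level it can, and a visitor may return -E2BIG to ask for the range to
 * be revisited with smaller pages. The counting visitor below is
 * hypothetical.
 */
#if 0
static int count_present_pte(void *arg, page_table_t pt, pte_t *ptep,
		void *pgaddr, int pgshift)
{
	int *count = arg;

	if (*ptep != PTE_NULL) {
		(*count)++;
	}
	return 0;
}

static int count_range(page_table_t pt, void *start, void *end)
{
	int count = 0;
	int error;

	error = visit_pte_range(pt, start, end, 0, VPTEF_SKIP_NULL,
			&count_present_pte, &count);
	return (error) ? error : count;
}
#endif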
struct clear_range_args {
	int free_physical;
	uint8_t padding[4];
	struct memobj *memobj;
	struct process_vm *vm;
};

static int clear_range_l1(void *args0, pte_t *ptep, uint64_t base,
		uint64_t start, uint64_t end)
{
	struct clear_range_args *args = args0;
	uint64_t phys;
	struct page *page;
	pte_t old;

	if (*ptep == PTE_NULL) {
		return -ENOENT;
	}

	old = xchg(ptep, PTE_NULL);
	remote_flush_tlb_cpumask(args->vm, base, ihk_mc_get_processor_id());

	page = NULL;
	if (!pte_is_fileoff(&old, PTL1_SIZE)) {
		phys = pte_get_phys(&old);
		page = phys_to_page(phys);
	}

	if (page && page_is_in_memobj(page) && (old & PFL1_DIRTY)) {
		memobj_flush_page(args->memobj, phys, PTL1_SIZE);
	}

	if (!(old & PFL1_FILEOFF) && args->free_physical) {
		if (!page || (page && page_unmap(page))) {
			ihk_mc_free_pages(phys_to_virt(phys), 1);
			dkprintf("%s: freeing regular page at 0x%lx\n",
					__FUNCTION__, base);
		}
		args->vm->currss -= PTL1_SIZE;
	}

	return 0;
}

static int clear_range_l2(void *args0, pte_t *ptep, uint64_t base,
		uint64_t start, uint64_t end)
{
	struct clear_range_args *args = args0;
	uint64_t phys;
	struct page_table *pt;
	int error;
	struct page *page;
	pte_t old;

	if (*ptep == PTE_NULL) {
		return -ENOENT;
	}

	if ((*ptep & PFL2_SIZE)
			&& ((base < start) || (end < (base + PTL2_SIZE)))) {
		error = -EINVAL;
		ekprintf("clear_range_l2(%p,%p,%lx,%lx,%lx):"
				"split page. %d\n",
				args0, ptep, base, start, end, error);
		return error;
	}

	if (*ptep & PFL2_SIZE) {
		old = xchg(ptep, PTE_NULL);
		remote_flush_tlb_cpumask(args->vm, base,
				ihk_mc_get_processor_id());

		page = NULL;
		if (!pte_is_fileoff(&old, PTL2_SIZE)) {
			phys = pte_get_phys(&old);
			page = phys_to_page(phys);
		}

		if (page && page_is_in_memobj(page) && (old & PFL2_DIRTY)) {
			memobj_flush_page(args->memobj, phys, PTL2_SIZE);
		}

		if (!(old & PFL2_FILEOFF) && args->free_physical) {
			if (!page || (page && page_unmap(page))) {
				ihk_mc_free_pages(phys_to_virt(phys),
						PTL2_SIZE/PTL1_SIZE);
				dkprintf("%s: freeing large page at 0x%lx\n",
						__FUNCTION__, base);
			}
			args->vm->currss -= PTL2_SIZE;
		}

		return 0;
	}

	pt = phys_to_virt(*ptep & PT_PHYSMASK);
	error = walk_pte_l1(pt, base, start, end, &clear_range_l1, args0);
	if (error && (error != -ENOENT)) {
		return error;
	}

	if ((start <= base) && ((base + PTL2_SIZE) <= end)) {
		*ptep = PTE_NULL;
		remote_flush_tlb_cpumask(args->vm, base,
				ihk_mc_get_processor_id());
		ihk_mc_free_pages(pt, 1);
	}

	return 0;
}
static int clear_range_l3(void *args0, pte_t *ptep, uint64_t base,
		uint64_t start, uint64_t end)
{
	struct clear_range_args *args = args0;
	int error;
	uint64_t phys;
	pte_t old;
	struct page *page;
	struct page_table *pt;

	if (*ptep == PTE_NULL) {
		return -ENOENT;
	}

	if ((*ptep & PFL3_SIZE)
			&& ((base < start) || (end < (base + PTL3_SIZE)))) {
		error = -EINVAL;
		ekprintf("clear_range_l3(%p,%p,%lx,%lx,%lx):"
				"split page. %d\n",
				args0, ptep, base, start, end, error);
		return error;
	}

	if (*ptep & PFL3_SIZE) {
		old = xchg(ptep, PTE_NULL);
		remote_flush_tlb_cpumask(args->vm, base,
				ihk_mc_get_processor_id());

		page = NULL;
		if (!pte_is_fileoff(&old, PTL3_SIZE)) {
			phys = pte_get_phys(&old);
			page = phys_to_page(phys);
		}

		if (page && page_is_in_memobj(page) && (old & PFL3_DIRTY)) {
			memobj_flush_page(args->memobj, phys, PTL3_SIZE);
		}

		if (!(old & PFL3_FILEOFF) && args->free_physical) {
			if (!page || (page && page_unmap(page))) {
				ihk_mc_free_pages(phys_to_virt(phys),
						PTL3_SIZE/PTL1_SIZE);
			}
			args->vm->currss -= PTL3_SIZE;
		}

		return 0;
	}

	pt = phys_to_virt(*ptep & PT_PHYSMASK);
	error = walk_pte_l2(pt, base, start, end, &clear_range_l2, args0);
	if (error && (error != -ENOENT)) {
		return error;
	}

	if (use_1gb_page && (start <= base) && ((base + PTL3_SIZE) <= end)) {
		*ptep = PTE_NULL;
		remote_flush_tlb_cpumask(args->vm, base,
				ihk_mc_get_processor_id());
		ihk_mc_free_pages(pt, 1);
	}

	return 0;
}

static int clear_range_l4(void *args0, pte_t *ptep, uint64_t base,
		uint64_t start, uint64_t end)
{
	struct page_table *pt;

	if (*ptep == PTE_NULL) {
		return -ENOENT;
	}

	pt = phys_to_virt(*ptep & PT_PHYSMASK);
	return walk_pte_l3(pt, base, start, end, &clear_range_l3, args0);
}

static int clear_range(struct page_table *pt, struct process_vm *vm,
		uintptr_t start, uintptr_t end, int free_physical,
		struct memobj *memobj)
{
	int error;
	struct clear_range_args args;

	if ((start < vm->region.user_start)
			|| (vm->region.user_end < end)
			|| (end <= start)) {
		ekprintf("clear_range(%p,%p,%p,%x):"
				"invalid start and/or end.\n",
				pt, start, end, free_physical);
		return -EINVAL;
	}

	args.free_physical = free_physical;
	args.memobj = memobj;
	args.vm = vm;

	error = walk_pte_l4(pt, 0, start, end, &clear_range_l4, &args);
	return error;
}

int ihk_mc_pt_clear_range(page_table_t pt, struct process_vm *vm,
		void *start, void *end)
{
#define	KEEP_PHYSICAL	0
	return clear_range(pt, vm, (uintptr_t)start, (uintptr_t)end,
			KEEP_PHYSICAL, NULL);
}

int ihk_mc_pt_free_range(page_table_t pt, struct process_vm *vm,
		void *start, void *end, struct memobj *memobj)
{
#define	FREE_PHYSICAL	1
	return clear_range(pt, vm, (uintptr_t)start, (uintptr_t)end,
			FREE_PHYSICAL, memobj);
}

struct change_attr_args {
	pte_t clrpte;
	pte_t setpte;
};

static int change_attr_range_l1(void *arg0, pte_t *ptep, uint64_t base,
		uint64_t start, uint64_t end)
{
	struct change_attr_args *args = arg0;

	if ((*ptep == PTE_NULL) || (*ptep & PFL1_FILEOFF)) {
		return -ENOENT;
	}

	*ptep = (*ptep & ~args->clrpte) | args->setpte;
	return 0;
}
static int change_attr_range_l2(void *arg0, pte_t *ptep, uint64_t base,
		uint64_t start, uint64_t end)
{
	struct change_attr_args *args = arg0;
	int error;
	struct page_table *pt;

	if ((*ptep == PTE_NULL) || (*ptep & PFL2_FILEOFF)) {
		return -ENOENT;
	}

	if ((*ptep & PFL2_SIZE)
			&& ((base < start) || (end < (base + PTL2_SIZE)))) {
		error = -EINVAL;
		ekprintf("change_attr_range_l2(%p,%p,%lx,%lx,%lx):"
				"split page. %d\n",
				arg0, ptep, base, start, end, error);
		return error;
	}

	if (*ptep & PFL2_SIZE) {
		if (!(*ptep & PFL2_FILEOFF)) {
			*ptep = (*ptep & ~args->clrpte) | args->setpte;
		}
		return 0;
	}

	pt = phys_to_virt(*ptep & PT_PHYSMASK);
	return walk_pte_l1(pt, base, start, end, &change_attr_range_l1, arg0);
}

static int change_attr_range_l3(void *arg0, pte_t *ptep, uint64_t base,
		uint64_t start, uint64_t end)
{
	struct change_attr_args *args = arg0;
	int error;
	struct page_table *pt;

	if ((*ptep == PTE_NULL) || (*ptep & PFL3_FILEOFF)) {
		return -ENOENT;
	}

	if ((*ptep & PFL3_SIZE)
			&& ((base < start) || (end < (base + PTL3_SIZE)))) {
		error = -EINVAL;
		ekprintf("change_attr_range_l3(%p,%p,%lx,%lx,%lx):"
				"split page. %d\n",
				arg0, ptep, base, start, end, error);
		return error;
	}

	if (*ptep & PFL3_SIZE) {
		if (!(*ptep & PFL3_FILEOFF)) {
			*ptep = (*ptep & ~args->clrpte) | args->setpte;
		}
		return 0;
	}

	pt = phys_to_virt(*ptep & PT_PHYSMASK);
	return walk_pte_l2(pt, base, start, end, &change_attr_range_l2, arg0);
}

static int change_attr_range_l4(void *arg0, pte_t *ptep, uint64_t base,
		uint64_t start, uint64_t end)
{
	struct page_table *pt;

	if (*ptep == PTE_NULL) {
		return -ENOENT;
	}

	pt = phys_to_virt(*ptep & PT_PHYSMASK);
	return walk_pte_l3(pt, base, start, end, &change_attr_range_l3, arg0);
}

int ihk_mc_pt_change_attr_range(page_table_t pt, void *start0, void *end0,
		enum ihk_mc_pt_attribute clrattr,
		enum ihk_mc_pt_attribute setattr)
{
	const intptr_t start = (intptr_t)start0;
	const intptr_t end = (intptr_t)end0;
	struct change_attr_args args;

	args.clrpte = attr_to_l1attr(clrattr);
	args.setpte = attr_to_l1attr(setattr);
	return walk_pte_l4(pt, 0, start, end, &change_attr_range_l4, &args);
}
static pte_t *lookup_pte(struct page_table *pt, uintptr_t virt, int pgshift,
		uintptr_t *basep, size_t *sizep, int *p2alignp)
{
	int l4idx, l3idx, l2idx, l1idx;
	pte_t *ptep;
	uintptr_t base;
	size_t size;
	int p2align;

	GET_VIRT_INDICES(virt, l4idx, l3idx, l2idx, l1idx);

	ptep = NULL;
	if (!pgshift) {
		pgshift = (use_1gb_page) ? PTL3_SHIFT : PTL2_SHIFT;
	}

	if (pt->entry[l4idx] == PTE_NULL) {
		if (pgshift > PTL3_SHIFT) {
			pgshift = PTL3_SHIFT;
		}
		goto out;
	}
	pt = phys_to_virt(pte_get_phys(&pt->entry[l4idx]));

	if ((pt->entry[l3idx] == PTE_NULL) || (pt->entry[l3idx] & PFL3_SIZE)) {
		if (pgshift >= PTL3_SHIFT) {
			ptep = &pt->entry[l3idx];
			pgshift = PTL3_SHIFT;
		}
		goto out;
	}
	pt = phys_to_virt(pte_get_phys(&pt->entry[l3idx]));

	if ((pt->entry[l2idx] == PTE_NULL) || (pt->entry[l2idx] & PFL2_SIZE)) {
		if (pgshift >= PTL2_SHIFT) {
			ptep = &pt->entry[l2idx];
			pgshift = PTL2_SHIFT;
		}
		goto out;
	}
	pt = phys_to_virt(pte_get_phys(&pt->entry[l2idx]));

	ptep = &pt->entry[l1idx];
	pgshift = PTL1_SHIFT;

out:
	size = (size_t)1 << pgshift;
	base = virt & ~(size - 1);
	p2align = pgshift - PAGE_SHIFT;
	if (basep) *basep = base;
	if (sizep) *sizep = size;
	if (p2alignp) *p2alignp = p2align;
	return ptep;
}

pte_t *ihk_mc_pt_lookup_pte(page_table_t pt, void *virt, int pgshift,
		void **basep, size_t *sizep, int *p2alignp)
{
	pte_t *ptep;
	uintptr_t base;
	size_t size;
	int p2align;

	dkprintf("ihk_mc_pt_lookup_pte(%p,%p,%d)\n", pt, virt, pgshift);
	ptep = lookup_pte(pt, (uintptr_t)virt, pgshift, &base, &size,
			&p2align);
	if (basep) *basep = (void *)base;
	if (sizep) *sizep = size;
	if (p2alignp) *p2alignp = p2align;
	dkprintf("ihk_mc_pt_lookup_pte(%p,%p,%d): %p %lx %lx %d\n",
			pt, virt, pgshift, ptep, base, size, p2align);
	return ptep;
}
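/*
 * Disabled sketch: probing the mapping granularity of an address with
 * ihk_mc_pt_lookup_pte() above. pgshift == 0 lets the walker report
 * whatever level terminates the walk; base/size describe the aligned
 * window that the returned PTE covers. The wrapper is hypothetical.
 */
#if 0
static size_t mapping_page_size(page_table_t pt, void *addr)
{
	pte_t *ptep;
	void *base;
	size_t size;

	ptep = ihk_mc_pt_lookup_pte(pt, addr, 0, &base, &size, NULL);
	if (!ptep || pte_is_null(ptep)) {
		return 0;	/* not mapped */
	}
	return size;	/* PTL1_SIZE, PTL2_SIZE or PTL3_SIZE */
}
#endif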
struct set_range_args {
	page_table_t pt;
	uintptr_t phys;
	enum ihk_mc_pt_attribute attr;
	int pgshift;
	uintptr_t diff;
	struct process_vm *vm;
};

int set_range_l1(void *args0, pte_t *ptep, uintptr_t base, uintptr_t start,
		uintptr_t end)
{
	struct set_range_args *args = args0;
	int error;
	uintptr_t phys;

	dkprintf("set_range_l1(%lx,%lx,%lx)\n", base, start, end);

	if (*ptep != PTE_NULL) {
		error = -EBUSY;
		ekprintf("set_range_l1(%lx,%lx,%lx):page exists. %d %lx\n",
				base, start, end, error, *ptep);
		(void)clear_range(args->pt, args->vm, start, base,
				KEEP_PHYSICAL, NULL);
		goto out;
	}

	phys = args->phys + (base - start);
	*ptep = phys | attr_to_l1attr(args->attr);

	error = 0;
out:
	dkprintf("set_range_l1(%lx,%lx,%lx): %d %lx\n",
			base, start, end, error, *ptep);
	return error;
}

int set_range_l2(void *args0, pte_t *ptep, uintptr_t base, uintptr_t start,
		uintptr_t end)
{
	struct set_range_args *args = args0;
	int error;
	struct page_table *pt;
	uintptr_t phys;
	struct page_table *newpt = NULL;
	pte_t pte;

	dkprintf("set_range_l2(%lx,%lx,%lx)\n", base, start, end);

retry:
	if (*ptep == PTE_NULL) {
		if ((start <= base) && ((base + PTL2_SIZE) <= end)
				&& ((args->diff & (PTL2_SIZE - 1)) == 0)
				&& (!args->pgshift
					|| (args->pgshift == PTL2_SHIFT))) {
			phys = args->phys + (base - start);
			*ptep = phys | attr_to_l2attr(
					args->attr|PTATTR_LARGEPAGE);
			error = 0;
			dkprintf("set_range_l2(%lx,%lx,%lx):"
					"2MiB page. %d %lx\n",
					base, start, end, error, *ptep);
			goto out;
		}

		if (!newpt) {
			newpt = __alloc_new_pt(IHK_MC_AP_NOWAIT);
			if (newpt == NULL) {
				error = -ENOMEM;
				ekprintf("set_range_l2(%lx,%lx,%lx):"
						"__alloc_new_pt failed."
						" %d %lx\n",
						base, start, end, error,
						*ptep);
				(void)clear_range(args->pt, args->vm, start,
						base, KEEP_PHYSICAL, NULL);
				goto out;
			}
		}

		pte = virt_to_phys(newpt) | PFL2_PDIR_ATTR;
		pte = atomic_cmpxchg8(ptep, PTE_NULL, pte);
		if (pte != PTE_NULL) {
			/* failed to set PDTe */
			goto retry;
		}

		pt = newpt;
		newpt = NULL;
	} else if (*ptep & PFL2_SIZE) {
		error = -EBUSY;
		ekprintf("set_range_l2(%lx,%lx,%lx):"
				"page exists. %d %lx\n",
				base, start, end, error, *ptep);
		(void)clear_range(args->pt, args->vm, start, base,
				KEEP_PHYSICAL, NULL);
		goto out;
	} else {
		pt = phys_to_virt(*ptep & PT_PHYSMASK);
	}

	error = walk_pte_l1(pt, base, start, end, &set_range_l1, args0);
	if (error) {
		ekprintf("set_range_l2(%lx,%lx,%lx):"
				"walk_pte_l1 failed. %d %lx\n",
				base, start, end, error, *ptep);
		goto out;
	}

	error = 0;
out:
	if (newpt) {
		ihk_mc_free_pages(newpt, 1);
	}
	dkprintf("set_range_l2(%lx,%lx,%lx): %d %lx\n",
			base, start, end, error, *ptep);
	return error;
}

int set_range_l3(void *args0, pte_t *ptep, uintptr_t base, uintptr_t start,
		uintptr_t end)
{
	struct page_table *newpt = NULL;
	pte_t pte;
	struct page_table *pt;
	int error;
	struct set_range_args *args = args0;
	uintptr_t phys;

	dkprintf("set_range_l3(%lx,%lx,%lx)\n", base, start, end);

retry:
	if (*ptep == PTE_NULL) {
		if ((start <= base) && ((base + PTL3_SIZE) <= end)
				&& ((args->diff & (PTL3_SIZE - 1)) == 0)
				&& (!args->pgshift
					|| (args->pgshift == PTL3_SHIFT))
				&& use_1gb_page) {
			phys = args->phys + (base - start);
			*ptep = phys | attr_to_l3attr(
					args->attr|PTATTR_LARGEPAGE);
			error = 0;
			dkprintf("set_range_l3(%lx,%lx,%lx):"
					"1GiB page. %d %lx\n",
					base, start, end, error, *ptep);
			goto out;
		}

		if (!newpt) {
			newpt = __alloc_new_pt(IHK_MC_AP_NOWAIT);
			if (newpt == NULL) {
				error = -ENOMEM;
				ekprintf("set_range_l3(%lx,%lx,%lx):"
						"__alloc_new_pt failed."
						" %d %lx\n",
						base, start, end, error,
						*ptep);
				(void)clear_range(args->pt, args->vm, start,
						base, KEEP_PHYSICAL, NULL);
				goto out;
			}
		}

		pte = virt_to_phys(newpt) | PFL3_PDIR_ATTR;
		pte = atomic_cmpxchg8(ptep, PTE_NULL, pte);
		if (pte != PTE_NULL) {
			/* failed to set PDPTe */
			goto retry;
		}

		pt = newpt;
		newpt = NULL;
	} else if (*ptep & PFL3_SIZE) {
		error = -EBUSY;
		ekprintf("set_range_l3(%lx,%lx,%lx):"
				"page exists. %d %lx\n",
				base, start, end, error, *ptep);
		(void)clear_range(args->pt, args->vm, start, base,
				KEEP_PHYSICAL, NULL);
		goto out;
	} else {
		pt = phys_to_virt(*ptep & PT_PHYSMASK);
	}

	error = walk_pte_l2(pt, base, start, end, &set_range_l2, args0);
	if (error) {
		ekprintf("set_range_l3(%lx,%lx,%lx):"
				"walk_pte_l2 failed. %d %lx\n",
				base, start, end, error, *ptep);
		goto out;
	}

	error = 0;
out:
	if (newpt) {
		ihk_mc_free_pages(newpt, 1);
	}
	dkprintf("set_range_l3(%lx,%lx,%lx): %d %lx\n",
			base, start, end, error, *ptep);
	return error;
}
int set_range_l4(void *args0, pte_t *ptep, uintptr_t base, uintptr_t start,
		uintptr_t end)
{
	struct set_range_args *args = args0;
	struct page_table *newpt = NULL;
	pte_t pte;
	struct page_table *pt;
	int error;

	dkprintf("set_range_l4(%lx,%lx,%lx)\n", base, start, end);

retry:
	if (*ptep == PTE_NULL) {
		if (!newpt) {
			newpt = __alloc_new_pt(IHK_MC_AP_NOWAIT);
			if (newpt == NULL) {
				error = -ENOMEM;
				ekprintf("set_range_l4(%lx,%lx,%lx):"
						"__alloc_new_pt failed."
						" %d %lx\n",
						base, start, end, error,
						*ptep);
				(void)clear_range(args->pt, args->vm, start,
						base, KEEP_PHYSICAL, NULL);
				goto out;
			}
		}

		pte = virt_to_phys(newpt) | PFL4_PDIR_ATTR;
		pte = atomic_cmpxchg8(ptep, PTE_NULL, pte);
		if (pte != PTE_NULL) {
			/* failed to set PML4e */
			goto retry;
		}

		pt = newpt;
		newpt = NULL;
	} else {
		pt = phys_to_virt(*ptep & PT_PHYSMASK);
	}

	error = walk_pte_l3(pt, base, start, end, &set_range_l3, args0);
	if (error) {
		ekprintf("set_range_l4(%lx,%lx,%lx):"
				"walk_pte_l3 failed. %d %lx\n",
				base, start, end, error, *ptep);
		goto out;
	}

	error = 0;
out:
	if (newpt) {
		ihk_mc_free_pages(newpt, 1);
	}
	dkprintf("set_range_l4(%lx,%lx,%lx): %d %lx\n",
			base, start, end, error, *ptep);
	return error;
}

int ihk_mc_pt_set_range(page_table_t pt, struct process_vm *vm, void *start,
		void *end, uintptr_t phys, enum ihk_mc_pt_attribute attr,
		int pgshift)
{
	int error;
	struct set_range_args args;

	dkprintf("ihk_mc_pt_set_range(%p,%p,%p,%lx,%x)\n",
			pt, start, end, phys, attr);

	args.pt = pt;
	args.phys = phys;
	args.attr = attr;
	args.diff = (uintptr_t)start ^ phys;
	args.vm = vm;
	args.pgshift = pgshift;

	error = walk_pte_l4(pt, 0, (uintptr_t)start, (uintptr_t)end,
			&set_range_l4, &args);
	if (error) {
		ekprintf("ihk_mc_pt_set_range(%p,%p,%p,%lx,%x):"
				"walk_pte_l4 failed. %d\n",
				pt, start, end, phys, attr, error);
		goto out;
	}

	error = 0;
out:
	dkprintf("ihk_mc_pt_set_range(%p,%p,%p,%lx,%x): %d\n",
			pt, start, end, phys, attr, error);
	return error;
}

int ihk_mc_pt_set_pte(page_table_t pt, pte_t *ptep, size_t pgsize,
		uintptr_t phys, enum ihk_mc_pt_attribute attr)
{
	int error;

	dkprintf("ihk_mc_pt_set_pte(%p,%p,%lx,%lx,%x)\n",
			pt, ptep, pgsize, phys, attr);

	if (pgsize == PTL1_SIZE) {
		*ptep = phys | attr_to_l1attr(attr);
	} else if (pgsize == PTL2_SIZE) {
		*ptep = phys | attr_to_l2attr(attr | PTATTR_LARGEPAGE);
	} else if ((pgsize == PTL3_SIZE) && (use_1gb_page)) {
		*ptep = phys | attr_to_l3attr(attr | PTATTR_LARGEPAGE);
	} else {
		error = -EINVAL;
		ekprintf("ihk_mc_pt_set_pte(%p,%p,%lx,%lx,%x):"
				"page size. %d %lx\n",
				pt, ptep, pgsize, phys, attr, error, *ptep);
		panic("ihk_mc_pt_set_pte:page size");
		goto out;
	}

	error = 0;
out:
	dkprintf("ihk_mc_pt_set_pte(%p,%p,%lx,%lx,%x): %d %lx\n",
			pt, ptep, pgsize, phys, attr, error, *ptep);
	return error;
}
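/*
 * Disabled sketch: establishing a mapping with ihk_mc_pt_set_range()
 * above. The walker opportunistically installs 2 MiB (and, with
 * use_1gb_page, 1 GiB) entries whenever the range bounds and args->diff
 * (start ^ phys) have the required alignment; pgshift == 0 means "any
 * size". The values below are hypothetical.
 */
#if 0
static int map_example(page_table_t pt, struct process_vm *vm,
		void *va, uintptr_t pa)
{
	/* Map 4 MiB writable for user space; large pages are used when
	 * va and pa share their low 21 bits. */
	return ihk_mc_pt_set_range(pt, vm, va, (char *)va + (4UL << 20),
			pa, PTATTR_WRITABLE | PTATTR_USER | PTATTR_ACTIVE,
			0);
}
#endif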
int ihk_mc_pt_split(page_table_t pt, struct process_vm *vm, void *addr)
{
	int error;
	pte_t *ptep;
	void *pgaddr;
	size_t pgsize;
	intptr_t phys;
	struct page *page;

retry:
	ptep = ihk_mc_pt_lookup_pte(pt, addr, 0, &pgaddr, &pgsize, NULL);
	if (ptep && !pte_is_null(ptep) && (pgaddr != addr)) {
		page = NULL;
		if (!pte_is_fileoff(ptep, pgsize)) {
			phys = pte_get_phys(ptep);
			page = phys_to_page(phys);
		}
		if (page && (page_is_in_memobj(page)
					|| page_is_multi_mapped(page))) {
			error = -EINVAL;
			kprintf("ihk_mc_pt_split:NYI:page break down\n");
			goto out;
		}

		error = split_large_page(ptep, pgsize);
		if (error) {
			kprintf("ihk_mc_pt_split:split_large_page failed."
					" %d\n", error);
			goto out;
		}
		remote_flush_tlb_cpumask(vm, (intptr_t)pgaddr,
				ihk_mc_get_processor_id());
		goto retry;
	}

	error = 0;
out:
	return error;
} /* ihk_mc_pt_split() */

int arch_get_smaller_page_size(void *args, size_t cursize, size_t *newsizep,
		int *p2alignp)
{
	size_t newsize;
	int p2align;
	int error;

	if (0) {
		/* dummy */
		panic("not reached");
	} else if ((cursize > PTL3_SIZE) && use_1gb_page) {
		/* 1GiB */
		newsize = PTL3_SIZE;
		p2align = PTL3_SHIFT - PTL1_SHIFT;
	} else if (cursize > PTL2_SIZE) {
		/* 2MiB */
		newsize = PTL2_SIZE;
		p2align = PTL2_SHIFT - PTL1_SHIFT;
	} else if (cursize > PTL1_SIZE) {
		/* 4KiB : basic page size */
		newsize = PTL1_SIZE;
		p2align = PTL1_SHIFT - PTL1_SHIFT;
	} else {
		error = -ENOMEM;
		newsize = 0;
		p2align = -1;
		goto out;
	}

	error = 0;
	if (newsizep) *newsizep = newsize;
	if (p2alignp) *p2alignp = p2align;

out:
	dkprintf("arch_get_smaller_page_size(%p,%lx): %d %lx %d\n",
			args, cursize, error, newsize, p2align);
	return error;
}

enum ihk_mc_pt_attribute arch_vrflag_to_ptattr(unsigned long flag,
		uint64_t fault, pte_t *ptep)
{
	enum ihk_mc_pt_attribute attr;

	attr = common_vrflag_to_ptattr(flag, fault, ptep);

	if ((fault & PF_PROT)
			|| ((fault & (PF_POPULATE | PF_PATCH))
				&& (flag & VR_PRIVATE))) {
		attr |= PTATTR_DIRTY;
	}
	return attr;
}

struct move_args {
	uintptr_t src;
	uintptr_t dest;
	struct process_vm *vm;
};
static int move_one_page(void *arg0, page_table_t pt, pte_t *ptep,
		void *pgaddr, int pgshift)
{
	int error;
	struct move_args *args = arg0;
	const size_t pgsize = (size_t)1 << pgshift;
	uintptr_t dest;
	pte_t apte;
	uintptr_t phys;
	enum ihk_mc_pt_attribute attr;

	dkprintf("move_one_page(%p,%p,%p %#lx,%p,%d)\n",
			arg0, pt, ptep, *ptep, pgaddr, pgshift);

	if (pte_is_fileoff(ptep, pgsize)) {
		error = -ENOTSUPP;
		kprintf("move_one_page(%p,%p,%p %#lx,%p,%d):fileoff. %d\n",
				arg0, pt, ptep, *ptep, pgaddr, pgshift,
				error);
		goto out;
	}

	dest = args->dest + ((uintptr_t)pgaddr - args->src);

	apte = PTE_NULL;
	pte_xchg(ptep, &apte);

	phys = apte & PT_PHYSMASK;
	attr = apte & ~PT_PHYSMASK;

	error = ihk_mc_pt_set_range(pt, args->vm, (void *)dest,
			(void *)(dest + pgsize), phys, attr, pgshift);
	if (error) {
		kprintf("move_one_page(%p,%p,%p %#lx,%p,%d):"
				"set failed. %d\n",
				arg0, pt, ptep, *ptep, pgaddr, pgshift,
				error);
		goto out;
	}

	error = 0;
out:
	dkprintf("move_one_page(%p,%p,%p %#lx,%p,%d):%d\n",
			arg0, pt, ptep, *ptep, pgaddr, pgshift, error);
	return error;
}

int move_pte_range(page_table_t pt, struct process_vm *vm,
		void *src, void *dest, size_t size)
{
	int error;
	struct move_args args;

	dkprintf("move_pte_range(%p,%p,%p,%#lx)\n", pt, src, dest, size);
	args.src = (uintptr_t)src;
	args.dest = (uintptr_t)dest;
	args.vm = vm;

	error = visit_pte_range(pt, src, src+size, 0, VPTEF_SKIP_NULL,
			&move_one_page, &args);
	flush_tlb();	/* XXX: TLB flush */
	if (error) {
		goto out;
	}

	error = 0;
out:
	dkprintf("move_pte_range(%p,%p,%p,%#lx):%d\n",
			pt, src, dest, size, error);
	return error;
}

void load_page_table(struct page_table *pt)
{
	unsigned long pt_addr;

	if (!pt) {
		pt = init_pt;
	}

	pt_addr = virt_to_phys(pt);

	asm volatile ("movq %0, %%cr3" : : "r"(pt_addr) : "memory");
}

void ihk_mc_load_page_table(struct page_table *pt)
{
	load_page_table(pt);
}

struct page_table *get_init_page_table(void)
{
	return init_pt;
}

static unsigned long fixed_virt;

static void init_fixed_area(struct page_table *pt)
{
	fixed_virt = MAP_FIXED_START;

	return;
}

void init_text_area(struct page_table *pt)
{
	unsigned long __end, phys, virt;
	int i, nlpages;

	__end = ((unsigned long)_end + LARGE_PAGE_SIZE * 2 - 1)
		& LARGE_PAGE_MASK;
	nlpages = (__end - MAP_KERNEL_START) >> LARGE_PAGE_SHIFT;

	kprintf("TEXT: # of large pages = %d\n", nlpages);
	kprintf("TEXT: Base address = %lx\n", x86_kernel_phys_base);

	phys = x86_kernel_phys_base;
	virt = MAP_KERNEL_START;
	for (i = 0; i < nlpages; i++) {
		set_pt_large_page(pt, (void *)virt, phys, PTATTR_WRITABLE);

		virt += LARGE_PAGE_SIZE;
		phys += LARGE_PAGE_SIZE;
	}
}

void *map_fixed_area(unsigned long phys, unsigned long size, int uncachable)
{
	unsigned long poffset, paligned;
	int i, npages;
	void *v = (void *)fixed_virt;
	enum ihk_mc_pt_attribute attr;

	poffset = phys & (PAGE_SIZE - 1);
	paligned = phys & PAGE_MASK;
	npages = (poffset + size + PAGE_SIZE - 1) >> PAGE_SHIFT;

	attr = PTATTR_WRITABLE | PTATTR_ACTIVE;
#if 0	/* In the case of LAPIC MMIO, something will happen */
	attr |= PTATTR_NO_EXECUTE;
#endif
	if (uncachable) {
		attr |= PTATTR_UNCACHABLE;
	}

	kprintf("map_fixed: %lx => %p (%d pages)\n", paligned, v, npages);

	for (i = 0; i < npages; i++) {
		if (__set_pt_page(init_pt, (void *)fixed_virt, paligned,
					attr)) {
			return NULL;
		}

		fixed_virt += PAGE_SIZE;
		paligned += PAGE_SIZE;
	}

	flush_tlb();

	return (char *)v + poffset;
}

void init_low_area(struct page_table *pt)
{
	set_pt_large_page(pt, 0, 0, PTATTR_NO_EXECUTE|PTATTR_WRITABLE);
}

static void init_vsyscall_area(struct page_table *pt)
{
	extern char vsyscall_page[];
	int error;

#define	VSYSCALL_ADDR	((void *)(0xffffffffff600000))
	error = __set_pt_page(pt, VSYSCALL_ADDR,
			virt_to_phys(vsyscall_page),
			PTATTR_ACTIVE|PTATTR_USER);
	if (error) {
		panic("init_vsyscall_area:__set_pt_page failed");
	}

	return;
}

void init_page_table(void)
{
	check_available_page_size();
	init_pt = ihk_mc_alloc_pages(1, IHK_MC_AP_CRITICAL);
	ihk_mc_spinlock_init(&init_pt_lock);

	memset(init_pt, 0, PAGE_SIZE);

	/* Normal memory area */
	init_normal_area(init_pt);
	init_fixed_area(init_pt);
	init_low_area(init_pt);
	init_text_area(init_pt);
	init_vsyscall_area(init_pt);

	load_page_table(init_pt);
	kprintf("Page table is now at %p\n", init_pt);
}

extern void __reserve_arch_pages(unsigned long, unsigned long,
		void (*)(struct ihk_page_allocator_desc *,
			unsigned long, unsigned long, int));

void ihk_mc_reserve_arch_pages(struct ihk_page_allocator_desc *pa_allocator,
		unsigned long start, unsigned long end,
		void (*cb)(struct ihk_page_allocator_desc *,
			unsigned long, unsigned long, int))
{
	/* Reserve text + temporary heap */
	cb(pa_allocator, virt_to_phys(_head),
			virt_to_phys(get_last_early_heap()), 0);
	/* Reserve trampoline area to boot the second ap */
	cb(pa_allocator, ap_trampoline,
			ap_trampoline + AP_TRAMPOLINE_SIZE, 0);
	/* Reserve the null page */
	cb(pa_allocator, 0, PAGE_SIZE, 0);
	/*
	 * Micro-arch specific
	 * TODO: this does nothing in SMP mode, update it for KNC
	 * if necessary
	 */
	__reserve_arch_pages(start, end, cb);
}
unsigned long virt_to_phys(void *v)
{
	unsigned long va = (unsigned long)v;

	if (va >= MAP_KERNEL_START) {
		return va - MAP_KERNEL_START + x86_kernel_phys_base;
	} else {
		return va - MAP_ST_START;
	}
}

void *phys_to_virt(unsigned long p)
{
	return (void *)(p + MAP_ST_START);
}

int copy_from_user(void *dst, const void *src, size_t siz)
{
	struct process_vm *vm = cpu_local_var(current)->vm;

	return read_process_vm(vm, dst, src, siz);
}

int strlen_user(const char *s)
{
	struct process_vm *vm = cpu_local_var(current)->vm;
	struct vm_range *range;
	unsigned long pgstart;
	int maxlen;
	const char *head = s;

	maxlen = 4096 - (((unsigned long)s) & 0x0000000000000fffUL);
	pgstart = ((unsigned long)s) & 0xfffffffffffff000UL;
	if (!pgstart || pgstart >= MAP_KERNEL_START)
		return -EFAULT;

	ihk_mc_spinlock_lock_noirq(&vm->memory_range_lock);
	for (;;) {
		range = lookup_process_memory_range(vm, pgstart, pgstart+1);
		if (range == NULL) {
			ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock);
			return -EFAULT;
		}
		if ((range->flag & VR_PROT_MASK) == VR_PROT_NONE) {
			ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock);
			return -EFAULT;
		}
		while (*s && maxlen > 0) {
			s++;
			maxlen--;
		}
		if (!*s)
			break;
		maxlen = 4096;
		pgstart += 4096;
	}
	ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock);

	return s - head;
}

int strcpy_from_user(char *dst, const char *src)
{
	struct process_vm *vm = cpu_local_var(current)->vm;
	struct vm_range *range;
	unsigned long pgstart;
	int maxlen;
	int err = 0;

	maxlen = 4096 - (((unsigned long)src) & 0x0000000000000fffUL);
	pgstart = ((unsigned long)src) & 0xfffffffffffff000UL;
	if (!pgstart || pgstart >= MAP_KERNEL_START)
		return -EFAULT;

	ihk_mc_spinlock_lock_noirq(&vm->memory_range_lock);
	for (;;) {
		range = lookup_process_memory_range(vm, pgstart, pgstart + 1);
		if (range == NULL) {
			err = -EFAULT;
			break;
		}
		if ((range->flag & VR_PROT_MASK) == VR_PROT_NONE) {
			err = -EFAULT;
			break;
		}
		while (*src && maxlen > 0) {
			*(dst++) = *(src++);
			maxlen--;
		}
		if (!*src) {
			*dst = '\0';
			break;
		}
		maxlen = 4096;
		pgstart += 4096;
	}
	ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock);

	return err;
}

long getlong_user(const long *p)
{
	int error;
	long l;

	error = copy_from_user(&l, p, sizeof(l));
	if (error) {
		return error;
	}

	return l;
}

int getint_user(const int *p)
{
	int error;
	int i;

	error = copy_from_user(&i, p, sizeof(i));
	if (error) {
		return error;
	}

	return i;
}
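/*
 * Disabled sketch: the user-access helpers above in action. They fault
 * pages in via page_fault_process_vm() and copy through the physical
 * mapping, so they must only be called where the current thread's vm is
 * valid. The syscall-style caller below is hypothetical.
 */
#if 0
static int example_read_user_string(const char *ustr)
{
	char buf[64];
	int len;

	len = strlen_user(ustr);
	if (len < 0 || len >= (int)sizeof(buf)) {
		return -EFAULT;
	}
	return strcpy_from_user(buf, ustr);
}
#endif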
int read_process_vm(struct process_vm *vm, void *kdst, const void *usrc,
		size_t siz)
{
	const uintptr_t ustart = (uintptr_t)usrc;
	const uintptr_t uend = ustart + siz;
	uint64_t reason;
	uintptr_t addr;
	int error;
	const void *from;
	void *to;
	size_t remain;
	size_t cpsize;
	unsigned long pa;
	void *va;

	if ((ustart < vm->region.user_start)
			|| (vm->region.user_end <= ustart)
			|| ((vm->region.user_end - ustart) < siz)) {
		kprintf("%s: error: out of user range\n", __FUNCTION__);
		return -EFAULT;
	}

	reason = PF_USER;	/* page not present */
	for (addr = ustart & PAGE_MASK; addr < uend; addr += PAGE_SIZE) {
		if (!addr)
			return -EINVAL;

		error = page_fault_process_vm(vm, (void *)addr, reason);
		if (error) {
			kprintf("%s: error: PF for %p failed\n",
					__FUNCTION__, addr);
			return error;
		}
	}

	from = usrc;
	to = kdst;
	remain = siz;
	while (remain > 0) {
		cpsize = PAGE_SIZE - ((uintptr_t)from & (PAGE_SIZE - 1));
		if (cpsize > remain) {
			cpsize = remain;
		}

		error = ihk_mc_pt_virt_to_phys(vm->address_space->page_table,
				from, &pa);
		if (error) {
			kprintf("%s: error: resolving physical address "
					"for %p\n", __FUNCTION__, from);
			return error;
		}

		if (pa < ihk_mc_get_memory_address(IHK_MC_GMA_MAP_START, 0) ||
				pa >= ihk_mc_get_memory_address(
					IHK_MC_GMA_MAP_END, 0)) {
			dkprintf("%s: pa is outside of LWK memory, to: %p,"
					"pa: %p, cpsize: %d\n",
					__FUNCTION__, to, pa, cpsize);
			va = ihk_mc_map_virtual(pa, 1, PTATTR_ACTIVE);
			memcpy(to, va, cpsize);
			ihk_mc_unmap_virtual(va, 1, 1);
		} else {
			va = phys_to_virt(pa);
			memcpy(to, va, cpsize);
		}

		from += cpsize;
		to += cpsize;
		remain -= cpsize;
	}

	return 0;
} /* read_process_vm() */

int copy_to_user(void *dst, const void *src, size_t siz)
{
	struct process_vm *vm = cpu_local_var(current)->vm;

	return write_process_vm(vm, dst, src, siz);
}

int setlong_user(long *dst, long data)
{
	return copy_to_user(dst, &data, sizeof(data));
}

int setint_user(int *dst, int data)
{
	return copy_to_user(dst, &data, sizeof(data));
}

int write_process_vm(struct process_vm *vm, void *udst, const void *ksrc,
		size_t siz)
{
	const uintptr_t ustart = (uintptr_t)udst;
	const uintptr_t uend = ustart + siz;
	uint64_t reason;
	uintptr_t addr;
	int error;
	const void *from;
	void *to;
	size_t remain;
	size_t cpsize;
	unsigned long pa;
	void *va;

	if ((ustart < vm->region.user_start)
			|| (vm->region.user_end <= ustart)
			|| ((vm->region.user_end - ustart) < siz)) {
		return -EFAULT;
	}

	reason = PF_POPULATE | PF_WRITE | PF_USER;
	for (addr = ustart & PAGE_MASK; addr < uend; addr += PAGE_SIZE) {
		error = page_fault_process_vm(vm, (void *)addr, reason);
		if (error) {
			return error;
		}
	}

	from = ksrc;
	to = udst;
	remain = siz;
	while (remain > 0) {
		cpsize = PAGE_SIZE - ((uintptr_t)to & (PAGE_SIZE - 1));
		if (cpsize > remain) {
			cpsize = remain;
		}

		error = ihk_mc_pt_virt_to_phys(vm->address_space->page_table,
				to, &pa);
		if (error) {
			return error;
		}

		va = phys_to_virt(pa);
		memcpy(va, from, cpsize);

		from += cpsize;
		to += cpsize;
		remain -= cpsize;
	}

	return 0;
} /* write_process_vm() */
int patch_process_vm(struct process_vm *vm, void *udst, const void *ksrc,
		size_t siz)
{
	const uintptr_t ustart = (uintptr_t)udst;
	const uintptr_t uend = ustart + siz;
	uint64_t reason;
	uintptr_t addr;
	int error;
	const void *from;
	void *to;
	size_t remain;
	size_t cpsize;
	unsigned long pa;
	void *va;

	kprintf("patch_process_vm(%p,%p,%p,%lx)\n", vm, udst, ksrc, siz);

	if ((ustart < vm->region.user_start)
			|| (vm->region.user_end <= ustart)
			|| ((vm->region.user_end - ustart) < siz)) {
		kprintf("patch_process_vm(%p,%p,%p,%lx):not in user\n",
				vm, udst, ksrc, siz);
		return -EFAULT;
	}

	reason = PF_PATCH | PF_WRITE | PF_USER;
	for (addr = ustart & PAGE_MASK; addr < uend; addr += PAGE_SIZE) {
		error = page_fault_process_vm(vm, (void *)addr, reason);
		if (error) {
			kprintf("patch_process_vm(%p,%p,%p,%lx):pf(%lx):%d\n",
					vm, udst, ksrc, siz, addr, error);
			return error;
		}
	}

	from = ksrc;
	to = udst;
	remain = siz;
	while (remain > 0) {
		cpsize = PAGE_SIZE - ((uintptr_t)to & (PAGE_SIZE - 1));
		if (cpsize > remain) {
			cpsize = remain;
		}

		error = ihk_mc_pt_virt_to_phys(vm->address_space->page_table,
				to, &pa);
		if (error) {
			kprintf("patch_process_vm(%p,%p,%p,%lx):v2p(%p):%d\n",
					vm, udst, ksrc, siz, to, error);
			return error;
		}

		if (pa < ihk_mc_get_memory_address(IHK_MC_GMA_MAP_START, 0) ||
				pa >= ihk_mc_get_memory_address(
					IHK_MC_GMA_MAP_END, 0)) {
			dkprintf("%s: pa is outside of LWK memory, from: %p,"
					"pa: %p, cpsize: %d\n",
					__FUNCTION__, from, pa, cpsize);
			va = ihk_mc_map_virtual(pa, 1, PTATTR_ACTIVE);
			memcpy(va, from, cpsize);
			ihk_mc_unmap_virtual(va, 1, 1);
		} else {
			va = phys_to_virt(pa);
			memcpy(va, from, cpsize);
		}

		from += cpsize;
		to += cpsize;
		remain -= cpsize;
	}

	kprintf("patch_process_vm(%p,%p,%p,%lx):%d\n",
			vm, udst, ksrc, siz, 0);
	return 0;
} /* patch_process_vm() */