From d4a0b32f06a75a3bb0f02d38e7dc71aeb00d0f4d Mon Sep 17 00:00:00 2001 From: NAKAMURA Gou Date: Thu, 21 Apr 2016 22:38:29 +0900 Subject: [PATCH] support large pages --- arch/x86/kernel/include/arch/shm.h | 3 +- arch/x86/kernel/memory.c | 406 +++++++++++++++++++---------- arch/x86/kernel/syscall.c | 62 +++-- kernel/process.c | 126 ++++++--- kernel/shmobj.c | 33 ++- kernel/syscall.c | 81 +++--- kernel/zeroobj.c | 2 +- lib/include/ihk/mm.h | 8 +- 8 files changed, 469 insertions(+), 252 deletions(-) diff --git a/arch/x86/kernel/include/arch/shm.h b/arch/x86/kernel/include/arch/shm.h index 20bbcafa..51685795 100644 --- a/arch/x86/kernel/include/arch/shm.h +++ b/arch/x86/kernel/include/arch/shm.h @@ -39,7 +39,8 @@ struct shmid_ds { pid_t shm_cpid; pid_t shm_lpid; uint64_t shm_nattch; - uint8_t padding[16]; + uint8_t padding[12]; + int init_pgshift; }; #endif /* HEADER_ARCH_SHM_H */ diff --git a/arch/x86/kernel/memory.c b/arch/x86/kernel/memory.c index bae4b1ce..43285b79 100644 --- a/arch/x86/kernel/memory.c +++ b/arch/x86/kernel/memory.c @@ -124,11 +124,8 @@ struct page_table { static struct page_table *init_pt; static ihk_spinlock_t init_pt_lock; -#ifdef USE_LARGE_PAGES static int use_1gb_page = 0; -#endif -#ifdef USE_LARGE_PAGES static void check_available_page_size(void) { uint32_t edx; @@ -139,7 +136,6 @@ static void check_available_page_size(void) return; } -#endif static unsigned long setup_l2(struct page_table *pt, unsigned long page_head, unsigned long start, @@ -534,28 +530,33 @@ int ihk_mc_pt_virt_to_phys(struct page_table *pt, if (!(pt->entry[l4idx] & PFL4_PRESENT)) { return -EFAULT; } - pt = phys_to_virt(pt->entry[l4idx] & PAGE_MASK); + pt = phys_to_virt(pte_get_phys(&pt->entry[l4idx])); if (!(pt->entry[l3idx] & PFL3_PRESENT)) { return -EFAULT; } - pt = phys_to_virt(pt->entry[l3idx] & PAGE_MASK); + if ((pt->entry[l3idx] & PFL3_SIZE)) { + *phys = pte_get_phys(&pt->entry[l3idx]) + | (v & (PTL3_SIZE - 1)); + return 0; + } + pt = 
phys_to_virt(pte_get_phys(&pt->entry[l3idx])); if (!(pt->entry[l2idx] & PFL2_PRESENT)) { return -EFAULT; } if ((pt->entry[l2idx] & PFL2_SIZE)) { - *phys = (pt->entry[l2idx] & LARGE_PAGE_MASK) | - (v & (LARGE_PAGE_SIZE - 1)); + *phys = pte_get_phys(&pt->entry[l2idx]) + | (v & (PTL2_SIZE - 1)); return 0; } - pt = phys_to_virt(pt->entry[l2idx] & PAGE_MASK); + pt = phys_to_virt(pte_get_phys(&pt->entry[l2idx])); if (!(pt->entry[l1idx] & PFL1_PRESENT)) { return -EFAULT; } - *phys = (pt->entry[l1idx] & PT_PHYSMASK) | (v & (PAGE_SIZE - 1)); + *phys = pte_get_phys(&pt->entry[l1idx]) | (v & (PTL1_SIZE - 1)); return 0; } @@ -862,12 +863,19 @@ static int walk_pte_l4(struct page_table *pt, uint64_t base, uint64_t start, return ret; } -static int split_large_page(pte_t *ptep) +static int split_large_page(pte_t *ptep, size_t pgsize) { struct page_table *pt; - uint64_t phys; - pte_t attr; + uintptr_t phys_base; int i; + uintptr_t phys; + struct page *page; + pte_t pte; + + if ((pgsize != PTL3_SIZE) && (pgsize != PTL2_SIZE)) { + ekprintf("split_large_page:invalid pgsize %#lx\n", pgsize); + return -EINVAL; + } pt = __alloc_new_pt(IHK_MC_AP_NOWAIT); if (pt == NULL) { @@ -875,29 +883,47 @@ static int split_large_page(pte_t *ptep) return -ENOMEM; } - if (!(*ptep & PFL2_FILEOFF)) { - phys = *ptep & PT_PHYSMASK; - attr = *ptep & ~PT_PHYSMASK; - attr &= ~PFL2_SIZE; + pte = *ptep; + if (pgsize == PTL2_SIZE) { + /* break down to basic page size */ + pte &= ~PFL2_SIZE; + } + + if (pte_is_fileoff(ptep, pgsize)) { + phys_base = NOPHYS; } else { - phys = *ptep & PAGE_MASK; /* file offset */ - attr = *ptep & ~PAGE_MASK; - attr &= ~PFL2_SIZE; + phys_base = pte_get_phys(ptep); } for (i = 0; i < PT_ENTRIES; ++i) { - pt->entry[i] = (phys + (i * PTL1_SIZE)) | attr; + if (phys_base != NOPHYS) { + phys = phys_base + (i * pgsize / PT_ENTRIES); + page = phys_to_page(phys); + if (page) { + page_map(page); + } + } + pt->entry[i] = pte; + pte += pgsize / PT_ENTRIES; } *ptep = (virt_to_phys(pt) & 
PT_PHYSMASK) | PFL2_PDIR_ATTR; + + if (phys_base != NOPHYS) { + page = phys_to_page(phys_base); + if (page && page_unmap(page)) { + kprintf("split_large_page:page_unmap:%p\n", page); + panic("split_large_page:page_unmap\n"); + } + } return 0; } struct visit_pte_args { page_table_t pt; enum visit_pte_flag flags; - int padding; + int pgshift; pte_visitor_t *funcp; void *arg; }; @@ -926,11 +952,11 @@ static int visit_pte_l2(void *arg0, pte_t *ptep, uintptr_t base, return 0; } -#ifdef USE_LARGE_PAGES if (((*ptep == PTE_NULL) || (*ptep & PFL2_SIZE)) && (start <= base) && (((base + PTL2_SIZE) <= end) - || (end == 0))) { + || (end == 0)) + && (!args->pgshift || (args->pgshift == PTL2_SHIFT))) { error = (*args->funcp)(args->arg, args->pt, ptep, (void *)base, PTL2_SHIFT); if (error != -E2BIG) { @@ -942,7 +968,6 @@ static int visit_pte_l2(void *arg0, pte_t *ptep, uintptr_t base, ekprintf("visit_pte_l2:split large page\n"); return -ENOMEM; } -#endif if (*ptep == PTE_NULL) { pt = __alloc_new_pt(IHK_MC_AP_NOWAIT); @@ -970,11 +995,12 @@ static int visit_pte_l3(void *arg0, pte_t *ptep, uintptr_t base, return 0; } -#ifdef USE_LARGE_PAGES if (((*ptep == PTE_NULL) || (*ptep & PFL3_SIZE)) && (start <= base) && (((base + PTL3_SIZE) <= end) - || (end == 0))) { + || (end == 0)) + && (!args->pgshift || (args->pgshift == PTL3_SHIFT)) + && use_1gb_page) { error = (*args->funcp)(args->arg, args->pt, ptep, (void *)base, PTL3_SHIFT); if (error != -E2BIG) { @@ -986,7 +1012,6 @@ static int visit_pte_l3(void *arg0, pte_t *ptep, uintptr_t base, ekprintf("visit_pte_l3:split large page\n"); return -ENOMEM; } -#endif if (*ptep == PTE_NULL) { pt = __alloc_new_pt(IHK_MC_AP_NOWAIT); @@ -1029,7 +1054,7 @@ static int visit_pte_l4(void *arg0, pte_t *ptep, uintptr_t base, return error; } -int visit_pte_range(page_table_t pt, void *start0, void *end0, +int visit_pte_range(page_table_t pt, void *start0, void *end0, int pgshift, enum visit_pte_flag flags, pte_visitor_t *funcp, void *arg) { const uintptr_t 
start = (uintptr_t)start0; @@ -1040,6 +1065,7 @@ int visit_pte_range(page_table_t pt, void *start0, void *end0, args.flags = flags; args.funcp = funcp; args.arg = arg; + args.pgshift = pgshift; return walk_pte_l4(pt, 0, start, end, &visit_pte_l4, &args); } @@ -1063,23 +1089,26 @@ static int clear_range_l1(void *args0, pte_t *ptep, uint64_t base, return -ENOENT; } - phys = *ptep & PT_PHYSMASK; old = xchg(ptep, PTE_NULL); + remote_flush_tlb_cpumask(args->vm, base, ihk_mc_get_processor_id()); - if ((old & PFL1_DIRTY) && args->memobj) { + page = NULL; + if (!pte_is_fileoff(&old, PTL1_SIZE)) { + phys = pte_get_phys(&old); + page = phys_to_page(phys); + } + + if (page && page_is_in_memobj(page) && (old & PFL1_DIRTY)) { memobj_flush_page(args->memobj, phys, PTL1_SIZE); } if (!(old & PFL1_FILEOFF) && args->free_physical) { - page = phys_to_page(phys); if (page && page_unmap(page)) { ihk_mc_free_pages(phys_to_virt(phys), 1); } - args->vm->currss -= PAGE_SIZE; + args->vm->currss -= PTL1_SIZE; } - remote_flush_tlb_cpumask(args->vm, base, ihk_mc_get_processor_id()); - return 0; } @@ -1099,36 +1128,35 @@ static int clear_range_l2(void *args0, pte_t *ptep, uint64_t base, if ((*ptep & PFL2_SIZE) && ((base < start) || (end < (base + PTL2_SIZE)))) { - error = split_large_page(ptep); - if (error) { - ekprintf("clear_range_l2(%p,%p,%lx,%lx,%lx):" - "split failed. %d\n", - args0, ptep, base, start, end, error); - return error; - } - if (*ptep & PFL2_SIZE) { - panic("clear_range_l2:split"); - } + error = -EINVAL; + ekprintf("clear_range_l2(%p,%p,%lx,%lx,%lx):" + "split page. 
%d\n", + args0, ptep, base, start, end, error); + return error; } if (*ptep & PFL2_SIZE) { - phys = *ptep & PT_PHYSMASK; old = xchg(ptep, PTE_NULL); + remote_flush_tlb_cpumask(args->vm, base, + ihk_mc_get_processor_id()); - if ((old & PFL2_DIRTY) && args->memobj) { + page = NULL; + if (!pte_is_fileoff(&old, PTL2_SIZE)) { + phys = pte_get_phys(&old); + page = phys_to_page(phys); + } + + if (page && page_is_in_memobj(page) && (old & PFL2_DIRTY)) { memobj_flush_page(args->memobj, phys, PTL2_SIZE); } if (!(old & PFL2_FILEOFF) && args->free_physical) { - page = phys_to_page(phys); if (page && page_unmap(page)) { ihk_mc_free_pages(phys_to_virt(phys), PTL2_SIZE/PTL1_SIZE); } - args->vm->currss -= LARGE_PAGE_SIZE; + args->vm->currss -= PTL2_SIZE; } - remote_flush_tlb_cpumask(args->vm, base, ihk_mc_get_processor_id()); - return 0; } @@ -1140,6 +1168,8 @@ static int clear_range_l2(void *args0, pte_t *ptep, uint64_t base, if ((start <= base) && ((base + PTL2_SIZE) <= end)) { *ptep = PTE_NULL; + remote_flush_tlb_cpumask(args->vm, base, + ihk_mc_get_processor_id()); arch_free_page(pt); } @@ -1149,14 +1179,65 @@ static int clear_range_l2(void *args0, pte_t *ptep, uint64_t base, static int clear_range_l3(void *args0, pte_t *ptep, uint64_t base, uint64_t start, uint64_t end) { + struct clear_range_args *args = args0; + int error; + uint64_t phys; + pte_t old; + struct page *page; struct page_table *pt; if (*ptep == PTE_NULL) { return -ENOENT; } + if ((*ptep & PFL3_SIZE) + && ((base < start) || (end < (base + PTL3_SIZE)))) { + error = -EINVAL; + ekprintf("clear_range_l3(%p,%p,%lx,%lx,%lx):" + "split page. 
%d\n", + args0, ptep, base, start, end, error); + return error; + } + + if (*ptep & PFL3_SIZE) { + old = xchg(ptep, PTE_NULL); + remote_flush_tlb_cpumask(args->vm, base, + ihk_mc_get_processor_id()); + + page = NULL; + if (!pte_is_fileoff(&old, PTL3_SIZE)) { + phys = pte_get_phys(&old); + page = phys_to_page(phys); + } + + if (page && page_is_in_memobj(page) && (old & PFL3_DIRTY)) { + memobj_flush_page(args->memobj, phys, PTL3_SIZE); + } + + if (!(old & PFL3_FILEOFF) && args->free_physical) { + if (page && page_unmap(page)) { + ihk_mc_free_pages(phys_to_virt(phys), PTL3_SIZE/PTL1_SIZE); + } + args->vm->currss -= PTL3_SIZE; + } + + return 0; + } + pt = phys_to_virt(*ptep & PT_PHYSMASK); - return walk_pte_l2(pt, base, start, end, &clear_range_l2, args0); + error = walk_pte_l2(pt, base, start, end, &clear_range_l2, args0); + if (error && (error != -ENOENT)) { + return error; + } + + if (use_1gb_page && (start <= base) && ((base + PTL3_SIZE) <= end)) { + *ptep = PTE_NULL; + remote_flush_tlb_cpumask(args->vm, base, + ihk_mc_get_processor_id()); + arch_free_page(pt); + } + + return 0; } static int clear_range_l4(void *args0, pte_t *ptep, uint64_t base, @@ -1179,7 +1260,9 @@ static int clear_range(struct page_table *pt, struct process_vm *vm, int error; struct clear_range_args args; - if ((USER_END <= start) || (USER_END < end) || (end <= start)) { + if ((start < vm->region.user_start) + || (vm->region.user_end < end) + || (end <= start)) { ekprintf("clear_range(%p,%p,%p,%x):" "invalid start and/or end.\n", pt, start, end, free_physical); @@ -1241,16 +1324,11 @@ static int change_attr_range_l2(void *arg0, pte_t *ptep, uint64_t base, if ((*ptep & PFL2_SIZE) && ((base < start) || (end < (base + PTL2_SIZE)))) { - error = split_large_page(ptep); - if (error) { - ekprintf("change_attr_range_l2(%p,%p,%lx,%lx,%lx):" - "split failed. 
%d\n", - arg0, ptep, base, start, end, error); - return error; - } - if (*ptep & PFL2_SIZE) { - panic("change_attr_range_l2:split"); - } + error = -EINVAL; + ekprintf("change_attr_range_l2(%p,%p,%lx,%lx,%lx):" + "split page. %d\n", + arg0, ptep, base, start, end, error); + return error; } if (*ptep & PFL2_SIZE) { @@ -1267,12 +1345,30 @@ static int change_attr_range_l2(void *arg0, pte_t *ptep, uint64_t base, static int change_attr_range_l3(void *arg0, pte_t *ptep, uint64_t base, uint64_t start, uint64_t end) { + struct change_attr_args *args = arg0; + int error; struct page_table *pt; if ((*ptep == PTE_NULL) || (*ptep & PFL3_FILEOFF)) { return -ENOENT; } + if ((*ptep & PFL3_SIZE) + && ((base < start) || (end < (base + PTL3_SIZE)))) { + error = -EINVAL; + ekprintf("change_attr_range_l3(%p,%p,%lx,%lx,%lx):" + "split page. %d\n", + arg0, ptep, base, start, end, error); + return error; + } + + if (*ptep & PFL3_SIZE) { + if (!(*ptep & PFL3_FILEOFF)) { + *ptep = (*ptep & ~args->clrpte) | args->setpte; + } + return 0; + } + pt = phys_to_virt(*ptep & PT_PHYSMASK); return walk_pte_l2(pt, base, start, end, &change_attr_range_l2, arg0); } @@ -1303,7 +1399,7 @@ int ihk_mc_pt_change_attr_range(page_table_t pt, void *start0, void *end0, return walk_pte_l4(pt, 0, start, end, &change_attr_range_l4, &args); } -static pte_t *lookup_pte(struct page_table *pt, uintptr_t virt, +static pte_t *lookup_pte(struct page_table *pt, uintptr_t virt, int pgshift, uintptr_t *basep, size_t *sizep, int *p2alignp) { int l4idx, l3idx, l2idx, l1idx; @@ -1314,63 +1410,46 @@ static pte_t *lookup_pte(struct page_table *pt, uintptr_t virt, GET_VIRT_INDICES(virt, l4idx, l3idx, l2idx, l1idx); -#ifdef USE_LARGE_PAGES - if (use_1gb_page) { - ptep = NULL; - base = GET_INDICES_VIRT(l4idx, 0, 0, 0); - size = PTL3_SIZE; - p2align = PTL3_SHIFT - PTL1_SHIFT; - } - else { - ptep = NULL; - base = GET_INDICES_VIRT(l4idx, l3idx, 0, 0); - size = PTL2_SIZE; - p2align = PTL2_SHIFT - PTL1_SHIFT; - } -#else ptep = NULL; - 
base = GET_INDICES_VIRT(l4idx, l3idx, l2idx, l1idx); - size = PTL1_SIZE; - p2align = PTL1_SHIFT - PTL1_SHIFT; -#endif + if (!pgshift) { + pgshift = (use_1gb_page)? PTL3_SHIFT: PTL2_SHIFT; + } if (pt->entry[l4idx] == PTE_NULL) { + if (pgshift > PTL3_SHIFT) { + pgshift = PTL3_SHIFT; + } goto out; } - pt = phys_to_virt(pt->entry[l4idx] & PT_PHYSMASK); + pt = phys_to_virt(pte_get_phys(&pt->entry[l4idx])); if ((pt->entry[l3idx] == PTE_NULL) || (pt->entry[l3idx] & PFL3_SIZE)) { -#ifdef USE_LARGE_PAGES - if (use_1gb_page) { + if (pgshift >= PTL3_SHIFT) { ptep = &pt->entry[l3idx]; - base = GET_INDICES_VIRT(l4idx, l3idx, 0, 0); - size = PTL3_SIZE; - p2align = PTL3_SHIFT - PTL1_SHIFT; + pgshift = PTL3_SHIFT; } -#endif goto out; } - pt = phys_to_virt(pt->entry[l3idx] & PT_PHYSMASK); + pt = phys_to_virt(pte_get_phys(&pt->entry[l3idx])); if ((pt->entry[l2idx] == PTE_NULL) || (pt->entry[l2idx] & PFL2_SIZE)) { -#ifdef USE_LARGE_PAGES - ptep = &pt->entry[l2idx]; - base = GET_INDICES_VIRT(l4idx, l3idx, l2idx, 0); - size = PTL2_SIZE; - p2align = PTL2_SHIFT - PTL1_SHIFT; -#endif + if (pgshift >= PTL2_SHIFT) { + ptep = &pt->entry[l2idx]; + pgshift = PTL2_SHIFT; + } goto out; } - pt = phys_to_virt(pt->entry[l2idx] & PT_PHYSMASK); + pt = phys_to_virt(pte_get_phys(&pt->entry[l2idx])); ptep = &pt->entry[l1idx]; - base = GET_INDICES_VIRT(l4idx, l3idx, l2idx, l1idx); - size = PTL1_SIZE; - p2align = PTL1_SHIFT - PTL1_SHIFT; + pgshift = PTL1_SHIFT; out: + size = (size_t)1 << pgshift; + base = virt & ~(size - 1); + p2align = pgshift - PAGE_SHIFT; if (basep) *basep = base; if (sizep) *sizep = size; if (p2alignp) *p2alignp = p2align; @@ -1378,21 +1457,21 @@ out: return ptep; } -pte_t *ihk_mc_pt_lookup_pte(page_table_t pt, void *virt, void **basep, - size_t *sizep, int *p2alignp) +pte_t *ihk_mc_pt_lookup_pte(page_table_t pt, void *virt, int pgshift, + void **basep, size_t *sizep, int *p2alignp) { pte_t *ptep; uintptr_t base; size_t size; int p2align; - dkprintf("ihk_mc_pt_lookup_pte(%p,%p)\n", 
pt, virt); - ptep = lookup_pte(pt, (uintptr_t)virt, &base, &size, &p2align); + dkprintf("ihk_mc_pt_lookup_pte(%p,%p,%d)\n", pt, virt, pgshift); + ptep = lookup_pte(pt, (uintptr_t)virt, pgshift, &base, &size, &p2align); if (basep) *basep = (void *)base; if (sizep) *sizep = size; if (p2alignp) *p2alignp = p2align; - dkprintf("ihk_mc_pt_lookup_pte(%p,%p): %p %lx %lx %d\n", - pt, virt, ptep, base, size, p2align); + dkprintf("ihk_mc_pt_lookup_pte(%p,%p,%d): %p %lx %lx %d\n", + pt, virt, pgshift, ptep, base, size, p2align); return ptep; } @@ -1400,7 +1479,7 @@ struct set_range_args { page_table_t pt; uintptr_t phys; enum ihk_mc_pt_attribute attr; - int padding; + int pgshift; uintptr_t diff; struct process_vm *vm; }; @@ -1438,39 +1517,50 @@ int set_range_l2(void *args0, pte_t *ptep, uintptr_t base, uintptr_t start, struct set_range_args *args = args0; int error; struct page_table *pt; -#ifdef USE_LARGE_PAGES uintptr_t phys; -#endif + struct page_table *newpt = NULL; + pte_t pte; dkprintf("set_range_l2(%lx,%lx,%lx)\n", base, start, end); +retry: if (*ptep == PTE_NULL) { -#ifdef USE_LARGE_PAGES if ((start <= base) && ((base + PTL2_SIZE) <= end) - && ((args->diff & (PTL2_SIZE - 1)) == 0)) { + && ((args->diff & (PTL2_SIZE - 1)) == 0) + && (!args->pgshift + || (args->pgshift == PTL2_SHIFT))) { phys = args->phys + (base - start); *ptep = phys | attr_to_l2attr( args->attr|PTATTR_LARGEPAGE); error = 0; dkprintf("set_range_l2(%lx,%lx,%lx):" - "large page. %d %lx\n", + "2MiB page. %d %lx\n", base, start, end, error, *ptep); goto out; } -#endif - pt = __alloc_new_pt(IHK_MC_AP_NOWAIT); - if (pt == NULL) { - error = -ENOMEM; - ekprintf("set_range_l2(%lx,%lx,%lx):" - "__alloc_new_pt failed. 
%d %lx\n", - base, start, end, error, *ptep); - (void)clear_range(args->pt, args->vm, start, base, - KEEP_PHYSICAL, NULL); - goto out; + if (!newpt) { + newpt = __alloc_new_pt(IHK_MC_AP_NOWAIT); + if (newpt == NULL) { + error = -ENOMEM; + ekprintf("set_range_l2(%lx,%lx,%lx):" + "__alloc_new_pt failed. %d %lx\n", + base, start, end, error, *ptep); + (void)clear_range(args->pt, args->vm, start, base, + KEEP_PHYSICAL, NULL); + goto out; + } } - *ptep = virt_to_phys(pt) | PFL2_PDIR_ATTR; + pte = virt_to_phys(newpt) | PFL2_PDIR_ATTR; + pte = atomic_cmpxchg8(ptep, PTE_NULL, pte); + if (pte != PTE_NULL) { + /* failed to set PDTe */ + goto retry; + } + + pt = newpt; + newpt = NULL; } else if (*ptep & PFL2_SIZE) { error = -EBUSY; @@ -1494,6 +1584,9 @@ int set_range_l2(void *args0, pte_t *ptep, uintptr_t base, uintptr_t start, error = 0; out: + if (newpt) { + arch_free_page(newpt); + } dkprintf("set_range_l2(%lx,%lx,%lx): %d %lx\n", base, start, end, error, *ptep); return error; @@ -1506,18 +1599,17 @@ int set_range_l3(void *args0, pte_t *ptep, uintptr_t base, uintptr_t start, pte_t pte; struct page_table *pt; int error; -#ifdef USE_LARGE_PAGES struct set_range_args *args = args0; uintptr_t phys; -#endif dkprintf("set_range_l3(%lx,%lx,%lx)\n", base, start, end); retry: if (*ptep == PTE_NULL) { -#ifdef USE_LARGE_PAGES if ((start <= base) && ((base + PTL3_SIZE) <= end) && ((args->diff & (PTL3_SIZE - 1)) == 0) + && (!args->pgshift + || (args->pgshift == PTL3_SHIFT)) && use_1gb_page) { phys = args->phys + (base - start); *ptep = phys | attr_to_l3attr( @@ -1528,7 +1620,6 @@ retry: base, start, end, error, *ptep); goto out; } -#endif if (!newpt) { newpt = __alloc_new_pt(IHK_MC_AP_NOWAIT); @@ -1537,6 +1628,8 @@ retry: ekprintf("set_range_l3(%lx,%lx,%lx):" "__alloc_new_pt failed. 
%d %lx\n", base, start, end, error, *ptep); + (void)clear_range(args->pt, args->vm, start, + base, KEEP_PHYSICAL, NULL); goto out; } } @@ -1556,6 +1649,8 @@ retry: ekprintf("set_range_l3(%lx,%lx,%lx):" "page exists. %d %lx\n", base, start, end, error, *ptep); + (void)clear_range(args->pt, args->vm, start, base, + KEEP_PHYSICAL, NULL); goto out; } else { @@ -1583,6 +1678,7 @@ out: int set_range_l4(void *args0, pte_t *ptep, uintptr_t base, uintptr_t start, uintptr_t end) { + struct set_range_args *args = args0; struct page_table *newpt = NULL; pte_t pte; struct page_table *pt; @@ -1599,6 +1695,8 @@ retry: ekprintf("set_range_l4(%lx,%lx,%lx):" "__alloc_new_pt failed. %d %lx\n", base, start, end, error, *ptep); + (void)clear_range(args->pt, args->vm, start, + base, KEEP_PHYSICAL, NULL); goto out; } } @@ -1636,7 +1734,8 @@ out: } int ihk_mc_pt_set_range(page_table_t pt, struct process_vm *vm, void *start, - void *end, uintptr_t phys, enum ihk_mc_pt_attribute attr) + void *end, uintptr_t phys, enum ihk_mc_pt_attribute attr, + int pgshift) { int error; struct set_range_args args; @@ -1649,6 +1748,7 @@ int ihk_mc_pt_set_range(page_table_t pt, struct process_vm *vm, void *start, args.attr = attr; args.diff = (uintptr_t)start ^ phys; args.vm = vm; + args.pgshift = pgshift; error = walk_pte_l4(pt, 0, (uintptr_t)start, (uintptr_t)end, &set_range_l4, &args); @@ -1677,14 +1777,12 @@ int ihk_mc_pt_set_pte(page_table_t pt, pte_t *ptep, size_t pgsize, if (pgsize == PTL1_SIZE) { *ptep = phys | attr_to_l1attr(attr); } -#ifdef USE_LARGE_PAGES else if (pgsize == PTL2_SIZE) { *ptep = phys | attr_to_l2attr(attr | PTATTR_LARGEPAGE); } else if ((pgsize == PTL3_SIZE) && (use_1gb_page)) { *ptep = phys | attr_to_l3attr(attr | PTATTR_LARGEPAGE); } -#endif else { error = -EINVAL; ekprintf("ihk_mc_pt_set_pte(%p,%p,%lx,%lx,%x):" @@ -1701,6 +1799,46 @@ out: return error; } +int ihk_mc_pt_split(page_table_t pt, struct process_vm *vm, void *addr) +{ + int error; + pte_t *ptep; + void *pgaddr; + 
size_t pgsize; + intptr_t phys; + struct page *page; + + +retry: + ptep = ihk_mc_pt_lookup_pte(pt, addr, 0, &pgaddr, &pgsize, NULL); + if (ptep && !pte_is_null(ptep) && (pgaddr != addr)) { + page = NULL; + if (!pte_is_fileoff(ptep, pgsize)) { + phys = pte_get_phys(ptep); + page = phys_to_page(phys); + } + if (page && (page_is_in_memobj(page) + || page_is_multi_mapped(page))) { + error = -EINVAL; + kprintf("ihk_mc_pt_split:NYI:page break down\n"); + goto out; + } + + error = split_large_page(ptep, pgsize); + if (error) { + kprintf("ihk_mc_pt_split:split_large_page failed. %d\n", error); + goto out; + } + remote_flush_tlb_cpumask(vm, (intptr_t)pgaddr, + ihk_mc_get_processor_id()); + goto retry; + } + + error = 0; +out: + return error; +} /* ihk_mc_pt_split() */ + int arch_get_smaller_page_size(void *args, size_t cursize, size_t *newsizep, int *p2alignp) { @@ -1712,7 +1850,6 @@ int arch_get_smaller_page_size(void *args, size_t cursize, size_t *newsizep, /* dummy */ panic("not reached"); } -#ifdef USE_LARGE_PAGES else if ((cursize > PTL3_SIZE) && use_1gb_page) { /* 1GiB */ newsize = PTL3_SIZE; @@ -1723,7 +1860,6 @@ int arch_get_smaller_page_size(void *args, size_t cursize, size_t *newsizep, newsize = PTL2_SIZE; p2align = PTL2_SHIFT - PTL1_SHIFT; } -#endif else if (cursize > PTL1_SIZE) { /* 4KiB : basic page size */ newsize = PTL1_SIZE; @@ -1796,7 +1932,7 @@ static int move_one_page(void *arg0, page_table_t pt, pte_t *ptep, attr = apte & ~PT_PHYSMASK; error = ihk_mc_pt_set_range(pt, args->vm, (void *)dest, - (void *)(dest + pgsize), phys, attr); + (void *)(dest + pgsize), phys, attr, pgshift); if (error) { kprintf("move_one_page(%p,%p,%p %#lx,%p,%d):" "set failed. 
%d\n", @@ -1822,7 +1958,7 @@ int move_pte_range(page_table_t pt, struct process_vm *vm, args.dest = (uintptr_t)dest; args.vm = vm; - error = visit_pte_range(pt, src, src+size, VPTEF_SKIP_NULL, + error = visit_pte_range(pt, src, src+size, 0, VPTEF_SKIP_NULL, &move_one_page, &args); flush_tlb(); /* XXX: TLB flush */ if (error) { @@ -1946,9 +2082,7 @@ static void init_vsyscall_area(struct page_table *pt) void init_page_table(void) { -#ifdef USE_LARGE_PAGES check_available_page_size(); -#endif init_pt = arch_alloc_page(IHK_MC_AP_CRITICAL); ihk_mc_spinlock_init(&init_pt_lock); diff --git a/arch/x86/kernel/syscall.c b/arch/x86/kernel/syscall.c index 1f62ebdf..d22ebe88 100644 --- a/arch/x86/kernel/syscall.c +++ b/arch/x86/kernel/syscall.c @@ -1327,6 +1327,7 @@ SYSCALL_DECLARE(mmap) intptr_t addr; size_t len; int flags = flags0; + size_t pgsize; dkprintf("sys_mmap(%lx,%lx,%x,%x,%d,%lx)\n", addr0, len0, prot, flags0, fd, off0); @@ -1350,14 +1351,36 @@ SYSCALL_DECLARE(mmap) } /* check arguments */ -#define VALID_DUMMY_ADDR (region->user_start) + pgsize = PAGE_SIZE; + if (flags & MAP_HUGETLB) { + switch (flags & (0x3F << MAP_HUGE_SHIFT)) { + case 0: + flags |= MAP_HUGE_2MB; /* default hugepage size */ + break; + + case MAP_HUGE_2MB: + case MAP_HUGE_1GB: + break; + + default: + ekprintf("sys_mmap(%lx,%lx,%x,%x,%x,%lx):" + "not supported page size.\n", + addr0, len0, prot, flags0, fd, off0); + error = -EINVAL; + goto out; + } + + pgsize = (size_t)1 << ((flags >> MAP_HUGE_SHIFT) & 0x3F); + } + +#define VALID_DUMMY_ADDR ((region->user_start + PTL3_SIZE - 1) & ~(PTL3_SIZE - 1)) addr = (flags & MAP_FIXED)? 
addr0: VALID_DUMMY_ADDR; - len = (len0 + PAGE_SIZE - 1) & PAGE_MASK; - if ((addr & (PAGE_SIZE - 1)) + len = (len0 + pgsize - 1) & ~(pgsize - 1); + if ((addr & (pgsize - 1)) || (len == 0) || !(flags & (MAP_SHARED | MAP_PRIVATE)) || ((flags & MAP_SHARED) && (flags & MAP_PRIVATE)) - || (off0 & (PAGE_SIZE - 1))) { + || (off0 & (pgsize - 1))) { ekprintf("sys_mmap(%lx,%lx,%x,%x,%x,%lx):EINVAL\n", addr0, len0, prot, flags0, fd, off0); error = -EINVAL; @@ -1383,25 +1406,6 @@ SYSCALL_DECLARE(mmap) goto out; } - if (flags & MAP_HUGETLB) { - switch (flags & (0x3F << MAP_HUGE_SHIFT)) { - case 0: - flags |= MAP_HUGE_2MB; /* default hugepage size */ - break; - - case MAP_HUGE_2MB: - case MAP_HUGE_1GB: - break; - - default: - ekprintf("sys_mmap(%lx,%lx,%x,%x,%x,%lx):" - "not supported page size.\n", - addr0, len0, prot, flags0, fd, off0); - error = -EINVAL; - goto out; - } - } - addr = do_mmap(addr, len, prot, flags, fd, off0); error = 0; @@ -1714,7 +1718,8 @@ int arch_map_vdso(struct process_vm *vm) for (i = 0; i < vdso.vdso_npages; ++i) { s = vm->vdso_addr + (i * PAGE_SIZE); e = s + PAGE_SIZE; - error = ihk_mc_pt_set_range(pt, vm, s, e, vdso.vdso_physlist[i], attr); + error = ihk_mc_pt_set_range(pt, vm, s, e, + vdso.vdso_physlist[i], attr, 0); if (error) { ekprintf("ihk_mc_pt_set_range failed. %d\n", error); goto out; @@ -1744,7 +1749,8 @@ int arch_map_vdso(struct process_vm *vm) s = vm->vdso_addr + (intptr_t)vdso.vvar_virt; e = s + PAGE_SIZE; attr = PTATTR_ACTIVE | PTATTR_USER | PTATTR_NO_EXECUTE; - error = ihk_mc_pt_set_range(pt, vm, s, e, vdso.vvar_phys, attr); + error = ihk_mc_pt_set_range(pt, vm, s, e, + vdso.vvar_phys, attr, 0); if (error) { ekprintf("ihk_mc_pt_set_range failed. 
%d\n", error); goto out; @@ -1754,7 +1760,8 @@ int arch_map_vdso(struct process_vm *vm) s = vm->vdso_addr + (intptr_t)vdso.hpet_virt; e = s + PAGE_SIZE; attr = PTATTR_ACTIVE | PTATTR_USER | PTATTR_NO_EXECUTE | PTATTR_UNCACHABLE; - error = ihk_mc_pt_set_range(pt, vm, s, e, vdso.hpet_phys, attr); + error = ihk_mc_pt_set_range(pt, vm, s, e, + vdso.hpet_phys, attr, 0); if (error) { ekprintf("ihk_mc_pt_set_range failed. %d\n", error); goto out; @@ -1764,7 +1771,8 @@ int arch_map_vdso(struct process_vm *vm) s = vm->vdso_addr + (intptr_t)vdso.pvti_virt; e = s + PAGE_SIZE; attr = PTATTR_ACTIVE | PTATTR_USER | PTATTR_NO_EXECUTE; - error = ihk_mc_pt_set_range(pt, vm, s, e, vdso.pvti_phys, attr); + error = ihk_mc_pt_set_range(pt, vm, s, e, + vdso.pvti_phys, attr, 0); if (error) { ekprintf("ihk_mc_pt_set_range failed. %d\n", error); goto out; diff --git a/kernel/process.c b/kernel/process.c index 0d1f5c06..13b43cdd 100644 --- a/kernel/process.c +++ b/kernel/process.c @@ -522,7 +522,9 @@ static int copy_user_pte(void *arg0, page_table_t src_pt, pte_t *src_ptep, void attr = arch_vrflag_to_ptattr(args->new_vrflag, PF_POPULATE, NULL); } - error = ihk_mc_pt_set_range(args->new_vm->address_space->page_table, args->new_vm, pgaddr, pgaddr+pgsize, phys, attr); + error = ihk_mc_pt_set_range(args->new_vm->address_space->page_table, + args->new_vm, pgaddr, pgaddr+pgsize, phys, attr, + pgshift); if (error) { args->fault_addr = (intptr_t)pgaddr; goto out; @@ -572,6 +574,7 @@ static int copy_user_ranges(struct process_vm *vm, struct process_vm *orgvm) range->flag = src_range->flag; range->memobj = src_range->memobj; range->objoff = src_range->objoff; + range->pgshift = src_range->pgshift; if (range->memobj) { memobj_ref(range->memobj); } @@ -583,7 +586,8 @@ static int copy_user_ranges(struct process_vm *vm, struct process_vm *orgvm) error = visit_pte_range(orgvm->address_space->page_table, (void *)range->start, (void *)range->end, - VPTEF_SKIP_NULL, &copy_user_pte, &args); + range->pgshift, 
VPTEF_SKIP_NULL, + &copy_user_pte, &args); if (error) { if (args.fault_addr != -1) { kprintf("ERROR: copy_user_ranges() " @@ -626,7 +630,8 @@ int update_process_page_table(struct process_vm *vm, attr = arch_vrflag_to_ptattr(range->flag, PF_POPULATE, NULL); flags = ihk_mc_spinlock_lock(&vm->page_table_lock); error = ihk_mc_pt_set_range(vm->address_space->page_table, vm, - (void *)range->start, (void *)range->end, phys, attr); + (void *)range->start, (void *)range->end, phys, attr, + range->pgshift); if (error) { kprintf("update_process_page_table:ihk_mc_pt_set_range failed. %d\n", error); goto out; @@ -647,6 +652,13 @@ int split_process_memory_range(struct process_vm *vm, struct vm_range *range, dkprintf("split_process_memory_range(%p,%lx-%lx,%lx,%p)\n", vm, range->start, range->end, addr, splitp); + error = ihk_mc_pt_split(vm->address_space->page_table, vm, (void *)addr); + if (error) { + ekprintf("split_process_memory_range:" + "ihk_mc_pt_split failed. %d\n", error); + goto out; + } + newrange = kmalloc(sizeof(struct vm_range), IHK_MC_AP_NOWAIT); if (!newrange) { ekprintf("split_process_memory_range(%p,%lx-%lx,%lx,%p):" @@ -659,6 +671,7 @@ int split_process_memory_range(struct process_vm *vm, struct vm_range *range, newrange->start = addr; newrange->end = range->end; newrange->flag = range->flag; + newrange->pgshift = range->pgshift; if (range->memobj) { memobj_ref(range->memobj); @@ -735,11 +748,10 @@ int free_process_memory_range(struct process_vm *vm, struct vm_range *range) int error; intptr_t start; intptr_t end; -#ifdef USE_LARGE_PAGES struct vm_range *neighbor; intptr_t lpstart; intptr_t lpend; -#endif /* USE_LARGE_PAGES */ + size_t pgsize; dkprintf("free_process_memory_range(%p, 0x%lx - 0x%lx)\n", vm, range->start, range->end); @@ -747,25 +759,40 @@ int free_process_memory_range(struct process_vm *vm, struct vm_range *range) start = range->start; end = range->end; if (!(range->flag & (VR_REMOTE | VR_IO_NOCACHE | VR_RESERVED))) { -#ifdef USE_LARGE_PAGES - 
lpstart = start & LARGE_PAGE_MASK; - lpend = (end + LARGE_PAGE_SIZE - 1) & LARGE_PAGE_MASK; - - - if (lpstart < start) { - neighbor = previous_process_memory_range(vm, range); - if ((neighbor == NULL) || (neighbor->end <= lpstart)) { + neighbor = previous_process_memory_range(vm, range); + pgsize = -1; + for (;;) { + error = arch_get_smaller_page_size( + NULL, pgsize, &pgsize, NULL); + if (error) { + kprintf("free_process_memory_range:" + "arch_get_smaller_page_size failed." + " %d\n", error); + break; + } + lpstart = start & ~(pgsize - 1); + if (!neighbor || (neighbor->end <= lpstart)) { start = lpstart; + break; } } - - if (end < lpend) { - neighbor = next_process_memory_range(vm, range); - if ((neighbor == NULL) || (lpend <= neighbor->start)) { + neighbor = next_process_memory_range(vm, range); + pgsize = -1; + for (;;) { + error = arch_get_smaller_page_size( + NULL, pgsize, &pgsize, NULL); + if (error) { + kprintf("free_process_memory_range:" + "arch_get_smaller_page_size failed." + " %d\n", error); + break; + } + lpend = (end + pgsize - 1) & ~(pgsize - 1); + if (!neighbor || (lpend <= neighbor->start)) { end = lpend; + break; } } -#endif /* USE_LARGE_PAGES */ ihk_mc_spinlock_lock_noirq(&vm->page_table_lock); if (range->memobj) { @@ -928,6 +955,7 @@ enum ihk_mc_pt_attribute common_vrflag_to_ptattr(unsigned long flag, uint64_t fa return attr; } +/* XXX: インデントを揃える必要がある */ int add_process_memory_range(struct process_vm *vm, unsigned long start, unsigned long end, unsigned long phys, unsigned long flag, @@ -1236,7 +1264,8 @@ int remap_process_memory_range(struct process_vm *vm, struct vm_range *range, args.memobj = range->memobj; error = visit_pte_range(vm->address_space->page_table, (void *)start, - (void *)end, VPTEF_DEFAULT, &remap_one_page, &args); + (void *)end, range->pgshift, VPTEF_DEFAULT, + &remap_one_page, &args); if (error) { ekprintf("remap_process_memory_range(%p,%p,%#lx,%#lx,%#lx):" "visit pte failed %d\n", @@ -1306,8 +1335,8 @@ int 
sync_process_memory_range(struct process_vm *vm, struct vm_range *range, ihk_mc_spinlock_lock_noirq(&vm->page_table_lock); memobj_lock(range->memobj); error = visit_pte_range(vm->address_space->page_table, (void *)start, - (void *)end, VPTEF_SKIP_NULL, &sync_one_page, - &args); + (void *)end, range->pgshift, VPTEF_SKIP_NULL, + &sync_one_page, &args); memobj_unlock(range->memobj); ihk_mc_spinlock_unlock_noirq(&vm->page_table_lock); if (error) { @@ -1389,7 +1418,7 @@ int invalidate_process_memory_range(struct process_vm *vm, ihk_mc_spinlock_lock_noirq(&vm->page_table_lock); memobj_lock(range->memobj); error = visit_pte_range(vm->address_space->page_table, (void *)start, - (void *)end, VPTEF_SKIP_NULL, + (void *)end, range->pgshift, VPTEF_SKIP_NULL, &invalidate_one_page, &args); memobj_unlock(range->memobj); ihk_mc_spinlock_unlock_noirq(&vm->page_table_lock); @@ -1421,8 +1450,8 @@ static int page_fault_process_memory_range(struct process_vm *vm, struct vm_rang ihk_mc_spinlock_lock_noirq(&vm->page_table_lock); /*****/ ptep = ihk_mc_pt_lookup_pte(vm->address_space->page_table, - (void *)fault_addr, &pgaddr, &pgsize, - &p2align); + (void *)fault_addr, range->pgshift, &pgaddr, &pgsize, + &p2align); if (!(reason & (PF_PROT | PF_PATCH)) && ptep && !pte_is_null(ptep) && !pte_is_fileoff(ptep, pgsize)) { if (!pte_is_present(ptep)) { @@ -1439,13 +1468,19 @@ static int page_fault_process_memory_range(struct process_vm *vm, struct vm_rang goto out; } /*****/ - if (!ptep || (pgsize != PAGE_SIZE)) { + while (((uintptr_t)pgaddr < range->start) + || (range->end < ((uintptr_t)pgaddr + pgsize))) { ptep = NULL; - pgsize = PAGE_SIZE; - p2align = PAGE_P2ALIGN; + error = arch_get_smaller_page_size(NULL, pgsize, &pgsize, &p2align); + if (error) { + kprintf("page_fault_process_memory_range(%p,%lx-%lx %lx,%lx,%lx):arch_get_smaller_page_size(pte) failed. 
%d\n", vm, range->start, range->end, range->flag, fault_addr, reason, error); + goto out; + } + pgaddr = (void *)(fault_addr & ~(pgsize - 1)); } - pgaddr = (void *)(fault_addr & ~(pgsize - 1)); + /*****/ if (!ptep || pte_is_null(ptep) || pte_is_fileoff(ptep, pgsize)) { + phys = NOPHYS; if (range->memobj) { off_t off; @@ -1458,17 +1493,34 @@ static int page_fault_process_memory_range(struct process_vm *vm, struct vm_rang error = memobj_get_page(range->memobj, off, p2align, &phys, &memobj_flag); if (error) { - if (error != -ERESTART) { + struct memobj *obj; + + if (zeroobj_create(&obj)) { + panic("PFPMR: zeroobj_crate"); + } + + if (range->memobj != obj) { + goto out; } - goto out; } } - else { + if (phys == NOPHYS) { void *virt; size_t npages; +retry: npages = pgsize / PAGE_SIZE; virt = ihk_mc_alloc_aligned_pages(npages, p2align, IHK_MC_AP_NOWAIT); + if (!virt && !range->pgshift && (pgsize != PAGE_SIZE)) { + error = arch_get_smaller_page_size(NULL, pgsize, &pgsize, &p2align); + if (error) { + kprintf("page_fault_process_memory_range(%p,%lx-%lx %lx,%lx,%lx):arch_get_smaller_page_size(anon) failed. %d\n", vm, range->start, range->end, range->flag, fault_addr, reason, error); + goto out; + } + ptep = NULL; + pgaddr = (void *)(fault_addr & ~(pgsize - 1)); + goto retry; + } if (!virt) { error = -ENOMEM; kprintf("page_fault_process_memory_range(%p,%lx-%lx %lx,%lx,%lx):cannot allocate new page. %d\n", vm, range->start, range->end, range->flag, fault_addr, reason, error); @@ -1527,18 +1579,20 @@ static int page_fault_process_memory_range(struct process_vm *vm, struct vm_rang else { error = ihk_mc_pt_set_range(vm->address_space->page_table, vm, pgaddr, pgaddr + pgsize, phys, - attr); + attr, range->pgshift); if (error) { kprintf("page_fault_process_memory_range(%p,%lx-%lx %lx,%lx,%lx):set_range failed. 
%d\n", vm, range->start, range->end, range->flag, fault_addr, reason, error); goto out; } } flush_tlb_single(fault_addr); - error = 0; - page = NULL; - vm->currss += PAGE_SIZE; + vm->currss += pgsize; if(vm->currss > vm->proc->maxrss) vm->proc->maxrss = vm->currss; + + error = 0; + page = NULL; + out: ihk_mc_spinlock_unlock_noirq(&vm->page_table_lock); if (page) { @@ -1712,7 +1766,7 @@ int init_process_stack(struct thread *thread, struct program_load_desc *pn, thread->vm, (void *)(end-minsz), (void *)end, virt_to_phys(stack), arch_vrflag_to_ptattr(vrflag, PF_POPULATE, - NULL)); + NULL), 0); if (error) { kprintf("init_process_stack:" "set range %lx-%lx %lx failed. %d\n", diff --git a/kernel/shmobj.c b/kernel/shmobj.c index 590187ea..67e42047 100644 --- a/kernel/shmobj.c +++ b/kernel/shmobj.c @@ -159,8 +159,16 @@ int shmobj_create(struct shmid_ds *ds, struct memobj **objp) { struct shmobj *obj = NULL; int error; + int pgshift; + size_t pgsize; dkprintf("shmobj_create(%p %#lx,%p)\n", ds, ds->shm_segsz, objp); + pgshift = ds->init_pgshift; + if (!pgshift) { + pgshift = PAGE_SHIFT; + } + pgsize = (size_t)1 << pgshift; + obj = kmalloc(sizeof(*obj), IHK_MC_AP_NOWAIT); if (!obj) { error = -ENOMEM; @@ -174,9 +182,10 @@ int shmobj_create(struct shmid_ds *ds, struct memobj **objp) obj->ds = *ds; obj->ds.shm_perm.seq = the_seq++; obj->ds.shm_nattch = 1; + obj->ds.init_pgshift = 0; obj->index = -1; - obj->pgshift = PAGE_SHIFT; - obj->real_segsz = (obj->ds.shm_segsz + PAGE_SIZE - 1) & PAGE_MASK; + obj->pgshift = pgshift; + obj->real_segsz = (obj->ds.shm_segsz + pgsize - 1) & ~(pgsize - 1); page_list_init(obj); ihk_mc_spinlock_init(&obj->memobj.lock); @@ -213,13 +222,14 @@ void shmobj_destroy(struct shmobj *obj) extern int the_maxi; struct shmlock_user *user; size_t size; + int npages; dkprintf("shmobj_destroy(%p [%d %o])\n", obj, obj->index, obj->ds.shm_perm.mode); if (obj->user) { user = obj->user; obj->user = NULL; shmlock_users_lock(); - size = (obj->ds.shm_segsz + PAGE_SIZE 
- 1) & PAGE_MASK; + size = obj->real_segsz; user->locked -= size; if (!user->locked) { shmlock_user_free(user); @@ -227,6 +237,7 @@ void shmobj_destroy(struct shmobj *obj) shmlock_users_unlock(); } /* zap page_list */ + npages = (size_t)1 << (obj->pgshift - PAGE_SHIFT); for (;;) { struct page *page; int count; @@ -253,9 +264,8 @@ void shmobj_destroy(struct shmobj *obj) panic("shmobj_release"); } - /* XXX:NYI: large pages */ page->mode = PM_NONE; - free_pages(phys_to_virt(page_to_phys(page)), 1); + free_pages(phys_to_virt(page_to_phys(page)), npages); } if (obj->index < 0) { kfree(obj); @@ -362,9 +372,9 @@ static int shmobj_get_page(struct memobj *memobj, off_t off, int p2align, memobj, off, p2align, physp, error); goto out; } - if (p2align != PAGE_P2ALIGN) { /* XXX:NYI:large pages */ + if (p2align != (obj->pgshift - PAGE_SHIFT)) { error = -ENOMEM; - ekprintf("shmobj_get_page(%p,%#lx,%d,%p):large page. %d\n", + ekprintf("shmobj_get_page(%p,%#lx,%d,%p):pgsize mismatch. %d\n", memobj, off, p2align, physp, error); goto out; } @@ -384,7 +394,8 @@ static int shmobj_get_page(struct memobj *memobj, off_t off, int p2align, page = page_list_lookup(obj, off); if (!page) { npages = 1 << p2align; - virt = ihk_mc_alloc_pages(npages, IHK_MC_AP_NOWAIT); + virt = ihk_mc_alloc_aligned_pages(npages, p2align, + IHK_MC_AP_NOWAIT); if (!virt) { error = -ENOMEM; ekprintf("shmobj_get_page(%p,%#lx,%d,%p):" @@ -460,7 +471,7 @@ static int shmobj_lookup_page(struct memobj *memobj, off_t off, int p2align, struct shmobj *obj = to_shmobj(memobj); int error; struct page *page; - uintptr_t phys; + uintptr_t phys = NOPHYS; dkprintf("shmobj_lookup_page(%p,%#lx,%d,%p)\n", memobj, off, p2align, physp); @@ -471,9 +482,9 @@ static int shmobj_lookup_page(struct memobj *memobj, off_t off, int p2align, memobj, off, p2align, physp, error); goto out; } - if (p2align != PAGE_P2ALIGN) { /* XXX:NYI:large pages */ + if (p2align != (obj->pgshift - PAGE_SHIFT)) { error = -ENOMEM; - 
ekprintf("shmobj_lookup_page(%p,%#lx,%d,%p):large page. %d\n", + ekprintf("shmobj_lookup_page(%p,%#lx,%d,%p):pgsize mismatch. %d\n", memobj, off, p2align, physp, error); goto out; } diff --git a/kernel/syscall.c b/kernel/syscall.c index 7106e03f..23820637 100644 --- a/kernel/syscall.c +++ b/kernel/syscall.c @@ -895,24 +895,20 @@ static int do_munmap(void *addr, size_t len) return error; } -static int search_free_space(size_t len, intptr_t hint, intptr_t *addrp) +static int search_free_space(size_t len, intptr_t hint, int pgshift, intptr_t *addrp) { struct thread *thread = cpu_local_var(current); struct vm_regions *region = &thread->vm->region; intptr_t addr; int error; struct vm_range *range; + size_t pgsize = (size_t)1 << pgshift; - dkprintf("search_free_space(%lx,%lx,%p)\n", len, hint, addrp); + dkprintf("search_free_space(%lx,%lx,%d,%p)\n", len, hint, pgshift, addrp); addr = hint; for (;;) { -#ifdef USE_LARGE_PAGES - if (len >= LARGE_PAGE_SIZE) { - addr = (addr + LARGE_PAGE_SIZE - 1) & LARGE_PAGE_MASK; - } -#endif /* USE_LARGE_PAGES */ - + addr = (addr + pgsize - 1) & ~(pgsize - 1); if ((region->user_end <= addr) || ((region->user_end - len) < addr)) { ekprintf("search_free_space(%lx,%lx,%p):" @@ -934,8 +930,8 @@ static int search_free_space(size_t len, intptr_t hint, intptr_t *addrp) *addrp = addr; out: - dkprintf("search_free_space(%lx,%lx,%p): %d %lx\n", - len, hint, addrp, error, addr); + dkprintf("search_free_space(%lx,%lx,%d,%p): %d %lx\n", + len, hint, pgshift, addrp, error, addr); return error; } @@ -994,6 +990,27 @@ do_mmap(const intptr_t addr0, const size_t len0, const int prot, flush_nfo_tlb(); + if (flags & MAP_HUGETLB) { + pgshift = (flags >> MAP_HUGE_SHIFT) & 0x3F; + p2align = pgshift - PAGE_SHIFT; + } + else if ((flags & MAP_PRIVATE) && (flags & MAP_ANONYMOUS)) { + pgshift = 0; /* transparent huge page */ + p2align = PAGE_P2ALIGN; + + if (len > PAGE_SIZE) { + error = arch_get_smaller_page_size(NULL, len+1, NULL, &p2align); + if (error) { + 
ekprintf("do_mmap:arch_get_smaller_page_size failed. %d\n", error); + goto out; + } + } + } + else { + pgshift = PAGE_SHIFT; /* basic page size */ + p2align = PAGE_P2ALIGN; + } + ihk_mc_spinlock_lock_noirq(&thread->vm->memory_range_lock); if (flags & MAP_FIXED) { @@ -1007,10 +1024,11 @@ do_mmap(const intptr_t addr0, const size_t len0, const int prot, } else { /* choose mapping address */ - error = search_free_space(len, region->map_end, &addr); + error = search_free_space(len, region->map_end, + PAGE_SHIFT+p2align, &addr); if (error) { - ekprintf("do_mmap:search_free_space(%lx,%lx) failed. %d\n", - len, region->map_end, error); + ekprintf("do_mmap:search_free_space(%lx,%lx,%d) failed. %d\n", + len, region->map_end, p2align, error); goto out; } region->map_end = addr + len; @@ -1096,13 +1114,6 @@ do_mmap(const intptr_t addr0, const size_t len0, const int prot, else if (!(vrflags & VR_DEMAND_PAGING) && ((vrflags & VR_PROT_MASK) != VR_PROT_NONE)) { npages = len >> PAGE_SHIFT; - p2align = PAGE_P2ALIGN; -#ifdef USE_LARGE_PAGES - if ((len >= LARGE_PAGE_SIZE) - && ((addr & (LARGE_PAGE_SIZE - 1)) == 0)) { - p2align = LARGE_PAGE_P2ALIGN; - } -#endif /* USE_LARGE_PAGES */ p = ihk_mc_alloc_aligned_pages(npages, p2align, IHK_MC_AP_NOWAIT); if (p == NULL) { ekprintf("do_mmap:allocate_pages(%d,%d) failed.\n", @@ -1116,6 +1127,7 @@ do_mmap(const intptr_t addr0, const size_t len0, const int prot, memset(&ads, 0, sizeof(ads)); ads.shm_segsz = len; ads.shm_perm.mode = SHM_DEST; + ads.init_pgshift = PAGE_SHIFT; error = shmobj_create(&ads, &memobj); if (error) { ekprintf("do_mmap:shmobj_create failed. 
%d\n", error); @@ -1141,13 +1153,6 @@ do_mmap(const intptr_t addr0, const size_t len0, const int prot, } vrflags |= VRFLAG_PROT_TO_MAXPROT(PROT_TO_VR_FLAG(maxprot)); - if (flags & MAP_HUGETLB) { - pgshift = (flags >> MAP_HUGE_SHIFT) & 0x3F; - } - else { - pgshift = PAGE_SHIFT; /* basic page size */ - } - error = add_process_memory_range(thread->vm, addr, addr+len, phys, vrflags, memobj, off, pgshift); if (error) { @@ -3238,7 +3243,7 @@ SYSCALL_DECLARE(mincore) ihk_mc_spinlock_lock_noirq(&vm->page_table_lock); ptep = ihk_mc_pt_lookup_pte(vm->address_space->page_table, - (void *)addr, NULL, NULL, NULL); + (void *)addr, 0, NULL, NULL, NULL); if (ptep && pte_is_present(ptep)) { value = 1; } @@ -3630,6 +3635,7 @@ int do_shmget(const key_t key, const size_t size, const int shmflg) ads.shm_segsz = size; ads.shm_ctime = now; ads.shm_cpid = proc->pid; + ads.init_pgshift = pgshift; error = shmobj_create_indexed(&ads, &obj); if (error) { @@ -3639,7 +3645,6 @@ int do_shmget(const key_t key, const size_t size, const int shmflg) } obj->index = ++the_maxi; - obj->pgshift = pgshift; list_add(&obj->chain, &kds_list); ++the_shm_info.used_ids; @@ -3668,6 +3673,7 @@ SYSCALL_DECLARE(shmat) int vrflags; int req; struct shmobj *obj; + size_t pgsize; dkprintf("shmat(%#x,%p,%#x)\n", shmid, shmaddr, shmflg); @@ -3679,13 +3685,14 @@ SYSCALL_DECLARE(shmat) return error; } - if (shmaddr && ((uintptr_t)shmaddr & (PAGE_SIZE - 1)) && !(shmflg & SHM_RND)) { + pgsize = (size_t)1 << obj->pgshift; + if (shmaddr && ((uintptr_t)shmaddr & (pgsize - 1)) && !(shmflg & SHM_RND)) { shmobj_list_unlock(); dkprintf("shmat(%#x,%p,%#x): -EINVAL\n", shmid, shmaddr, shmflg); return -EINVAL; } - addr = (uintptr_t)shmaddr & PAGE_MASK; - len = (obj->ds.shm_segsz + PAGE_SIZE - 1) & PAGE_MASK; + addr = (uintptr_t)shmaddr & ~(pgsize - 1); + len = obj->real_segsz; prot = PROT_READ; req = 4; @@ -3725,7 +3732,7 @@ SYSCALL_DECLARE(shmat) } } else { - error = search_free_space(len, region->map_end, &addr); + error = 
search_free_space(len, region->map_end, obj->pgshift, &addr); if (error) { ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock); shmobj_list_unlock(); @@ -3753,7 +3760,7 @@ SYSCALL_DECLARE(shmat) memobj_ref(&obj->memobj); error = add_process_memory_range(vm, addr, addr+len, -1, - vrflags, &obj->memobj, 0, PAGE_SHIFT); + vrflags, &obj->memobj, 0, obj->pgshift); if (error) { if (!(prot & PROT_WRITE)) { (void)set_host_vma(addr, len, PROT_READ|PROT_WRITE); @@ -3940,7 +3947,7 @@ SYSCALL_DECLARE(shmctl) ekprintf("shmctl(%#x,%d,%p): user lookup: %d\n", shmid, cmd, buf, error); return -ENOMEM; } - size = (obj->ds.shm_segsz + PAGE_SIZE - 1) & PAGE_MASK; + size = obj->real_segsz; if (!has_cap_ipc_lock(thread) && (rlim->rlim_cur != (rlim_t)-1) && ((rlim->rlim_cur < user->locked) @@ -3978,7 +3985,7 @@ SYSCALL_DECLARE(shmctl) if ((obj->ds.shm_perm.mode & SHM_LOCKED) && ((obj->pgshift == 0) || (obj->pgshift == PAGE_SHIFT))) { - size = (obj->ds.shm_segsz + PAGE_SIZE - 1) & PAGE_MASK; + size = obj->real_segsz; shmlock_users_lock(); user = obj->user; obj->user = NULL; @@ -6433,7 +6440,7 @@ SYSCALL_DECLARE(mremap) } need_relocate = 1; error = search_free_space(newsize, vm->region.map_end, - (intptr_t *)&newstart); + range->pgshift, (intptr_t *)&newstart); if (error) { ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):" "search failed. %d\n", diff --git a/kernel/zeroobj.c b/kernel/zeroobj.c index 238bdc2a..c8fc5c2a 100644 --- a/kernel/zeroobj.c +++ b/kernel/zeroobj.c @@ -182,7 +182,7 @@ static int zeroobj_get_page(struct memobj *memobj, off_t off, int p2align, } if (p2align != PAGE_P2ALIGN) { /* XXX:NYI:large pages */ error = -ENOMEM; - ekprintf("zeroobj_get_page(%p,%#lx,%d,%p):large page. %d\n", + dkprintf("zeroobj_get_page(%p,%#lx,%d,%p):large page. 
%d\n", memobj, off, p2align, physp, error); goto out; } diff --git a/lib/include/ihk/mm.h b/lib/include/ihk/mm.h index 3bc59183..cf2957a0 100644 --- a/lib/include/ihk/mm.h +++ b/lib/include/ihk/mm.h @@ -127,16 +127,18 @@ int ihk_mc_pt_free_range(page_table_t pt, struct process_vm *vm, int ihk_mc_pt_change_attr_range(page_table_t pt, void *start, void *end, enum ihk_mc_pt_attribute clrattr, enum ihk_mc_pt_attribute setattr); -pte_t *ihk_mc_pt_lookup_pte(page_table_t pt, void *virt, void **pgbasep, size_t *pgsizep, int *p2alignp); +pte_t *ihk_mc_pt_lookup_pte(page_table_t pt, void *virt, int pgshift, void **pgbasep, size_t *pgsizep, int *p2alignp); int ihk_mc_pt_set_range(page_table_t pt, struct process_vm *vm, void *start, - void *end, uintptr_t phys, enum ihk_mc_pt_attribute attr); + void *end, uintptr_t phys, enum ihk_mc_pt_attribute attr, + int pgshift); int ihk_mc_pt_set_pte(page_table_t pt, pte_t *ptep, size_t pgsize, uintptr_t phys, enum ihk_mc_pt_attribute attr); int ihk_mc_pt_prepare_map(page_table_t pt, void *virt, unsigned long size, enum ihk_mc_pt_prepare_flag); +int ihk_mc_pt_split(page_table_t pt, struct process_vm *vm, void *addr); typedef int pte_visitor_t(void *arg, page_table_t pt, pte_t *ptep, void *pgaddr, int pgshift); -int visit_pte_range(page_table_t pt, void *start, void *end, +int visit_pte_range(page_table_t pt, void *start, void *end, int pgshift, enum visit_pte_flag flags, pte_visitor_t *funcp, void *arg); int move_pte_range(page_table_t pt, struct process_vm *vm, void *src, void *dest, size_t size);