From 9b5ccb5a33cb4559bde76c3267bf73a151b56989 Mon Sep 17 00:00:00 2001 From: Balazs Gerofi Date: Sun, 12 Mar 2017 14:34:34 +0900 Subject: [PATCH] Pre-map file mappings from /dev/shm (--mpol-shm-premap mcexec argument) --- arch/x86/kernel/memory.c | 14 +++-- executer/include/uprotocol.h | 1 + executer/kernel/mcctrl/syscall.c | 6 +- executer/user/mcexec.c | 11 ++++ kernel/fileobj.c | 96 ++++++++++++++++++++++++++++++++ kernel/include/memobj.h | 5 ++ kernel/include/syscall.h | 1 + kernel/syscall.c | 30 ++++++++++ 8 files changed, 157 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/memory.c b/arch/x86/kernel/memory.c index d03addea..efb48f11 100644 --- a/arch/x86/kernel/memory.c +++ b/arch/x86/kernel/memory.c @@ -1075,7 +1075,7 @@ int visit_pte_range(page_table_t pt, void *start0, void *end0, int pgshift, struct clear_range_args { int free_physical; - uint8_t padding[4]; + int dont_walk_l1; struct memobj *memobj; struct process_vm *vm; }; @@ -1167,9 +1167,11 @@ static int clear_range_l2(void *args0, pte_t *ptep, uint64_t base, } pt = phys_to_virt(*ptep & PT_PHYSMASK); - error = walk_pte_l1(pt, base, start, end, &clear_range_l1, args0); - if (error && (error != -ENOENT)) { - return error; + if (!args->dont_walk_l1) { + error = walk_pte_l1(pt, base, start, end, &clear_range_l1, args0); + if (error && (error != -ENOENT)) { + return error; + } } if ((start <= base) && ((base + PTL2_SIZE) <= end)) { @@ -1279,6 +1281,10 @@ static int clear_range(struct page_table *pt, struct process_vm *vm, if (memobj && (memobj->flags & MF_DEV_FILE)) { args.free_physical = 0; } + args.dont_walk_l1 = 0; + if (memobj && ((memobj->flags & MF_PREMAP))) { + args.dont_walk_l1 = 1; + } args.memobj = memobj; args.vm = vm; diff --git a/executer/include/uprotocol.h b/executer/include/uprotocol.h index 86936b23..8a7efb42 100644 --- a/executer/include/uprotocol.h +++ b/executer/include/uprotocol.h @@ -98,6 +98,7 @@ typedef unsigned long __cpu_set_unit; #define MPOL_NO_HEAP 0x01 #define MPOL_NO_STACK 0x02 #define MPOL_NO_BSS 0x04 +#define MPOL_SHM_PREMAP 0x08 struct program_load_desc { int num_sections; diff --git a/executer/kernel/mcctrl/syscall.c b/executer/kernel/mcctrl/syscall.c index a65e471e..9bfb61d7 100644 --- a/executer/kernel/mcctrl/syscall.c +++ b/executer/kernel/mcctrl/syscall.c @@ -770,6 +770,7 @@ enum { MF_ZEROFILL = 0x0010, MF_REG_FILE = 0x1000, MF_DEV_FILE = 0x2000, + MF_PREMAP = 0x8000, MF_END }; @@ -863,9 +864,8 @@ static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa) fullpath = d_path(&file->f_path, pathbuf, PATH_MAX); if (!IS_ERR(fullpath)) { if (!strncmp("/dev/shm/Intel_MPI", fullpath, 18)) { - //mf_flags = (MF_PREFETCH | MF_ZEROFILL); - mf_flags = (MF_ZEROFILL); - dprintk("%s: filename: %s, zerofill\n", + mf_flags = (MF_PREMAP | MF_ZEROFILL); + dprintk("%s: filename: %s, premap & zerofill\n", __FUNCTION__, fullpath); } else if (strstr(fullpath, "libmpi") != NULL) { diff --git a/executer/user/mcexec.c b/executer/user/mcexec.c index f744a50c..c0385d39 100644 --- a/executer/user/mcexec.c +++ b/executer/user/mcexec.c @@ -158,6 +158,7 @@ static int enable_vdso = 1; static int mpol_no_heap = 0; static int mpol_no_stack = 0; static int mpol_no_bss = 0; +static int mpol_shm_premap = 0; static int no_bind_ikc_map = 0; static unsigned long mpol_threshold = 0; static unsigned long heap_extension = (2*1024*1024); @@ -1312,6 +1313,12 @@ static struct option mcexec_options[] = { .flag = &mpol_no_bss, .val = 1, }, + { + .name = "mpol-shm-premap", + .has_arg = no_argument, + .flag = &mpol_shm_premap, + .val = 1, + }, { .name = "no-bind-ikc-map", .has_arg = no_argument, @@ -1798,6 +1805,10 @@ int main(int argc, char **argv) desc->mpol_flags |= MPOL_NO_BSS; } + if (mpol_shm_premap) { + desc->mpol_flags |= MPOL_SHM_PREMAP; + } + desc->mpol_threshold = mpol_threshold; desc->heap_extension = heap_extension; diff --git a/kernel/fileobj.c b/kernel/fileobj.c index 79c28ea1..d30138af 100644 --- a/kernel/fileobj.c +++ b/kernel/fileobj.c @@ -231,6 +231,52 @@ int fileobj_create(int fd, struct memobj **objp, int *maxprotp) if (to_memobj(obj)->flags & MF_PREFETCH) { to_memobj(obj)->status = MEMOBJ_TO_BE_PREFETCHED; } + + /* XXX: KNL specific optimization for OFP runs */ + if ((to_memobj(obj)->flags & MF_PREMAP) && + (to_memobj(obj)->flags & MF_ZEROFILL)) { + struct memobj *mo = to_memobj(obj); + int nr_pages = (result.size + (PAGE_SIZE - 1)) + >> PAGE_SHIFT; + int j = 0; + int node = 4; + + mo->pages = kmalloc(nr_pages * sizeof(void *), IHK_MC_AP_NOWAIT); + if (!mo->pages) { + kprintf("%s: WARNING: failed to allocate pages\n", + __FUNCTION__); + goto error_cleanup; + } + + mo->nr_pages = nr_pages; + memset(mo->pages, 0, nr_pages * sizeof(*mo->pages)); + + if (cpu_local_var(current)->proc->mpol_flags & MPOL_SHM_PREMAP) { + /* Get the actual pages NUMA interleaved */ + for (j = 0; j < nr_pages; ++j) { + mo->pages[j] = ihk_mc_alloc_aligned_pages_node(1, + PAGE_P2ALIGN, IHK_MC_AP_NOWAIT, node); + if (!mo->pages[j]) { + kprintf("%s: ERROR: allocating pages[%d]\n", + __FUNCTION__, j); + goto error_cleanup; + } + + memset(mo->pages[j], 0, PAGE_SIZE); + + ++node; + if (node == ihk_mc_get_nr_numa_nodes()) { + node = 4; + } + } + dkprintf("%s: allocated %d pages interleaved\n", + __FUNCTION__, nr_pages); + } +error_cleanup: + /* TODO: cleanup allocated portion */ + ; + } + newobj = NULL; dkprintf("%s: new obj 0x%lx cref: %d, %s\n", __FUNCTION__, @@ -345,6 +391,19 @@ static void fileobj_release(struct memobj *memobj) page->mode = PM_NONE; #endif } + + /* Pre-mapped? */ + if (to_memobj(free_obj)->flags & MF_PREMAP) { + int i; + + for (i = 0; i < to_memobj(free_obj)->nr_pages; ++i) { + if (to_memobj(free_obj)->pages[i]) + ihk_mc_free_pages(to_memobj(free_obj)->pages[i], 1); + } + + kfree(to_memobj(free_obj)->pages); + } + obj_list_remove(free_obj); mcs_rwlock_writer_unlock_noirq(&fileobj_list_lock, &node); kfree(free_obj); @@ -491,6 +550,42 @@ static int fileobj_get_page(struct memobj *memobj, off_t off, profile_event_add(PROFILE_page_fault_file, PAGE_SIZE); #endif // PROFILE_ENABLE + if (memobj->flags & MF_PREMAP) { + int page_ind = off >> PAGE_SHIFT; + + if (!memobj->pages[page_ind]) { + virt = ihk_mc_alloc_pages(1, IHK_MC_AP_NOWAIT | IHK_MC_AP_USER); + + if (!virt) { + error = -ENOMEM; + kprintf("fileobj_get_page(%p,%lx,%x,%p):" + "alloc failed. %d\n", + obj, off, p2align, physp, + error); + goto out_nolock; + } + + /* Update the array but see if someone did it already and use + * that if so */ + if (!__sync_bool_compare_and_swap(&memobj->pages[page_ind], + NULL, virt)) { + ihk_mc_free_pages(virt, 1); + } + else { + dkprintf("%s: MF_ZEROFILL: off: %lu -> 0x%lx allocated\n", + __FUNCTION__, off, virt_to_phys(virt)); + } + } + + virt = memobj->pages[page_ind]; + error = 0; + *physp = virt_to_phys(virt); + dkprintf("%s: MF_ZEROFILL: off: %lu -> 0x%lx resolved\n", + __FUNCTION__, off, virt_to_phys(virt)); + virt = NULL; + goto out_nolock; + } + mcs_rwlock_writer_lock_noirq(&obj->page_hash_locks[hash], &mcs_node); page = __fileobj_page_hash_lookup(obj, hash, off); @@ -566,6 +661,7 @@ static int fileobj_get_page(struct memobj *memobj, off_t off, out: mcs_rwlock_writer_unlock_noirq(&obj->page_hash_locks[hash], &mcs_node); +out_nolock: if (virt) { ihk_mc_free_pages(virt, npages); } diff --git a/kernel/include/memobj.h b/kernel/include/memobj.h index 24650de4..fd932856 100644 --- a/kernel/include/memobj.h +++ b/kernel/include/memobj.h @@ -36,6 +36,7 @@ enum { MF_ZEROFILL = 0x0010, MF_REG_FILE = 0x1000, MF_DEV_FILE = 0x2000, + MF_PREMAP = 0x8000, MF_HOST_RELEASED = 0x80000000, MF_END }; @@ -49,6 +50,10 @@ struct memobj { uint32_t status; size_t size; ihk_spinlock_t lock; + + /* For pre-mapped memobjects */ + void **pages; + int nr_pages; }; typedef void memobj_release_func_t(struct memobj *obj); diff --git a/kernel/include/syscall.h b/kernel/include/syscall.h index dc8929ce..5e46bd49 100644 --- a/kernel/include/syscall.h +++ b/kernel/include/syscall.h @@ -163,6 +163,7 @@ typedef unsigned long __cpu_set_unit; #define MPOL_NO_HEAP 0x01 #define MPOL_NO_STACK 0x02 #define MPOL_NO_BSS 0x04 +#define MPOL_SHM_PREMAP 0x08 struct program_load_desc { int num_sections; diff --git a/kernel/syscall.c b/kernel/syscall.c index fe6fe644..8a899fd9 100644 --- a/kernel/syscall.c +++ b/kernel/syscall.c @@ -1321,6 +1321,36 @@ do_mmap(const intptr_t addr0, const size_t len0, const int prot, populate_len = memobj->size; } memobj_unlock(memobj); + + /* Update PTEs for pre-mapped memory object */ + if ((memobj->flags & MF_PREMAP) && + (proc->mpol_flags & MPOL_SHM_PREMAP)) { + int i; + enum ihk_mc_pt_attribute ptattr; + ptattr = arch_vrflag_to_ptattr(range->flag, PF_POPULATE, NULL); + + for (i = 0; i < memobj->nr_pages; ++i) { + error = ihk_mc_pt_set_range(proc->vm->address_space->page_table, + proc->vm, + (void *)range->start + (i * PAGE_SIZE), + (void *)range->start + (i * PAGE_SIZE) + + PAGE_SIZE, + virt_to_phys(memobj->pages[i]), + ptattr, + PAGE_SHIFT); + if (error) { + kprintf("%s: ERROR: mapping %d page of pre-mapped file\n", + __FUNCTION__, i); + } + } + dkprintf("%s: memobj 0x%lx pre-mapped\n", __FUNCTION__, memobj); + } +/* + else if (memobj->flags & MF_REG_FILE) { + populated_mapping = 1; + populate_len = memobj->size; + } +*/ } error = 0;