From 09f63483ccb6dcf080493b472126ac220738af26 Mon Sep 17 00:00:00 2001 From: Balazs Gerofi Date: Thu, 7 Jun 2018 07:16:49 +0900 Subject: [PATCH] OFP: temporary ANON mmap() rewrite --- arch/x86_64/kernel/include/arch-memory.h | 6 + executer/kernel/mcctrl/control.c | 1 - executer/user/mcexec.c | 1 + kernel/include/process.h | 3 + kernel/mem.c | 27 +++- kernel/process.c | 175 ++++++++++++++++++++++- kernel/syscall.c | 33 ++++- kernel/user_exp_rcv.c | 5 +- lib/abort.c | 2 +- 9 files changed, 239 insertions(+), 14 deletions(-) diff --git a/arch/x86_64/kernel/include/arch-memory.h b/arch/x86_64/kernel/include/arch-memory.h index b3b931cb..644276af 100644 --- a/arch/x86_64/kernel/include/arch-memory.h +++ b/arch/x86_64/kernel/include/arch-memory.h @@ -40,6 +40,12 @@ #define LARGE_PAGE_MASK (~((unsigned long)LARGE_PAGE_SIZE - 1)) #define LARGE_PAGE_P2ALIGN (LARGE_PAGE_SHIFT - PAGE_SHIFT) +#define GB_PAGE_SHIFT 30 +#define GB_PAGE_SIZE (1UL << GB_PAGE_SHIFT) +#define GB_PAGE_MASK (~((unsigned long)GB_PAGE_SIZE - 1)) +#define GB_PAGE_P2ALIGN (GB_PAGE_SHIFT - PAGE_SHIFT) + + #define USER_END 0x0000800000000000UL #define TASK_UNMAPPED_BASE 0x00002AAAAAA00000UL diff --git a/executer/kernel/mcctrl/control.c b/executer/kernel/mcctrl/control.c index 5f6d0ee9..a1064c95 100644 --- a/executer/kernel/mcctrl/control.c +++ b/executer/kernel/mcctrl/control.c @@ -1073,7 +1073,6 @@ out: return ret; } - /* NOTE: per-process data is refcounted. * For every get call the user should call put. */ struct mcctrl_per_proc_data *mcctrl_get_per_proc_data( diff --git a/executer/user/mcexec.c b/executer/user/mcexec.c index 775aaa3a..a0d9add7 100644 --- a/executer/user/mcexec.c +++ b/executer/user/mcexec.c @@ -3522,6 +3522,7 @@ int main_loop(struct thread_data_s *my_thread) if (ioctl(fd, MCEXEC_UP_TRANSFER, &trans) != 0) { fprintf(stderr, "__NR_gettid(): error transfering TIDs\n"); + exit(1); } free(tids); diff --git a/kernel/include/process.h b/kernel/include/process.h index 593cacf6..2cc627b8 100644 --- a/kernel/include/process.h +++ b/kernel/include/process.h @@ -32,6 +32,7 @@ #define VR_STACK 0x1 #define VR_RESERVED 0x2 #define VR_AP_USER 0x4 +#define VR_PREALLOC 0x8 #define VR_IO_NOCACHE 0x100 #define VR_REMOTE 0x200 #define VR_WRITE_COMBINED 0x400 @@ -387,6 +388,8 @@ struct vm_range { int pgshift; /* page size. 0 means THP */ int padding; void *private_data; + unsigned long lowest_accesed; + unsigned long faulted_size; }; struct vm_range_numa_policy { diff --git a/kernel/mem.c b/kernel/mem.c index 395cb032..740c8e3d 100644 --- a/kernel/mem.c +++ b/kernel/mem.c @@ -698,6 +698,22 @@ static void *mckernel_allocate_aligned_pages_node(int npages, int p2align, break; } + else { + dkprintf("%s: couldn't fulfill user policy for" + " %d contiguous pages from node %d " +#ifdef IHK_RBTREE_ALLOCATOR + "(free pages left: %d)" +#endif + "\n", + __FUNCTION__, + npages, + numa_id +#ifdef IHK_RBTREE_ALLOCATOR + , memory_nodes[numa_id].nr_free_pages +#endif + ); + //return NULL; + } } if (pa) break; @@ -719,8 +735,8 @@ static void *mckernel_allocate_aligned_pages_node(int npages, int p2align, #ifdef PROFILE_ENABLE profile_event_add(PROFILE_mpol_alloc_missed, npages * 4096); #endif - dkprintf("%s: couldn't fulfill user policy for %d pages\n", - __FUNCTION__, npages); + dkprintf("%s: couldn't fulfill user policy for %d pages from node %d\n", + __FUNCTION__, npages, i); } distance_based: @@ -926,6 +942,8 @@ static void query_free_mem_interrupt_handler(void *priv) /* Iterate memory allocators */ for (i = 0; i < ihk_mc_get_nr_numa_nodes(); ++i) { #ifdef IHK_RBTREE_ALLOCATOR + kprintf("McKernel free pages in NUMA node %d: %d\n", + i, memory_nodes[i].nr_free_pages); pages += memory_nodes[i].nr_free_pages; #else struct ihk_page_allocator_desc *pa_allocator; @@ -981,6 +999,8 @@ void coredump(struct thread *thread, void *regs) struct coretable *coretable; int chunks; + return; + #ifdef POSTK_DEBUG_ARCH_DEP_67 /* use limit corefile size. (temporarily fix.) */ if (thread->proc->rlimit[MCK_RLIMIT_CORE].rlim_cur == 0) { return; @@ -1189,6 +1209,7 @@ static void page_fault_handler(void *fault_addr, uint64_t reason, void *regs) if (!lptep || !pte_is_present(lptep)) { kprintf("%s: ERROR: no mapping in Linux for: 0x%lx?\n", __FUNCTION__, virt); + terminate(0, SIGKILL); goto regular_handler; } @@ -1210,7 +1231,7 @@ static void page_fault_handler(void *fault_addr, uint64_t reason, void *regs) } *ptep = *lptep; - kprintf("%s: Linux ioremap address 0x%lx -> 0x%lx " + dkprintf("%s: Linux ioremap address 0x%lx -> 0x%lx " "mapped on demand\n", __FUNCTION__, virt, phys); diff --git a/kernel/process.c b/kernel/process.c index 997ace65..4bdb9577 100644 --- a/kernel/process.c +++ b/kernel/process.c @@ -1015,6 +1015,10 @@ int free_process_memory_range(struct process_vm *vm, struct vm_range *range) if (vm->range_cache[i] == range) vm->range_cache[i] = NULL; } + +if (range->flag & VR_STACK) { + kprintf("%s: VR_STACK faulted_size: %lu\n", __FUNCTION__, range->faulted_size); +} kfree(range); dkprintf("free_process_memory_range(%p,%lx-%lx): 0\n", @@ -1230,6 +1234,9 @@ int add_process_memory_range(struct process_vm *vm, range->pgshift = pgshift; range->private_data = NULL; + range->lowest_accesed = end; + range->faulted_size = 0; + rc = 0; if (phys == NOPHYS) { /* Nothing to map */ @@ -1266,6 +1273,138 @@ int add_process_memory_range(struct process_vm *vm, return rc; } + /* + * Allocate and map physical memory, + * interpret NUMA policy. + * TODO: move out to a function.. + */ +if (flag & VR_PREALLOC && phys == NOPHYS) { + +#if 0 + unsigned long addr = start; + enum ihk_mc_pt_attribute ptattr; + ptattr = arch_vrflag_to_ptattr(range->flag, PF_POPULATE, NULL); + unsigned long irqflags; + unsigned long len = 0; + void *frame = NULL; + int npages; + int p2align; + + len = end - addr; + + /* Figure out size */ + if (len >= LARGE_PAGE_SIZE) { + p2align = LARGE_PAGE_P2ALIGN; + } + else { + p2align = PAGE_P2ALIGN; + } + npages = len >> PAGE_SHIFT; + + frame = ihk_mc_alloc_aligned_pages_user(npages, + p2align, + IHK_MC_AP_NOWAIT | (range->flag & VR_AP_USER ? IHK_MC_AP_USER : 0), + -1); + if (!frame) { + kprintf("%s: error: out of memory\n", __FUNCTION__); + panic("panic"); + return -ENOMEM; + } + + irqflags = ihk_mc_spinlock_lock(&vm->page_table_lock); + + rc = ihk_mc_pt_set_range(vm->address_space->page_table, + vm, + (void *)addr, + (void *)addr + len, + virt_to_phys(frame), + ptattr, + PAGE_SHIFT + p2align, + range); + + if (rc) { + kprintf("%s: ERROR: mapping\n", __FUNCTION__); + ihk_mc_spinlock_unlock(&vm->page_table_lock, irqflags); + return -ENOMEM; + } + + ihk_mc_spinlock_unlock(&vm->page_table_lock, irqflags); + + memset(frame, 0, len); + addr += len; + + +#else + unsigned long addr = start; + enum ihk_mc_pt_attribute ptattr; + ptattr = arch_vrflag_to_ptattr(range->flag, PF_POPULATE, NULL); + + while (addr < end) { + unsigned long irqflags; + unsigned long len = 0; + void *frame = NULL; + int npages; + int p2align; + + len = end - addr; + + /* Figure out size */ + if (len >= LARGE_PAGE_SIZE) { + len = LARGE_PAGE_SIZE; + p2align = LARGE_PAGE_P2ALIGN; + } + else { + len = PAGE_SIZE; + p2align = PAGE_P2ALIGN; + } + + npages = len >> PAGE_SHIFT; +#if 0 + frame = ihk_mc_alloc_aligned_pages_node_user(npages, + p2align, + IHK_MC_AP_NOWAIT | (range->flag & VR_AP_USER ? IHK_MC_AP_USER : 0), + node, -1); + node = 1 - node; +#else + frame = ihk_mc_alloc_aligned_pages_user(npages, + p2align, + IHK_MC_AP_NOWAIT | (range->flag & VR_AP_USER ? IHK_MC_AP_USER : 0), + -1); +#endif + if (!frame) { + kprintf("%s: error: out of memory\n", __FUNCTION__); + return -ENOMEM; + } + + irqflags = ihk_mc_spinlock_lock(&vm->page_table_lock); + + rc = ihk_mc_pt_set_range(vm->address_space->page_table, + vm, + (void *)addr, + (void *)addr + len, + virt_to_phys(frame), + ptattr, + PAGE_SHIFT + p2align, + range); + + if (rc) { + kprintf("%s: ERROR: mapping\n", __FUNCTION__); + ihk_mc_spinlock_unlock(&vm->page_table_lock, irqflags); + return -ENOMEM; + } + + ihk_mc_spinlock_unlock(&vm->page_table_lock, irqflags); + + memset(frame, 0, len); + addr += len; + } +#endif + dkprintf("%s: 0x%lx:%lu mapped\n", + __FUNCTION__, + start, + end - start); +} + /* Clear content! */ if (phys != NOPHYS && !(flag & (VR_REMOTE | VR_DEMAND_PAGING)) && ((flag & VR_PROT_MASK) != VR_PROT_NONE)) { @@ -1784,6 +1923,22 @@ static int page_fault_process_memory_range(struct process_vm *vm, struct vm_rang } pgaddr = (void *)(fault_addr & ~(pgsize - 1)); } + + if (pgsize > LARGE_PAGE_SIZE) { + dkprintf("%s: 0x%lx, pgsize: %lu\n", + __FUNCTION__, pgaddr, pgsize); + } + + if (range->flag & VR_STACK) { + range->faulted_size += pgsize; + + if (range->lowest_accesed > (unsigned long)pgaddr) { + dkprintf("%s: VR_STACK @ 0x%lx, pgsize: %lu, distance: %lu\n", + __FUNCTION__, pgaddr, pgsize, range->end - (unsigned long)pgaddr); + range->lowest_accesed = (unsigned long)pgaddr; + } + } + /*****/ dkprintf("%s: ptep=%lx,pte_is_null=%d,pte_is_fileoff=%d\n", __FUNCTION__, ptep, ptep ? pte_is_null(ptep) : -1, ptep ? pte_is_fileoff(ptep, pgsize) : -1); if (!ptep || pte_is_null(ptep) || pte_is_fileoff(ptep, pgsize)) { @@ -2155,6 +2310,8 @@ int init_process_stack(struct thread *thread, struct program_load_desc *pn, struct vm_range *range; int stack_populated_size = 0; int stack_align_padding = 0; + int p2align = LARGE_PAGE_P2ALIGN; + int pgshift = LARGE_PAGE_SHIFT; /* Create stack range */ end = STACK_TOP(&thread->vm->region) & LARGE_PAGE_MASK; @@ -2177,18 +2334,27 @@ int init_process_stack(struct thread *thread, struct program_load_desc *pn, else if (size < minsz) { size = minsz; } + +#if 0 + if (minsz >= GB_PAGE_SIZE) { + end = end & GB_PAGE_MASK; + p2align = GB_PAGE_P2ALIGN; + pgshift = GB_PAGE_SHIFT; + } +#endif + start = (end - size) & LARGE_PAGE_MASK; /* Apply user allocation policy to stacks */ /* TODO: make threshold kernel or mcexec argument */ ap_flag = (size >= proc->mpol_threshold && !(proc->mpol_flags & MPOL_NO_STACK)) ? IHK_MC_AP_USER : 0; - dkprintf("%s: max size: %lu, mapped size: %lu %s\n", - __FUNCTION__, size, minsz, + kprintf("%s: stack: 0x%lx-0x%lx:%lu, mapped: %lu %s\n", + __FUNCTION__, start, end, size, minsz, ap_flag ? "(IHK_MC_AP_USER)" : ""); stack = ihk_mc_alloc_aligned_pages_user(minsz >> PAGE_SHIFT, - LARGE_PAGE_P2ALIGN, IHK_MC_AP_NOWAIT | ap_flag, start); + p2align, IHK_MC_AP_NOWAIT | ap_flag, start); if (!stack) { kprintf("%s: error: couldn't allocate initial stack\n", @@ -2215,8 +2381,7 @@ int init_process_stack(struct thread *thread, struct program_load_desc *pn, thread->vm, (void *)(end - minsz), (void *)end, virt_to_phys(stack), arch_vrflag_to_ptattr(vrflag, PF_POPULATE, NULL), - LARGE_PAGE_SHIFT, range - ); + pgshift, range); if (error) { kprintf("init_process_stack:" diff --git a/kernel/syscall.c b/kernel/syscall.c index 89e6fd1e..74288851 100644 --- a/kernel/syscall.c +++ b/kernel/syscall.c @@ -1242,6 +1242,8 @@ interrupt_syscall(struct thread *thread, int sig) SYSCALL_DECLARE(exit_group) { dkprintf("sys_exit_group,pid=%d\n", cpu_local_var(current)->proc->pid); +dkprintf("%s: PID: %d, TID: %d\n", __FUNCTION__, + cpu_local_var(current)->proc->pid, cpu_local_var(current)->tid); terminate((int)ihk_mc_syscall_arg0(ctx), 0); return 0; @@ -1572,6 +1574,24 @@ do_mmap(const intptr_t addr0, const size_t len0, const int prot, vrflags |= VR_AP_USER; } +#if 1 + if (len < (unsigned long)4*1024*1024*1024) { + phys = NOPHYS; + vrflags |= VR_PREALLOC; + } + else { + kprintf("%s: big ANON mapping!!: %lu\n", __FUNCTION__, len); + /* Give demand paging a chance */ + vrflags |= VR_DEMAND_PAGING; + populated_mapping = 0; + error = zeroobj_create(&memobj); + if (error) { + ekprintf("%s: zeroobj_create failed, error: %d\n", + __FUNCTION__, error); + goto out; + } + } +#else p = ihk_mc_alloc_aligned_pages_user(npages, p2align, IHK_MC_AP_NOWAIT | ap_flag, addr0); if (p == NULL) { @@ -1603,6 +1623,7 @@ do_mmap(const intptr_t addr0, const size_t len0, const int prot, __FUNCTION__, addr, len, npages, p2align); phys = virt_to_phys(p); } +#endif } else if (flags & MAP_SHARED) { dkprintf("%s: MAP_SHARED,flags=%x,len=%ld\n", __FUNCTION__, flags, len); @@ -5530,6 +5551,7 @@ do_exit(int code) int sig = code & 255; dkprintf("sys_exit,pid=%d\n", proc->pid); +dkprintf("%s: PID: %d, TID: %d\n", __FUNCTION__, proc->pid, thread->tid); mcs_rwlock_reader_lock(&proc->threads_lock, &lock); nproc = 0; @@ -7813,6 +7835,10 @@ SYSCALL_DECLARE(mremap) uintptr_t lckstart = -1; uintptr_t lckend = -1; +/* Not for lammps for now.. */ +if (!strcmp("./lammps", thread->proc->saved_cmdline)) + return -ENOSYS; + dkprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx)\n", oldaddr, oldsize0, newsize0, flags, newaddr); ihk_mc_spinlock_lock_noirq(&vm->memory_range_lock); @@ -9535,6 +9561,10 @@ long syscall(int num, ihk_mc_user_context_t *ctx) } #endif // PROFILE_ENABLE + if (thread->proc->nohost) { // mcexec termination was detected + terminate(0, SIGKILL); + } + #if defined(POSTK_DEBUG_TEMP_FIX_60) && defined(POSTK_DEBUG_TEMP_FIX_56) check_need_resched(); #elif defined(POSTK_DEBUG_TEMP_FIX_60) /* sched_yield called check_signal fix. */ @@ -9562,9 +9592,6 @@ long syscall(int num, ihk_mc_user_context_t *ctx) #endif // DISABLE_SCHED_YIELD set_cputime(0); - if (thread->proc->nohost) { // mcexec termination was detected - terminate(0, SIGKILL); - } //kprintf("syscall=%d returns %lx(%ld)\n", num, l, l); return l; diff --git a/kernel/user_exp_rcv.c b/kernel/user_exp_rcv.c index 55b456ec..6aa69c39 100644 --- a/kernel/user_exp_rcv.c +++ b/kernel/user_exp_rcv.c @@ -110,8 +110,11 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, struct hfi1_tid_info *tinf #if 0 /* Verify that access is OK for the user buffer */ - if (access_ok(vm, VERIFY_WRITE, tinfo->vaddr, tinfo->length)) + if (access_ok(vm, VERIFY_WRITE, tinfo->vaddr, tinfo->length)) { + kprintf("%s: access_ok() failed for 0x%lx:%lu\n", + __FUNCTION__, tinfo->vaddr, tinfo->length); return -EFAULT; + } #endif vaddr_end = tinfo->vaddr + tinfo->length; diff --git a/lib/abort.c b/lib/abort.c index 1103a1af..5cca7df6 100644 --- a/lib/abort.c +++ b/lib/abort.c @@ -19,7 +19,7 @@ void panic(const char *msg) kprintf("%s\n", msg); - arch_print_stack(); + //arch_print_stack(); while (1) { cpu_halt();