diff --git a/arch/x86/kernel/memory.c b/arch/x86/kernel/memory.c
index 6fbe8ab4..7eb6b70d 100644
--- a/arch/x86/kernel/memory.c
+++ b/arch/x86/kernel/memory.c
@@ -2263,6 +2263,37 @@ int getint_user(int *dest, const int *p)
 	return 0;
 }
 
+int verify_process_vm(struct process_vm *vm,
+		const void *usrc, size_t size)
+{
+	const uintptr_t ustart = (uintptr_t)usrc;
+	const uintptr_t uend = ustart + size;
+	uint64_t reason;
+	uintptr_t addr;
+	int error = 0;
+
+	if ((ustart < vm->region.user_start)
+			|| (vm->region.user_end <= ustart)
+			|| ((vm->region.user_end - ustart) < size)) {
+		kprintf("%s: error: out of user range\n", __FUNCTION__);
+		return -EFAULT;
+	}
+
+	reason = PF_USER;	/* user-mode access to a not-present page */
+	for (addr = ustart & PAGE_MASK; addr < uend; addr += PAGE_SIZE) {
+		if (!addr)
+			return -EINVAL;
+
+		error = page_fault_process_vm(vm, (void *)addr, reason);
+		if (error) {
+			kprintf("%s: error: PF for %p failed\n", __FUNCTION__, (void *)addr);
+			return error;
+		}
+	}
+
+	return error;
+}
+
 int read_process_vm(struct process_vm *vm, void *kdst, const void *usrc, size_t siz)
 {
 	const uintptr_t ustart = (uintptr_t)usrc;
diff --git a/kernel/syscall.c b/kernel/syscall.c
index e949b6cf..4fc13452 100644
--- a/kernel/syscall.c
+++ b/kernel/syscall.c
@@ -8085,6 +8085,327 @@ SYSCALL_DECLARE(migrate_pages)
 	return -ENOSYS;
 } /* sys_migrate_pages() */
 
+struct move_pages_smp_req {
+	unsigned long count;
+	const void **user_virt_addr;
+	int *user_status;
+	const int *user_nodes;
+	void **virt_addr;
+	int *status;
+	pte_t **ptep;
+	int *nodes;
+	int nodes_ready;
+	int *nr_pages;
+	unsigned long *dst_phys;
+	struct process *proc;
+	ihk_atomic_t phase_done;
+	int phase_ret;
+};
+
+int move_pages_smp_handler(int cpu_index, int nr_cpus, void *arg)
+{
+	int i, i_s, i_e, phase = 1;
+	struct move_pages_smp_req *mpsr =
+		(struct move_pages_smp_req *)arg;
+	struct process_vm *vm = mpsr->proc->vm;
+	int count = mpsr->count;
+	struct page_table *save_pt;
+	extern struct page_table *get_init_page_table(void);
+
+	i_s = (count / nr_cpus) * cpu_index;
+	i_e = i_s + (count / nr_cpus);
+	if (cpu_index == (nr_cpus - 1)) {
+		i_e = count;
+	}
+
+	/* Load target process' PT so that we can access user-space */
+	save_pt = cpu_local_var(current) == &cpu_local_var(idle) ?
+		get_init_page_table() :
+		cpu_local_var(current)->vm->address_space->page_table;
+
+	if (save_pt != vm->address_space->page_table) {
+		ihk_mc_load_page_table(vm->address_space->page_table);
+	}
+	else {
+		save_pt = NULL;
+	}
+
+	if (nr_cpus == 1) {
+		switch (cpu_index) {
+		case 0:
+			memcpy(mpsr->virt_addr, mpsr->user_virt_addr,
+				sizeof(void *) * count);
+			memcpy(mpsr->status, mpsr->user_status,
+				sizeof(int) * count);
+			memcpy(mpsr->nodes, mpsr->user_nodes,
+				sizeof(int) * count);
+			memset(mpsr->ptep, 0, sizeof(pte_t *) * count);
+			memset(mpsr->status, 0, sizeof(int) * count);
+			memset(mpsr->nr_pages, 0, sizeof(int) * count);
+			memset(mpsr->dst_phys, 0,
+				sizeof(unsigned long) * count);
+			mpsr->nodes_ready = 1;
+			break;
+
+		default:
+			break;
+		}
+	}
+	else if (nr_cpus > 1 && nr_cpus < 4) {
+		switch (cpu_index) {
+		case 0:
+			memcpy(mpsr->virt_addr, mpsr->user_virt_addr,
+				sizeof(void *) * count);
+			memcpy(mpsr->status, mpsr->user_status,
+				sizeof(int) * count);
+			memset(mpsr->status, 0, sizeof(int) * count);
+			break;
+		case 1:
+			memcpy(mpsr->nodes, mpsr->user_nodes,
+				sizeof(int) * count);
+			memset(mpsr->ptep, 0, sizeof(pte_t *) * count);
+			memset(mpsr->nr_pages, 0, sizeof(int) * count);
+			memset(mpsr->dst_phys, 0,
+				sizeof(unsigned long) * count);
+			mpsr->nodes_ready = 1;
+			break;
+
+		default:
+			break;
+		}
+	}
+	else if (nr_cpus >= 4) {
+		switch (cpu_index) {
+		case 0:
+			memcpy(mpsr->virt_addr, mpsr->user_virt_addr,
+				sizeof(void *) * count);
+			break;
+		case 1:
+			memcpy(mpsr->status, mpsr->user_status,
+				sizeof(int) * count);
+			memset(mpsr->status, 0, sizeof(int) * count);
+			break;
+		case 2:
+			memcpy(mpsr->nodes, mpsr->user_nodes,
+				sizeof(int) * count);
+			mpsr->nodes_ready = 1;
+			break;
+		case 3:
+			memset(mpsr->ptep, 0, sizeof(pte_t *) * count);
+			memset(mpsr->nr_pages, 0, sizeof(int) * count);
+			memset(mpsr->dst_phys, 0,
+				sizeof(unsigned long) * count);
+			break;
+
+		default:
+			break;
+		}
+	}
+
+	while (!*(volatile int *)&mpsr->nodes_ready) {
+		cpu_pause();
+	}
+
+	/* NUMA verification in parallel */
+	for (i = i_s; i < i_e; i++) {
+		if (mpsr->nodes[i] < 0 ||
+			mpsr->nodes[i] >= ihk_mc_get_nr_numa_nodes() ||
+			!test_bit(mpsr->nodes[i],
+				mpsr->proc->vm->numa_mask)) {
+			mpsr->phase_ret = -EINVAL;
+			break;
+		}
+	}
+
+	/* Barrier */
+	ihk_atomic_inc(&mpsr->phase_done);
+	while (ihk_atomic_read(&mpsr->phase_done) !=
+			(phase * nr_cpus)) {
+		cpu_pause();
+	}
+
+	if (mpsr->phase_ret != 0) {
+		goto out;
+	}
+
+	dkprintf("%s: phase %d done\n", __FUNCTION__, phase);
+	++phase;
+
+	/* PTE lookup in parallel.
+	 * XXX: a large page spanning two CPUs' index ranges is not handled */
+	for (i = i_s; i < i_e; i++) {
+		void *phys;
+		size_t pgsize;
+		int p2align;
+		/*
+		 * XXX: No page structures for anonymous mappings.
+		 * Look up physical addresses by scanning page tables.
+		 */
+		mpsr->ptep[i] = ihk_mc_pt_lookup_pte(vm->address_space->page_table,
+			(void *)mpsr->virt_addr[i], 0, &phys, &pgsize, &p2align);
+
+		/* PTE valid? */
+		if (!mpsr->ptep[i] || !pte_is_present(mpsr->ptep[i])) {
+			mpsr->status[i] = -ENOENT;
+			mpsr->ptep[i] = NULL;
+			continue;
+		}
+
+		/* PTE is file? */
+		if (pte_is_fileoff(mpsr->ptep[i], PAGE_SIZE)) {
+			mpsr->status[i] = -EINVAL;
+			mpsr->ptep[i] = NULL;
+			continue;
+		}
+
+		dkprintf("%s: virt 0x%lx:%lu requested to be moved to node %d\n",
+			__FUNCTION__, mpsr->virt_addr[i], pgsize, mpsr->nodes[i]);
+
+		/* Large page? */
+		if (pgsize > PAGE_SIZE) {
+			int nr_sub_pages = (pgsize / PAGE_SIZE);
+			int j;
+
+			if (i + nr_sub_pages > count) {
+				kprintf("%s: ERROR: page at index %d exceeds the region\n",
+					__FUNCTION__, i);
+				mpsr->status[i] = -EINVAL;
+				break;
+			}
+
+			/* Is it contiguous across nr_sub_pages and all
+			 * requested to be moved to the same target node? */
+			for (j = 0; j < nr_sub_pages; ++j) {
+				if (mpsr->virt_addr[i + j] !=
+					(mpsr->virt_addr[i] + (j * PAGE_SIZE)) ||
+					mpsr->nodes[i] != mpsr->nodes[i + j]) {
+					kprintf("%s: ERROR: virt address or node at index %d"
+						" is inconsistent\n",
+						__FUNCTION__, i + j);
+					mpsr->phase_ret = -EINVAL;
+					goto pte_out;
+				}
+			}
+
+			mpsr->nr_pages[i] = nr_sub_pages;
+			i += (nr_sub_pages - 1);
+		}
+		else {
+			mpsr->nr_pages[i] = 1;
+		}
+	}
+
+pte_out:
+	/* Barrier */
+	ihk_atomic_inc(&mpsr->phase_done);
+	while (ihk_atomic_read(&mpsr->phase_done) !=
+			(phase * nr_cpus)) {
+		cpu_pause();
+	}
+
+	if (mpsr->phase_ret != 0) {
+		goto out;
+	}
+
+	dkprintf("%s: phase %d done\n", __FUNCTION__, phase);
+	++phase;
+
+	if (cpu_index == 0) {
+		/* Allocate new pages on target NUMA nodes */
+		for (i = 0; i < count; i++) {
+			int pgalign = 0;
+			int j;
+			void *dst;
+
+			if (!mpsr->ptep[i] || mpsr->status[i] < 0 || !mpsr->nr_pages[i])
+				continue;
+
+			/* TODO: store pgalign info in an array as well? */
+			if (mpsr->nr_pages[i] > 1) {
+				if (mpsr->nr_pages[i] * PAGE_SIZE == PTL2_SIZE)
+					pgalign = PTL2_SHIFT - PTL1_SHIFT;
+			}
+
+			dst = ihk_mc_alloc_aligned_pages_node(mpsr->nr_pages[i],
+					pgalign, IHK_MC_AP_USER, mpsr->nodes[i]);
+
+			if (!dst) {
+				mpsr->status[i] = -ENOMEM;
+				continue;
+			}
+
+			for (j = i; j < (i + mpsr->nr_pages[i]); ++j) {
+				mpsr->status[j] = mpsr->nodes[i];
+			}
+
+			mpsr->dst_phys[i] = virt_to_phys(dst);
+
+			dkprintf("%s: virt 0x%lx:%lu to node %d, pgalign: %d,"
+					" allocated phys: 0x%lx\n",
+					__FUNCTION__, mpsr->virt_addr[i],
+					mpsr->nr_pages[i] * PAGE_SIZE,
+					mpsr->nodes[i], pgalign, mpsr->dst_phys[i]);
+		}
+	}
+
+	/* Barrier */
+	ihk_atomic_inc(&mpsr->phase_done);
+	while (ihk_atomic_read(&mpsr->phase_done) !=
+			(phase * nr_cpus)) {
+		cpu_pause();
+	}
+
+	if (mpsr->phase_ret != 0) {
+		goto out;
+	}
+
+	dkprintf("%s: phase %d done\n", __FUNCTION__, phase);
+	++phase;
+
+	/* Copy, PTE update, memfree in parallel */
+	for (i = i_s; i < i_e; ++i) {
+		unsigned long old_phys;
+
+		if (!mpsr->dst_phys[i])
+			continue;
+
+		old_phys = pte_get_phys(mpsr->ptep[i]);
+		fast_memcpy(phys_to_virt(mpsr->dst_phys[i]),
+				phys_to_virt(old_phys),
+				mpsr->nr_pages[i] * PAGE_SIZE);
+
+		pte_update_phys(mpsr->ptep[i], mpsr->dst_phys[i]);
+
+		/* Free the original pages only after the PTE was updated */
+		ihk_mc_free_pages(phys_to_virt(old_phys),
+				mpsr->nr_pages[i]);
+
+		dkprintf("%s: virt 0x%lx:%lu copied and remapped to phys: 0x%lx\n",
+				__FUNCTION__, mpsr->virt_addr[i],
+				mpsr->nr_pages[i] * PAGE_SIZE,
+				mpsr->dst_phys[i]);
+	}
+
+	/* XXX: do a separate SMP call with only CPUs running threads
+	 * of this process? */
+	if (cpu_local_var(current)->proc == mpsr->proc) {
+		/* Invalidate this CPU's TLB entries for the moved pages */
+		for (i = 0; i < mpsr->count; i++) {
+			if (!mpsr->dst_phys[i])
+				continue;
+
+			flush_tlb_single((unsigned long)mpsr->virt_addr[i]);
+		}
+	}
+
+out:
+	if (save_pt) {
+		ihk_mc_load_page_table(save_pt);
+	}
+
+	return mpsr->phase_ret;
+}
+
 SYSCALL_DECLARE(move_pages)
 {
 	int pid = ihk_mc_syscall_arg0(ctx);
@@ -8099,9 +8420,14 @@ SYSCALL_DECLARE(move_pages)
 	int *nr_pages = NULL;
 	unsigned long *dst_phys = NULL;
 	pte_t **ptep = NULL;
+	struct move_pages_smp_req mpsr;
 	struct process_vm *vm = cpu_local_var(current)->vm;
 
-	int i, ret = 0;
+	int ret = 0;
+
+	unsigned long t_s, t_e;
+
+	t_s = rdtsc();
 
 	/* Only self is supported for now */
 	if (pid) {
@@ -8152,8 +8478,33 @@ SYSCALL_DECLARE(move_pages)
 		ret = -ENOMEM;
 		goto dealloc_out;
 	}
+	t_e = rdtsc(); kprintf("%s: init malloc: %lu\n", __FUNCTION__, t_e - t_s); t_s = t_e;
 
 	/* Get virt addresses and NUMA node numbers from user */
+	if (verify_process_vm(cpu_local_var(current)->vm,
+				user_virt_addr, sizeof(void *) * count)) {
+		ret = -EFAULT;
+		goto dealloc_out;
+	}
+
+	if (verify_process_vm(cpu_local_var(current)->vm,
+				user_nodes, sizeof(int) * count)) {
+		ret = -EFAULT;
+		goto dealloc_out;
+	}
+
+	if (verify_process_vm(cpu_local_var(current)->vm,
+				user_status, sizeof(int) * count)) {
+		ret = -EFAULT;
+		goto dealloc_out;
+	}
+	t_e = rdtsc(); kprintf("%s: init verify: %lu\n", __FUNCTION__, t_e - t_s); t_s = t_e;
+
+#if 0
+	memcpy(virt_addr, user_virt_addr, sizeof(void *) * count);
+	memcpy(status, user_status, sizeof(int) * count);
+	memcpy(nodes, user_nodes, sizeof(int) * count);
+
 	if (copy_from_user(virt_addr, user_virt_addr,
 				sizeof(void *) * count)) {
 		ret = -EFAULT;
@@ -8172,6 +8523,8 @@ SYSCALL_DECLARE(move_pages)
 		goto dealloc_out;
 	}
 
+	t_e = rdtsc(); kprintf("%s: init copy: %lu\n", __FUNCTION__, t_e - t_s); t_s = t_e;
+
 	/* Verify target NUMA nodes are valid */
 	for (i = 0; i < count; i++) {
 		if (nodes[i] < 0 ||
@@ -8182,145 +8535,45 @@ SYSCALL_DECLARE(move_pages)
 		}
 	}
 
+	t_e = rdtsc(); kprintf("%s: init NUMAver: %lu\n", __FUNCTION__, t_e - t_s); t_s = t_e;
+
 	memset(ptep, 0, sizeof(pte_t) * count);
 	memset(status, 0, sizeof(int) * count);
 	memset(nr_pages, 0, sizeof(int) * count);
+	memset(dst_phys, 0, sizeof(unsigned long) * count);
+
+	t_e = rdtsc(); kprintf("%s: init memset: %lu\n", __FUNCTION__, t_e - t_s); t_s = t_e;
+#endif
 
 	ihk_mc_spinlock_lock_noirq(&vm->page_table_lock);
 
-	for (i = 0; i < count; i++) {
-		void *phys;
-		size_t pgsize;
-		int p2align;
-		/*
-		 * XXX: No page structures for anonymous mappings.
-		 * Look up physical addresses by scanning page tables.
-		 */
-		ptep[i] = ihk_mc_pt_lookup_pte(vm->address_space->page_table,
-				(void *)virt_addr[i], 0, &phys, &pgsize, &p2align);
-
-		/* PTE valid? */
-		if (!ptep[i] || !pte_is_present(ptep[i])) {
-			status[i] = -ENOENT;
-			ptep[i] = NULL;
-			continue;
-		}
-
-		/* PTE is file? */
-		if (pte_is_fileoff(ptep[i], PAGE_SIZE)) {
-			status[i] = -EINVAL;
-			ptep[i] = NULL;
-			continue;
-		}
-
-		dkprintf("%s: virt 0x%lx:%lu requested to be moved to node %d\n",
-				__FUNCTION__, virt_addr[i], pgsize, nodes[i]);
-
-		/* Large page? */
-		if (pgsize > PAGE_SIZE) {
-			int nr_sub_pages = (pgsize / PAGE_SIZE);
-			int j;
-
-			if (i + nr_sub_pages > count) {
-				kprintf("%s: ERROR: page at index %d exceeds the region\n",
-						__FUNCTION__, i);
-				status[i] = -EINVAL;
-				break;
-			}
-
-			/* Is it contiguous across nr_sub_pages and all
-			 * requested to be moved to the same target node? */
-			for (j = 0; j < nr_sub_pages; ++j) {
-				if (virt_addr[i + j] != (virt_addr[i] + (j * PAGE_SIZE)) ||
-					nodes[i] != nodes[i + j]) {
-					kprintf("%s: ERROR: virt address or node at index %d"
-						" is inconsistent\n",
-						__FUNCTION__, i + j);
-					ret = -EINVAL;
-
-					ihk_mc_spinlock_unlock_noirq(&vm->page_table_lock);
-					goto dealloc_out;
-				}
-			}
-
-			nr_pages[i] = nr_sub_pages;
-			i += (nr_sub_pages - 1);
-		}
-		else {
-			nr_pages[i] = 1;
-		}
-	}
-
-	memset(dst_phys, 0, sizeof(unsigned long) * count);
-
-	/* Allocate new pages on target NUMA nodes */
-	for (i = 0; i < count; i++) {
-		int pgalign = 0;
-		int j;
-		void *dst;
-
-		if (!ptep[i] || status[i] < 0 || !nr_pages[i])
-			continue;
-
-		/* TODO: store pgalign info in an array as well? */
-		if (nr_pages[i] > 1) {
-			if (nr_pages[i] * PAGE_SIZE == PTL2_SIZE)
-				pgalign = PTL2_SHIFT - PTL1_SHIFT;
-		}
-
-		dst = ihk_mc_alloc_aligned_pages_node(nr_pages[i],
-				pgalign, IHK_MC_AP_USER, nodes[i]);
-
-		if (!dst) {
-			status[i] = -ENOMEM;
-			continue;
-		}
-
-		for (j = i; j < (i + nr_pages[i]); ++j) {
-			status[j] = nodes[i];
-		}
-
-		dst_phys[i] = virt_to_phys(dst);
-
-		dkprintf("%s: virt 0x%lx:%lu to node %d, pgalign: %d,"
-				" allocated phys: 0x%lx\n",
-				__FUNCTION__, virt_addr[i],
-				nr_pages[i] * PAGE_SIZE,
-				nodes[i], pgalign, dst_phys[i]);
-	}
-
-	/* Do the copy, update PTEs, free original memory */
-	/* TODO: do this in parallel */
-	for (i = 0; i < count; i++) {
-		if (!dst_phys[i])
-			continue;
-
-		fast_memcpy(phys_to_virt(dst_phys[i]),
-				phys_to_virt(pte_get_phys(ptep[i])),
-				nr_pages[i] * PAGE_SIZE);
-
-		ihk_mc_free_pages(phys_to_virt(pte_get_phys(ptep[i])),
-				nr_pages[i]);
-
-		pte_update_phys(ptep[i], dst_phys[i]);
-
-		dkprintf("%s: virt 0x%lx:%lu copied and remapped to phys: 0x%lu\n",
-				__FUNCTION__, virt_addr[i],
-				nr_pages[i] * PAGE_SIZE,
-				dst_phys[i]);
-	}
-
-	/* Invalidate TLBs */
-	/* TODO: do this on all cores in parallel */
-	for (i = 0; i < count; i++) {
-		if (!dst_phys[i])
-			continue;
-
-		flush_tlb_single((unsigned long)virt_addr[i]);
-	}
+	/* Do the arg init, NUMA verification, copy,
+	 * update PTEs, free original memory */
+	mpsr.count = count;
+	mpsr.user_virt_addr = user_virt_addr;
+	mpsr.user_status = user_status;
+	mpsr.user_nodes = user_nodes;
+	mpsr.virt_addr = virt_addr;
+	mpsr.status = status;
+	mpsr.nodes = nodes;
+	mpsr.nodes_ready = 0;
+	mpsr.ptep = ptep;
+	mpsr.dst_phys = dst_phys;
+	mpsr.nr_pages = nr_pages;
+	mpsr.proc = cpu_local_var(current)->proc;
+	ihk_atomic_set(&mpsr.phase_done, 0);
+	mpsr.phase_ret = 0;
+
+	ret = smp_call_func(&cpu_local_var(current)->cpu_set,
+			move_pages_smp_handler, &mpsr);
 
 	ihk_mc_spinlock_unlock_noirq(&vm->page_table_lock);
 
+	if (ret != 0) {
+		goto dealloc_out;
+	}
+
+	t_e = rdtsc(); kprintf("%s: parallel: %lu\n", __FUNCTION__, t_e - t_s); t_s = t_e;
+
 	/* This shouldn't fail (verified above) */
 	if (copy_to_user(user_status, status, sizeof(int) * count)) {
 		ret = -EFAULT;
diff --git a/lib/include/memory.h b/lib/include/memory.h
index edeaa080..7bb380bc 100644
--- a/lib/include/memory.h
+++ b/lib/include/memory.h
@@ -37,6 +37,8 @@ int strlen_user(const char *s);
 int strcpy_from_user(char *dst, const char *src);
 long getlong_user(long *dest, const long *p);
 int getint_user(int *dest, const int *p);
+int verify_process_vm(struct process_vm *vm,
+		const void *usrc, size_t size);
 int read_process_vm(struct process_vm *vm, void *kdst, const void *usrc, size_t siz);
 int copy_to_user(void *dst, const void *src, size_t siz);
 int setlong_user(long *dst, long data);
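
---

Test sketch (illustrative, not included in this patch): the harness below drives sys_move_pages on the calling process, the only mode the implementation accepts (pid must be 0). It assumes the Linux-compatible move_pages syscall number from <sys/syscall.h> and the MPOL_MF_MOVE flag value from <numaif.h>; NR_PAGES and the target-node argument are arbitrary choices. Touching every page before the call matters: the handler reports -ENOENT for pages with no present PTE, since verify_process_vm() only faults in the three request arrays, not the pages being moved.

/* move_pages_test.c: illustrative harness, not part of this patch.
 * Build: gcc -O2 -o move_pages_test move_pages_test.c
 * Run:   ./move_pages_test <target NUMA node>
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>

#ifndef MPOL_MF_MOVE
#define MPOL_MF_MOVE	2	/* value from <numaif.h> */
#endif

#define NR_PAGES	1024UL

int main(int argc, char **argv)
{
	long page_size = sysconf(_SC_PAGESIZE);
	int target_node = (argc > 1) ? atoi(argv[1]) : 0;
	void **pages;
	int *nodes, *status;
	unsigned long i;
	char *buf;
	long ret;

	buf = mmap(NULL, NR_PAGES * page_size, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	pages = malloc(NR_PAGES * sizeof(*pages));
	nodes = malloc(NR_PAGES * sizeof(*nodes));
	status = malloc(NR_PAGES * sizeof(*status));
	if (!pages || !nodes || !status)
		return 1;

	for (i = 0; i < NR_PAGES; i++) {
		/* Touch the page so a PTE exists; pages never faulted
		 * in come back with status -ENOENT */
		buf[i * page_size] = 1;
		pages[i] = buf + i * page_size;
		nodes[i] = target_node;
	}

	/* pid == 0: operate on the calling process */
	ret = syscall(SYS_move_pages, 0, NR_PAGES, pages, nodes, status,
			MPOL_MF_MOVE);
	if (ret < 0) {
		perror("move_pages");
		return 1;
	}

	/* On success, status[i] holds the node the page now resides on,
	 * or a negative errno for that particular page */
	printf("page 0 -> %d, page %lu -> %d\n",
			status[0], NR_PAGES - 1, status[NR_PAGES - 1]);
	return 0;
}

Mapping the region with a 2MB-aligned size and address also exercises the large-page path, where the handler coalesces nr_sub_pages contiguous entries that target the same node into a single allocation.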