move_pages(): parallel implementation v1

commit 4c0f401424
parent 06f824c829
Author: Balazs Gerofi
Date: 2017-05-12 09:28:52 +09:00

3 changed files with 412 additions and 131 deletions
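The handler added below divides the page array into per-CPU index blocks and walks through its phases (argument staging, NUMA verification, PTE lookup, allocation on the target node, copy/remap) separated by counting barriers. A minimal sketch of that barrier idiom, using only the IHK primitives that appear in the diff:

/* Counting barrier: each CPU increments the shared counter and spins
 * until all nr_cpus participants have reached the current phase. */
ihk_atomic_inc(&mpsr->phase_done);
while (ihk_atomic_read(&mpsr->phase_done) != (phase * nr_cpus)) {
	cpu_pause();
}
++phase; /* the counter is monotonic; barrier N waits for N * nr_cpus */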


@ -2263,6 +2263,37 @@ int getint_user(int *dest, const int *p)
return 0;
}
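/*
 * Pre-fault a user-space range: verify it lies within the user region
 * and touch every page so that subsequent direct accesses cannot fault.
 */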
int verify_process_vm(struct process_vm *vm,
const void *usrc, size_t size)
{
const uintptr_t ustart = (uintptr_t)usrc;
const uintptr_t uend = ustart + size;
uint64_t reason;
uintptr_t addr;
int error = 0;
if ((ustart < vm->region.user_start)
|| (vm->region.user_end <= ustart)
|| ((vm->region.user_end - ustart) < size)) {
kprintf("%s: error: out of user range\n", __FUNCTION__);
return -EFAULT;
}
reason = PF_USER; /* page not present */
for (addr = ustart & PAGE_MASK; addr < uend; addr += PAGE_SIZE) {
if (!addr)
return -EINVAL;
error = page_fault_process_vm(vm, (void *)addr, reason);
if (error) {
kprintf("%s: error: PF for %p failed\n", __FUNCTION__, addr);
return error;
}
}
return error;
}
int read_process_vm(struct process_vm *vm, void *kdst, const void *usrc, size_t siz)
{
const uintptr_t ustart = (uintptr_t)usrc;


@ -8085,6 +8085,322 @@ SYSCALL_DECLARE(migrate_pages)
return -ENOSYS;
} /* sys_migrate_pages() */
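/* Shared argument block for move_pages_smp_handler(): filled in by
 * sys_move_pages() and handed to all CPUs via smp_call_func(). */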
struct move_pages_smp_req {
unsigned long count;
const void **user_virt_addr;
int *user_status;
const int *user_nodes;
void **virt_addr;
int *status;
pte_t **ptep;
int *nodes;
int nodes_ready;
int *nr_pages;
unsigned long *dst_phys;
struct process *proc;
ihk_atomic_t phase_done;
int phase_ret;
};
int move_pages_smp_handler(int cpu_index, int nr_cpus, void *arg)
{
int i, i_s, i_e, phase = 1;
struct move_pages_smp_req *mpsr =
(struct move_pages_smp_req *)arg;
struct process_vm *vm = mpsr->proc->vm;
int count = mpsr->count;
struct page_table *save_pt;
extern struct page_table *get_init_page_table(void);
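/* Each CPU works on a contiguous block of count / nr_cpus entries;
 * the last CPU also picks up the remainder. */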
i_s = (count / nr_cpus) * cpu_index;
i_e = i_s + (count / nr_cpus);
if (cpu_index == (nr_cpus - 1)) {
i_e = count;
}
/* Load target process' PT so that we can access user-space */
save_pt = cpu_local_var(current) == &cpu_local_var(idle) ?
get_init_page_table() :
cpu_local_var(current)->vm->address_space->page_table;
if (save_pt != vm->address_space->page_table) {
ihk_mc_load_page_table(vm->address_space->page_table);
}
else {
save_pt = NULL;
}
if (nr_cpus == 1) {
switch (cpu_index) {
case 0:
memcpy(mpsr->virt_addr, mpsr->user_virt_addr,
sizeof(void *) * count);
memcpy(mpsr->status, mpsr->user_status,
sizeof(int) * count);
memcpy(mpsr->nodes, mpsr->user_nodes,
sizeof(int) * count);
memset(mpsr->ptep, 0, sizeof(pte_t *) * count);
memset(mpsr->status, 0, sizeof(int) * count);
memset(mpsr->nr_pages, 0, sizeof(int) * count);
memset(mpsr->dst_phys, 0,
sizeof(unsigned long) * count);
mpsr->nodes_ready = 1;
break;
default:
break;
}
}
else if (nr_cpus > 1 && nr_cpus < 4) {
switch (cpu_index) {
case 0:
memcpy(mpsr->virt_addr, mpsr->user_virt_addr,
sizeof(void *) * count);
memcpy(mpsr->status, mpsr->user_status,
sizeof(int) * count);
	break;
case 1:
memcpy(mpsr->nodes, mpsr->user_nodes,
sizeof(int) * count);
memset(mpsr->ptep, 0, sizeof(pte_t *) * count);
memset(mpsr->status, 0, sizeof(int) * count);
memset(mpsr->nr_pages, 0, sizeof(int) * count);
memset(mpsr->dst_phys, 0,
sizeof(unsigned long) * count);
mpsr->nodes_ready = 1;
break;
default:
break;
}
}
else if (nr_cpus >= 4) {
switch (cpu_index) {
case 0:
memcpy(mpsr->virt_addr, mpsr->user_virt_addr,
sizeof(void *) * count);
break;
case 1:
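	/* XXX: status is an output array; copying user_status here
	 * races with the memset on CPU 3 and looks redundant. */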
memcpy(mpsr->status, mpsr->user_status,
sizeof(int) * count);
break;
case 2:
memcpy(mpsr->nodes, mpsr->user_nodes,
sizeof(int) * count);
mpsr->nodes_ready = 1;
break;
case 3:
memset(mpsr->ptep, 0, sizeof(pte_t *) * count);
memset(mpsr->status, 0, sizeof(int) * count);
memset(mpsr->nr_pages, 0, sizeof(int) * count);
memset(mpsr->dst_phys, 0,
sizeof(unsigned long) * count);
break;
default:
break;
}
}
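/* Wait until the target node array has been staged */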
while (!*(volatile int *)&mpsr->nodes_ready) {
cpu_pause();
}
/* NUMA verification in parallel */
for (i = i_s; i < i_e; i++) {
if (mpsr->nodes[i] < 0 ||
mpsr->nodes[i] >= ihk_mc_get_nr_numa_nodes() ||
!test_bit(mpsr->nodes[i],
mpsr->proc->vm->numa_mask)) {
mpsr->phase_ret = -EINVAL;
break;
}
}
/* Barrier */
ihk_atomic_inc(&mpsr->phase_done);
while (ihk_atomic_read(&mpsr->phase_done) !=
(phase * nr_cpus)) {
cpu_pause();
}
if (mpsr->phase_ret != 0) {
goto out;
}
kprintf("%s: phase %d done\n", __FUNCTION__, phase);
++phase;
/* PTE lookup in parallel */
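/* XXX: per-CPU blocks are not aligned to large-page extents; a large
 * page straddling a block boundary is examined by both neighbors. */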
for (i = i_s; i < i_e; i++) {
void *phys;
size_t pgsize;
int p2align;
/*
* XXX: No page structures for anonymous mappings.
* Look up physical addresses by scanning page tables.
*/
mpsr->ptep[i] = ihk_mc_pt_lookup_pte(vm->address_space->page_table,
(void *)mpsr->virt_addr[i], 0, &phys, &pgsize, &p2align);
/* PTE valid? */
if (!mpsr->ptep[i] || !pte_is_present(mpsr->ptep[i])) {
mpsr->status[i] = -ENOENT;
mpsr->ptep[i] = NULL;
continue;
}
/* PTE is file? */
if (pte_is_fileoff(mpsr->ptep[i], PAGE_SIZE)) {
mpsr->status[i] = -EINVAL;
mpsr->ptep[i] = NULL;
continue;
}
dkprintf("%s: virt 0x%lx:%lu requested to be moved to node %d\n",
__FUNCTION__, mpsr->virt_addr[i], pgsize, mpsr->nodes[i]);
/* Large page? */
if (pgsize > PAGE_SIZE) {
int nr_sub_pages = (pgsize / PAGE_SIZE);
int j;
if (i + nr_sub_pages > count) {
kprintf("%s: ERROR: page at index %d exceeds the region\n",
__FUNCTION__, i);
mpsr->status[i] = -EINVAL;
break;
}
/* Is it contiguous across nr_sub_pages and all
* requested to be moved to the same target node? */
for (j = 0; j < nr_sub_pages; ++j) {
if (mpsr->virt_addr[i + j] !=
(mpsr->virt_addr[i] + (j * PAGE_SIZE)) ||
mpsr->nodes[i] != mpsr->nodes[i + j]) {
kprintf("%s: ERROR: virt address or node at index %d"
" is inconsistent\n",
__FUNCTION__, i + j);
mpsr->phase_ret = -EINVAL;
goto pte_out;
}
}
mpsr->nr_pages[i] = nr_sub_pages;
i += (nr_sub_pages - 1);
}
else {
mpsr->nr_pages[i] = 1;
}
}
pte_out:
/* Barrier */
ihk_atomic_inc(&mpsr->phase_done);
while (ihk_atomic_read(&mpsr->phase_done) !=
(phase * nr_cpus)) {
cpu_pause();
}
if (mpsr->phase_ret != 0) {
goto out;
}
kprintf("%s: phase %d done\n", __FUNCTION__, phase);
++phase;
if (cpu_index == 0) {
/* Allocate new pages on target NUMA nodes */
for (i = 0; i < count; i++) {
int pgalign = 0;
int j;
void *dst;
if (!mpsr->ptep[i] || mpsr->status[i] < 0 || !mpsr->nr_pages[i])
continue;
/* TODO: store pgalign info in an array as well? */
if (mpsr->nr_pages[i] > 1) {
if (mpsr->nr_pages[i] * PAGE_SIZE == PTL2_SIZE)
pgalign = PTL2_SHIFT - PTL1_SHIFT;
}
dst = ihk_mc_alloc_aligned_pages_node(mpsr->nr_pages[i],
pgalign, IHK_MC_AP_USER, mpsr->nodes[i]);
if (!dst) {
mpsr->status[i] = -ENOMEM;
continue;
}
for (j = i; j < (i + mpsr->nr_pages[i]); ++j) {
mpsr->status[j] = mpsr->nodes[i];
}
mpsr->dst_phys[i] = virt_to_phys(dst);
dkprintf("%s: virt 0x%lx:%lu to node %d, pgalign: %d,"
" allocated phys: 0x%lx\n",
__FUNCTION__, mpsr->virt_addr[i],
mpsr->nr_pages[i] * PAGE_SIZE,
mpsr->nodes[i], pgalign, mpsr->dst_phys[i]);
}
}
/* Barrier */
ihk_atomic_inc(&mpsr->phase_done);
while (ihk_atomic_read(&mpsr->phase_done) !=
(phase * nr_cpus)) {
cpu_pause();
}
if (mpsr->phase_ret != 0) {
goto out;
}
kprintf("%s: phase %d done\n", __FUNCTION__, phase);
++phase;
/* Copy, PTE update, memfree in parallel */
for (i = i_s; i < i_e; ++i) {
	unsigned long old_phys;

	if (!mpsr->dst_phys[i])
		continue;

	old_phys = pte_get_phys(mpsr->ptep[i]);
	fast_memcpy(phys_to_virt(mpsr->dst_phys[i]),
		phys_to_virt(old_phys),
		mpsr->nr_pages[i] * PAGE_SIZE);
	/* Remap first so the PTE never points at freed memory */
	pte_update_phys(mpsr->ptep[i], mpsr->dst_phys[i]);
	ihk_mc_free_pages(phys_to_virt(old_phys),
		mpsr->nr_pages[i]);
	dkprintf("%s: virt 0x%lx:%lu copied and remapped to phys: 0x%lx\n",
		__FUNCTION__, mpsr->virt_addr[i],
		mpsr->nr_pages[i] * PAGE_SIZE,
		mpsr->dst_phys[i]);
}
}
/* XXX: do a separate SMP call with only CPUs running threads
* of this process? */
if (cpu_local_var(current)->proc == mpsr->proc) {
/* Invalidate all TLBs */
for (i = 0; i < mpsr->count; i++) {
if (!mpsr->dst_phys[i])
continue;
flush_tlb_single((unsigned long)mpsr->virt_addr[i]);
}
}
out:
if (save_pt) {
ihk_mc_load_page_table(save_pt);
}
return mpsr->phase_ret;
}
SYSCALL_DECLARE(move_pages)
{
int pid = ihk_mc_syscall_arg0(ctx);
@ -8099,9 +8415,14 @@ SYSCALL_DECLARE(move_pages)
int *nr_pages = NULL;
unsigned long *dst_phys = NULL;
pte_t **ptep = NULL;
struct move_pages_smp_req mpsr;
struct process_vm *vm = cpu_local_var(current)->vm;
int i, ret = 0;
int ret = 0;
unsigned long t_s, t_e;
t_s = rdtsc();
/* Only self is supported for now */
if (pid) {
@ -8152,8 +8473,33 @@ SYSCALL_DECLARE(move_pages)
ret = -ENOMEM;
goto dealloc_out;
}
t_e = rdtsc(); kprintf("%s: init malloc: %lu \n", __FUNCTION__, t_e - t_s); t_s = t_e;
/* Get virt addresses and NUMA node numbers from user */
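/* (pre-faulting lets move_pages_smp_handler() memcpy() these
 * arrays directly instead of using copy_from_user()) */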
if (verify_process_vm(cpu_local_var(current)->vm,
user_virt_addr, sizeof(void *) * count)) {
ret = -EFAULT;
goto dealloc_out;
}
if (verify_process_vm(cpu_local_var(current)->vm,
user_nodes, sizeof(int) * count)) {
ret = -EFAULT;
goto dealloc_out;
}
if (verify_process_vm(cpu_local_var(current)->vm,
user_status, sizeof(int) * count)) {
ret = -EFAULT;
goto dealloc_out;
}
t_e = rdtsc(); kprintf("%s: init verify: %lu \n", __FUNCTION__, t_e - t_s); t_s = t_e;
#if 0
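/* Superseded: staging, NUMA verification and clearing now happen in
 * parallel inside move_pages_smp_handler() */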
memcpy(virt_addr, user_virt_addr, sizeof(void *) * count);
memcpy(status, user_status, sizeof(int) * count);
memcpy(nodes, user_nodes, sizeof(int) * count);
if (copy_from_user(virt_addr, user_virt_addr,
sizeof(void *) * count)) {
ret = -EFAULT;
@ -8172,6 +8518,8 @@ SYSCALL_DECLARE(move_pages)
goto dealloc_out;
}
t_e = rdtsc(); kprintf("%s: init copy: %lu \n", __FUNCTION__, t_e - t_s); t_s = t_e;
/* Verify target NUMA nodes are valid */
for (i = 0; i < count; i++) {
if (nodes[i] < 0 ||
@ -8182,145 +8530,45 @@ SYSCALL_DECLARE(move_pages)
}
}
t_e = rdtsc(); kprintf("%s: init NUMAver: %lu \n", __FUNCTION__, t_e - t_s); t_s = t_e;
memset(ptep, 0, sizeof(pte_t *) * count);
memset(status, 0, sizeof(int) * count);
memset(nr_pages, 0, sizeof(int) * count);
memset(dst_phys, 0, sizeof(unsigned long) * count);
t_e = rdtsc(); kprintf("%s: init memset: %lu \n", __FUNCTION__, t_e - t_s); t_s = t_e;
#endif
ihk_mc_spinlock_lock_noirq(&vm->page_table_lock);
for (i = 0; i < count; i++) {
void *phys;
size_t pgsize;
int p2align;
/*
* XXX: No page structures for anonymous mappings.
* Look up physical addresses by scanning page tables.
*/
ptep[i] = ihk_mc_pt_lookup_pte(vm->address_space->page_table,
(void *)virt_addr[i], 0, &phys, &pgsize, &p2align);
/* PTE valid? */
if (!ptep[i] || !pte_is_present(ptep[i])) {
status[i] = -ENOENT;
ptep[i] = NULL;
continue;
}
/* PTE is file? */
if (pte_is_fileoff(ptep[i], PAGE_SIZE)) {
status[i] = -EINVAL;
ptep[i] = NULL;
continue;
}
dkprintf("%s: virt 0x%lx:%lu requested to be moved to node %d\n",
__FUNCTION__, virt_addr[i], pgsize, nodes[i]);
/* Large page? */
if (pgsize > PAGE_SIZE) {
int nr_sub_pages = (pgsize / PAGE_SIZE);
int j;
if (i + nr_sub_pages > count) {
kprintf("%s: ERROR: page at index %d exceeds the region\n",
__FUNCTION__, i);
status[i] = -EINVAL;
break;
}
/* Is it contiguous across nr_sub_pages and all
* requested to be moved to the same target node? */
for (j = 0; j < nr_sub_pages; ++j) {
if (virt_addr[i + j] != (virt_addr[i] + (j * PAGE_SIZE)) ||
nodes[i] != nodes[i + j]) {
kprintf("%s: ERROR: virt address or node at index %d"
" is inconsistent\n",
__FUNCTION__, i + j);
ret = -EINVAL;
ihk_mc_spinlock_unlock_noirq(&vm->page_table_lock);
goto dealloc_out;
}
}
nr_pages[i] = nr_sub_pages;
i += (nr_sub_pages - 1);
}
else {
nr_pages[i] = 1;
}
}
memset(dst_phys, 0, sizeof(unsigned long) * count);
/* Allocate new pages on target NUMA nodes */
for (i = 0; i < count; i++) {
int pgalign = 0;
int j;
void *dst;
if (!ptep[i] || status[i] < 0 || !nr_pages[i])
continue;
/* TODO: store pgalign info in an array as well? */
if (nr_pages[i] > 1) {
if (nr_pages[i] * PAGE_SIZE == PTL2_SIZE)
pgalign = PTL2_SHIFT - PTL1_SHIFT;
}
dst = ihk_mc_alloc_aligned_pages_node(nr_pages[i],
pgalign, IHK_MC_AP_USER, nodes[i]);
if (!dst) {
status[i] = -ENOMEM;
continue;
}
for (j = i; j < (i + nr_pages[i]); ++j) {
status[j] = nodes[i];
}
dst_phys[i] = virt_to_phys(dst);
dkprintf("%s: virt 0x%lx:%lu to node %d, pgalign: %d,"
" allocated phys: 0x%lx\n",
__FUNCTION__, virt_addr[i],
nr_pages[i] * PAGE_SIZE,
nodes[i], pgalign, dst_phys[i]);
}
/* Do the copy, update PTEs, free original memory */
/* TODO: do this in parallel */
for (i = 0; i < count; i++) {
if (!dst_phys[i])
continue;
fast_memcpy(phys_to_virt(dst_phys[i]),
phys_to_virt(pte_get_phys(ptep[i])),
nr_pages[i] * PAGE_SIZE);
ihk_mc_free_pages(phys_to_virt(pte_get_phys(ptep[i])),
nr_pages[i]);
pte_update_phys(ptep[i], dst_phys[i]);
dkprintf("%s: virt 0x%lx:%lu copied and remapped to phys: 0x%lu\n",
__FUNCTION__, virt_addr[i],
nr_pages[i] * PAGE_SIZE,
dst_phys[i]);
}
/* Invalidate TLBs */
/* TODO: do this on all cores in parallel */
for (i = 0; i < count; i++) {
if (!dst_phys[i])
continue;
flush_tlb_single((unsigned long)virt_addr[i]);
}
/* Do the arg init, NUMA verification, copy,
* update PTEs, free original memory */
mpsr.count = count;
mpsr.user_virt_addr = user_virt_addr;
mpsr.user_status = user_status;
mpsr.user_nodes = user_nodes;
mpsr.virt_addr = virt_addr;
mpsr.status = status;
mpsr.nodes = nodes;
mpsr.nodes_ready = 0;
mpsr.ptep = ptep;
mpsr.dst_phys = dst_phys;
mpsr.nr_pages = nr_pages;
mpsr.proc = cpu_local_var(current)->proc;
ihk_atomic_set(&mpsr.phase_done, 0);
mpsr.phase_ret = 0;
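/* Fan the phases out to every CPU in this thread's cpu_set */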
ret = smp_call_func(&cpu_local_var(current)->cpu_set,
move_pages_smp_handler, &mpsr);
ihk_mc_spinlock_unlock_noirq(&vm->page_table_lock);
if (ret != 0) {
goto dealloc_out;
}
t_e = rdtsc(); kprintf("%s: parallel: %lu \n", __FUNCTION__, t_e - t_s); t_s = t_e;
/* This shouldn't fail (verified above) */
if (copy_to_user(user_status, status, sizeof(int) * count)) {
ret = -EFAULT;


@ -37,6 +37,8 @@ int strlen_user(const char *s);
int strcpy_from_user(char *dst, const char *src);
long getlong_user(long *dest, const long *p);
int getint_user(int *dest, const int *p);
int verify_process_vm(struct process_vm *vm,
const void *usrc, size_t size);
int read_process_vm(struct process_vm *vm, void *kdst, const void *usrc, size_t siz);
int copy_to_user(void *dst, const void *src, size_t siz);
int setlong_user(long *dst, long data);
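For context, a minimal sketch of the calling pattern the new verify_process_vm() enables (fetch_ints, kbuf, ubuf and n are illustrative names, not part of this commit):

/* Pre-fault a user range, then read it directly: after
 * verify_process_vm() succeeds, every page of the range is present,
 * so a plain memcpy() cannot fault. */
static int fetch_ints(struct process_vm *vm, int *kbuf,
		const int *ubuf, size_t n)
{
	if (verify_process_vm(vm, ubuf, sizeof(int) * n)) {
		return -EFAULT; /* outside the user region or fault failed */
	}
	memcpy(kbuf, ubuf, sizeof(int) * n);
	return 0;
}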