NUMA: get_mempolicy(), set_mempolicy() and mbind() implementation

Balazs Gerofi
2016-11-05 13:32:02 +09:00
parent e46f027894
commit 0f826290d0
3 changed files with 409 additions and 11 deletions


@@ -369,6 +369,13 @@ struct vm_range {
int padding;
};
struct vm_range_numa_policy {
struct list_head list;
unsigned long start, end;
DECLARE_BITMAP(numa_mask, PROCESS_NUMA_MASK_BITS);
int numa_mem_policy;
};
struct vm_regions {
unsigned long vm_start, vm_end;
unsigned long text_start, text_end;
@@ -660,6 +667,8 @@ struct process_vm {
long currss;
DECLARE_BITMAP(numa_mask, PROCESS_NUMA_MASK_BITS);
int numa_mem_policy;
/* Protected by memory_range_lock */
struct list_head vm_range_numa_policy_list;
};
static inline int has_cap_ipc_lock(struct thread *th)
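
Note: an entry on vm_range_numa_policy_list overrides the process-wide
vm->numa_mem_policy for addresses falling inside its [start, end) interval.
A minimal sketch of the lookup this implies (the helper name
vm_range_policy_search() is hypothetical and not part of this commit; the
same scan appears inline in get_mempolicy() below):

static struct vm_range_numa_policy *
vm_range_policy_search(struct process_vm *vm, unsigned long addr)
{
	struct vm_range_numa_policy *iter;

	/* Caller must hold vm->memory_range_lock */
	list_for_each_entry(iter, &vm->vm_range_numa_policy_list, list) {
		if (iter->start <= addr && addr < iter->end)
			return iter;
	}
	return NULL; /* fall back to vm->numa_mem_policy */
}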


@@ -210,6 +210,7 @@ init_process_vm(struct process *owner, struct address_space *asp, struct process
ihk_atomic_set(&vm->refcount, 1);
INIT_LIST_HEAD(&vm->vm_range_list);
INIT_LIST_HEAD(&vm->vm_range_numa_policy_list);
vm->address_space = asp;
vm->proc = owner;
vm->exiting = 0;
@@ -2483,6 +2484,7 @@ void sched_init(void)
ihk_mc_init_context(&idle_thread->ctx, NULL, idle);
ihk_mc_spinlock_init(&idle_thread->vm->memory_range_lock);
INIT_LIST_HEAD(&idle_thread->vm->vm_range_list);
INIT_LIST_HEAD(&idle_thread->vm->vm_range_numa_policy_list);
idle_thread->proc->pid = 0;
idle_thread->tid = ihk_mc_get_processor_id();


@@ -7081,7 +7081,332 @@ out:
SYSCALL_DECLARE(mbind)
{
- return -ENOSYS;
unsigned long addr = ihk_mc_syscall_arg0(ctx);
unsigned long len = ihk_mc_syscall_arg1(ctx);
int mode = ihk_mc_syscall_arg2(ctx);
unsigned long *nodemask =
(unsigned long *)ihk_mc_syscall_arg3(ctx);
unsigned long maxnode = ihk_mc_syscall_arg4(ctx);
unsigned flags = ihk_mc_syscall_arg5(ctx);
struct process_vm *vm = cpu_local_var(current)->vm;
unsigned long nodemask_bits = 0;
int mode_flags = 0;
int error = 0;
int bit;
struct vm_range *range;
struct vm_range_numa_policy *range_policy, *range_policy_iter;
struct vm_range_numa_policy *range_policy_next = NULL;
DECLARE_BITMAP(numa_mask, PROCESS_NUMA_MASK_BITS);
/* Validate arguments */
if (addr & ~PAGE_MASK) {
return -EINVAL;
}
len = (len + PAGE_SIZE - 1) & PAGE_MASK;
if (addr + len < addr || addr == (addr + len)) {
return -EINVAL;
}
memset(numa_mask, 0, sizeof(numa_mask));
if (maxnode) {
nodemask_bits = ALIGN(maxnode, 8);
if (maxnode > (PAGE_SIZE << 3)) {
dkprintf("%s: ERROR: nodemask_bits bigger than PAGE_SIZE bits\n",
__FUNCTION__);
error = -EINVAL;
goto out;
}
if (nodemask_bits > PROCESS_NUMA_MASK_BITS) {
dkprintf("%s: WARNING: process NUMA mask bits is insufficient\n",
__FUNCTION__);
nodemask_bits = PROCESS_NUMA_MASK_BITS;
}
}
if ((mode & MPOL_F_STATIC_NODES) && (mode & MPOL_F_RELATIVE_NODES)) {
dkprintf("%s: error: MPOL_F_STATIC_NODES & MPOL_F_RELATIVE_NODES\n",
__FUNCTION__);
error = -EINVAL;
goto out;
}
if ((flags & MPOL_MF_STRICT) && (flags & MPOL_MF_MOVE)) {
dkprintf("%s: error: MPOL_MF_STRICT & MPOL_MF_MOVE\n",
__FUNCTION__);
/*
* XXX: man page claims the correct error code is EIO,
* but LTP tests for EINVAL.
*/
error = -EINVAL;
goto out;
}
mode_flags = (mode & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES));
mode &= ~(MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
if (mode_flags & MPOL_F_RELATIVE_NODES) {
/* Not supported. */
dkprintf("%s: error: MPOL_F_RELATIVE_NODES not supported\n",
__FUNCTION__);
error = -EINVAL;
goto out;
}
switch (mode) {
case MPOL_DEFAULT:
if (nodemask && nodemask_bits) {
error = copy_from_user(numa_mask, nodemask,
(nodemask_bits >> 3));
if (error) {
dkprintf("%s: error: copy_from_user numa_mask\n",
__FUNCTION__);
error = -EFAULT;
goto out;
}
if (!bitmap_empty(numa_mask, nodemask_bits)) {
dkprintf("%s: ERROR: nodemask not empty for MPOL_DEFAULT\n",
__FUNCTION__);
error = -EINVAL;
goto out;
}
}
break;
case MPOL_BIND:
case MPOL_INTERLEAVE:
case MPOL_PREFERRED:
/* Special case for MPOL_PREFERRED with empty nodemask */
if (mode == MPOL_PREFERRED && !nodemask) {
error = 0;
break;
}
if (flags & MPOL_MF_STRICT) {
error = -EIO;
goto out;
}
error = copy_from_user(numa_mask, nodemask,
(nodemask_bits >> 3));
if (error) {
error = -EFAULT;
goto out;
}
if (!nodemask || bitmap_empty(numa_mask, nodemask_bits)) {
dkprintf("%s: ERROR: nodemask not specified\n",
__FUNCTION__);
error = -EINVAL;
goto out;
}
/* Verify NUMA mask */
/* Scan only nodemask_bits (clamped above), not maxnode, so we
 * never read past the end of the numa_mask bitmap */
for_each_set_bit(bit, numa_mask, nodemask_bits) {
if (bit >= ihk_mc_get_nr_numa_nodes()) {
dkprintf("%s: %d is bigger than # of NUMA nodes\n",
__FUNCTION__, bit);
error = -EINVAL;
goto out;
}
}
break;
default:
error = -EINVAL;
goto out;
}
/* Validate address range */
ihk_mc_spinlock_lock_noirq(&vm->memory_range_lock);
range = lookup_process_memory_range(vm, addr, addr + len);
if (!range) {
dkprintf("%s: ERROR: range is invalid\n", __FUNCTION__);
error = -EFAULT;
goto unlock_out;
}
/* Do the actual policy setting */
switch (mode) {
/*
* The man page says MPOL_DEFAULT should remove any range-specific
* policies so that the process-wide policy is used. LTP, on the
* other hand, tests whether MPOL_DEFAULT is set as a range policy.
* MPOL_DEFAULT thus behaves the same as the rest of the policies
* for now.
*/
#if 0
case MPOL_DEFAULT:
/* Delete or adjust any overlapping range settings */
list_for_each_entry_safe(range_policy_iter, range_policy_next,
&vm->vm_range_numa_policy_list, list) {
int keep = 0;
unsigned long orig_end = range_policy_iter->end;
if (range_policy_iter->end < addr ||
range_policy_iter->start > addr + len) {
continue;
}
/* Do we need to keep the front? */
if (range_policy_iter->start < addr) {
range_policy_iter->end = addr;
keep = 1;
}
/* Do we need to keep the end? */
if (orig_end > addr + len) {
/* Are we keeping front already? */
if (keep) {
/* Add a new entry after */
range_policy = kmalloc(sizeof(*range_policy),
IHK_MC_AP_NOWAIT);
if (!range_policy) {
kprintf("%s: error allocating range_policy\n",
__FUNCTION__);
error = -ENOMEM;
goto unlock_out;
}
memcpy(range_policy, range_policy_iter,
sizeof(*range_policy));
range_policy->start = addr + len;
range_policy->end = orig_end;
list_add(&range_policy->list,
&range_policy_iter->list);
}
else {
range_policy_iter->start = addr + len;
keep = 1;
}
}
if (!keep) {
list_del(&range_policy_iter->list);
kfree(range_policy_iter);
}
}
break;
#endif
case MPOL_DEFAULT:
case MPOL_BIND:
case MPOL_INTERLEAVE:
case MPOL_PREFERRED:
/* Adjust any overlapping range settings and add new one */
range_policy_next = NULL;
list_for_each_entry(range_policy_iter,
&vm->vm_range_numa_policy_list, list) {
int adjusted = 0;
unsigned long orig_end = range_policy_iter->end;
if (range_policy_iter->end < addr)
continue;
/* Special case of entirely overlapping */
if (range_policy_iter->start == addr &&
range_policy_iter->end == addr + len) {
range_policy = range_policy_iter;
goto mbind_update_only;
}
/* Overlapping partially? */
if (range_policy_iter->start < addr) {
orig_end = range_policy_iter->end;
range_policy_iter->end = addr;
adjusted = 1;
}
/* Do we need to keep the end? */
if (orig_end > addr + len) {
if (adjusted) {
/* Add a new entry after */
range_policy = kmalloc(sizeof(*range_policy),
IHK_MC_AP_NOWAIT);
if (!range_policy) {
dkprintf("%s: error allocating range_policy\n",
__FUNCTION__);
error = -ENOMEM;
goto unlock_out;
}
memcpy(range_policy, range_policy_iter,
sizeof(*range_policy));
range_policy->start = addr + len;
range_policy->end = orig_end;
list_add(&range_policy->list,
&range_policy_iter->list);
range_policy_next = range_policy;
break;
}
else {
range_policy_iter->start = addr + len;
range_policy_next = range_policy_iter;
break;
}
}
/* Next one in ascending address order? */
if (range_policy_iter->start >= addr + len) {
range_policy_next = range_policy_iter;
break;
}
}
/* Add a new entry */
range_policy = kmalloc(sizeof(*range_policy),
IHK_MC_AP_NOWAIT);
if (!range_policy) {
dkprintf("%s: error allocating range_policy\n",
__FUNCTION__);
error = -ENOMEM;
goto unlock_out;
}
memset(range_policy, 0, sizeof(*range_policy));
range_policy->start = addr;
range_policy->end = addr + len;
if (range_policy_next) {
list_add_tail(&range_policy->list,
&range_policy_next->list);
}
else {
list_add_tail(&range_policy->list,
&vm->vm_range_numa_policy_list);
}
mbind_update_only:
if (mode == MPOL_DEFAULT) {
memset(range_policy->numa_mask, 0, sizeof(numa_mask));
for (bit = 0; bit < ihk_mc_get_nr_numa_nodes(); ++bit) {
set_bit(bit, range_policy->numa_mask);
}
}
else {
memcpy(range_policy->numa_mask, &numa_mask,
sizeof(numa_mask));
}
range_policy->numa_mem_policy = mode;
break;
default:
error = -EINVAL;
goto out;
}
error = 0;
unlock_out:
ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock);
out:
return error;
} /* sys_mbind() */
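
For reference, a minimal userland exercise of the path above (an
illustrative sketch assuming glibc's <numaif.h> wrapper and that NUMA
node 0 exists; not part of this commit):

#include <numaif.h>
#include <sys/mman.h>

int bind_range_to_node0(size_t len)
{
	unsigned long mask = 1UL << 0; /* NUMA node 0 */
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return -1;
	/* Recorded above as a vm_range_numa_policy entry for [buf, buf+len) */
	return mbind(buf, len, MPOL_BIND, &mask, sizeof(mask) * 8, 0);
}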
SYSCALL_DECLARE(set_mempolicy)
@@ -7094,6 +7419,8 @@ SYSCALL_DECLARE(set_mempolicy)
struct process_vm *vm = cpu_local_var(current)->vm;
int error = 0;
int bit, valid_mask;
struct vm_range_numa_policy *range_policy_iter;
struct vm_range_numa_policy *range_policy_next = NULL;
DECLARE_BITMAP(numa_mask, PROCESS_NUMA_MASK_BITS);
memset(numa_mask, 0, sizeof(numa_mask));
@@ -7108,7 +7435,7 @@ SYSCALL_DECLARE(set_mempolicy)
}
if (nodemask_bits > PROCESS_NUMA_MASK_BITS) {
-kprintf("%s: WARNING: process NUMA mask bits is insufficient\n",
+dkprintf("%s: WARNING: process NUMA mask bits is insufficient\n",
__FUNCTION__);
nodemask_bits = PROCESS_NUMA_MASK_BITS;
}
@@ -7137,7 +7464,14 @@ SYSCALL_DECLARE(set_mempolicy)
set_bit(bit, vm->numa_mask);
}
-/* TODO: delete all mbind() specified regions */
+/* Delete all range settings */
ihk_mc_spinlock_lock_noirq(&vm->memory_range_lock);
list_for_each_entry_safe(range_policy_iter, range_policy_next,
&vm->vm_range_numa_policy_list, list) {
list_del(&range_policy_iter->list);
kfree(range_policy_iter);
}
ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock);
vm->numa_mem_policy = mode;
error = 0;
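
Behavioral note: set_mempolicy() now tears down every per-range policy
previously installed by mbind() before the new process-wide policy takes
effect. A short usage sketch (assumes glibc's <numaif.h> wrapper and that
NUMA node 0 exists):

#include <numaif.h>

static int rebind_process_to_node0(void)
{
	unsigned long mask = 1UL << 0; /* NUMA node 0 */

	/* Drops all earlier mbind() range policies, then binds the process */
	return set_mempolicy(MPOL_BIND, &mask, sizeof(mask) * 8);
}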
@@ -7224,16 +7558,25 @@ SYSCALL_DECLARE(get_mempolicy)
unsigned long addr = ihk_mc_syscall_arg3(ctx);
unsigned long flags = ihk_mc_syscall_arg4(ctx);
struct process_vm *vm = cpu_local_var(current)->vm;
int error;
struct vm_range_numa_policy *range_policy = NULL;
int error = 0;
int policy;
-if (((flags & MPOL_F_ADDR) && !addr) ||
-(!(flags & MPOL_F_ADDR) && addr) ||
+if ((!(flags & MPOL_F_ADDR) && addr) ||
(flags & ~(MPOL_F_ADDR | MPOL_F_NODE | MPOL_F_MEMS_ALLOWED)) ||
((flags & MPOL_F_NODE) && !(flags & MPOL_F_ADDR) &&
vm->numa_mem_policy == MPOL_INTERLEAVE)) {
return -EINVAL;
}
/*
* XXX: man page claims the correct error code is EINVAL,
* but LTP tests for EFAULT.
*/
if ((flags & MPOL_F_ADDR) && !addr) {
return -EFAULT;
}
if (maxnode) {
if (maxnode < ihk_mc_get_nr_numa_nodes()) {
return -EINVAL;
@@ -7247,18 +7590,62 @@
}
}
/* Special case of MPOL_F_MEMS_ALLOWED */
if (flags == MPOL_F_MEMS_ALLOWED) {
if (nodemask) {
error = copy_to_user(nodemask,
cpu_local_var(current)->vm->numa_mask,
(nodemask_bits >> 3));
if (error) {
error = -EFAULT;
}
}
goto out;
}
/* Address range specific? */
if (flags & MPOL_F_ADDR) {
struct vm_range_numa_policy *range_policy_iter;
struct vm_range *range;
ihk_mc_spinlock_lock_noirq(&vm->memory_range_lock);
range = lookup_process_memory_range(vm, addr, addr + 1);
if (!range) {
dkprintf("%s: ERROR: range is invalid\n", __FUNCTION__);
error = -EFAULT;
ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock);
goto out;
}
list_for_each_entry(range_policy_iter,
&vm->vm_range_numa_policy_list, list) {
if (range_policy_iter->start > addr ||
range_policy_iter->end <= addr) {
continue;
}
range_policy = range_policy_iter;
break;
}
ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock);
}
/* Return policy */
policy = range_policy ? range_policy->numa_mem_policy :
vm->numa_mem_policy;
if (mode) {
-error = copy_to_user(mode,
-&cpu_local_var(current)->vm->numa_mem_policy,
-sizeof(int));
+error = copy_to_user(mode, &policy, sizeof(int));
if (error) {
error = -EFAULT;
goto out;
}
}
-if (nodemask) {
+if (nodemask && (policy != MPOL_DEFAULT)) {
error = copy_to_user(nodemask,
range_policy ? range_policy->numa_mask :
cpu_local_var(current)->vm->numa_mask,
(nodemask_bits >> 3));
if (error) {