From effde241b9ca8163c3a1790ffd46a53c0c92ed6a Mon Sep 17 00:00:00 2001
From: Tomoki Shirasawa
Date: Tue, 25 Jul 2017 13:03:48 +0900
Subject: [PATCH] support uti_attr for utility thread offloading

---
 executer/include/uprotocol.h     |  40 +++++
 executer/kernel/mcctrl/control.c | 289 ++++++++++++++++++++++++++++++++
 executer/kernel/mcctrl/driver.c  |   3 +
 executer/user/mcexec.c           |  40 +----
 kernel/syscall.c                 |  10 +-
 5 files changed, 344 insertions(+), 38 deletions(-)

diff --git a/executer/include/uprotocol.h b/executer/include/uprotocol.h
index 99a80bf2..7b985f5b 100644
--- a/executer/include/uprotocol.h
+++ b/executer/include/uprotocol.h
@@ -61,6 +61,7 @@
 #define MCEXEC_UP_SYSCALL_THREAD 0x30a02924
 #define MCEXEC_UP_TERMINATE_THREAD 0x30a02925
 #define MCEXEC_UP_GET_NUM_POOL_THREADS 0x30a02926
+#define MCEXEC_UP_UTI_ATTR 0x30a02927
 
 #define MCEXEC_UP_COPY_FROM_MCK 0x30a03000
 #define MCEXEC_UP_COPY_TO_MCK 0x30a03001
@@ -274,4 +275,43 @@ struct perf_ctrl_desc {
 		};
 	};
 };
+
+#define UTI_FLAG_NUMA_SET (1ULL<<1) /* Indicates NUMA_SET is specified */
+
+#define UTI_FLAG_SAME_NUMA_DOMAIN (1ULL<<2)
+#define UTI_FLAG_DIFFERENT_NUMA_DOMAIN (1ULL<<3)
+
+#define UTI_FLAG_SAME_L1 (1ULL<<4)
+#define UTI_FLAG_SAME_L2 (1ULL<<5)
+#define UTI_FLAG_SAME_L3 (1ULL<<6)
+
+#define UTI_FLAG_DIFFERENT_L1 (1ULL<<7)
+#define UTI_FLAG_DIFFERENT_L2 (1ULL<<8)
+#define UTI_FLAG_DIFFERENT_L3 (1ULL<<9)
+
+#define UTI_FLAG_EXCLUSIVE_CPU (1ULL<<10)
+#define UTI_FLAG_CPU_INTENSIVE (1ULL<<11)
+#define UTI_FLAG_HIGH_PRIORITY (1ULL<<12)
+#define UTI_FLAG_NON_COOPERATIVE (1ULL<<13)
+
+/* Matches the Linux default maximum number of NUMA nodes */
+#define UTI_MAX_NUMA_DOMAINS (1024)
+
+typedef struct uti_attr {
+	/* The UTI_CPU_SET environment variable denotes the preferred
+	   location of the utility thread */
+	uint64_t numa_set[(UTI_MAX_NUMA_DOMAINS + sizeof(uint64_t) * 8 - 1) /
+			  (sizeof(uint64_t) * 8)];
+	uint64_t flags; /* Location and behavior hints, as a bitmap */
+} uti_attr_t;
+
+struct kuti_attr {
+	long parent_cpuid;
+	struct uti_attr attr;
+};
+
+struct uti_attr_desc {
+	struct kuti_attr *attr;
+};
+
 #endif
diff --git a/executer/kernel/mcctrl/control.c b/executer/kernel/mcctrl/control.c
index 5e05cd23..70e2337c 100644
--- a/executer/kernel/mcctrl/control.c
+++ b/executer/kernel/mcctrl/control.c
@@ -2307,6 +2307,292 @@ mcexec_syscall_thread(ihk_os_t os, unsigned long arg, struct file *file)
 	return rc;
 }
 
+static struct ihk_cache_topology *
+cache_topo_search(struct ihk_cpu_topology *cpu_topo, int level)
+{
+	struct ihk_cache_topology *lcache_topo;
+
+	list_for_each_entry(lcache_topo, &cpu_topo->cache_topology_list,
+			    chain) {
+		if (lcache_topo->level == level)
+			return lcache_topo;
+	}
+	return NULL;
+}
+
+static long (*setaffinity)(pid_t pid, const struct cpumask *in_mask);
+static int (*setscheduler_nocheck)(struct task_struct *p, int policy,
+				   const struct sched_param *param);
+static unsigned int *uti_rr;
+static int max_cpu;
+
+static int
+uti_attr_init(void)
+{
+	int i;
+	unsigned int *rr;
+	unsigned int *retval;
+
+	if (uti_rr)
+		return 0;
+
+	if (!setaffinity) {
+		setaffinity = (long (*)(pid_t, const struct cpumask *))
+			kallsyms_lookup_name("sched_setaffinity");
+		if (!setaffinity)
+			return -ENOSYS;
+	}
+	if (!setscheduler_nocheck) {
+		setscheduler_nocheck = (int (*)(struct task_struct *, int,
+				const struct sched_param *))
+			kallsyms_lookup_name("sched_setscheduler_nocheck");
+		if (!setscheduler_nocheck)
+			return -ENOSYS;
+	}
+
+	for_each_possible_cpu(i) {
+		max_cpu = i;
+	}
+	max_cpu++;
+	rr = (unsigned int *)kmalloc(sizeof(unsigned int) * max_cpu,
+				     GFP_KERNEL);
+	if (!rr)
+		return -ENOMEM;
+	memset(rr, '\0', sizeof(unsigned int) * max_cpu);
+
+	retval = __sync_val_compare_and_swap(&uti_rr, NULL, rr);
+	if (retval != NULL) {
+		kfree(rr);
+	}
+
+	return 0;
+}
+
+void
+uti_attr_finalize(void)
+{
+	if (uti_rr)
+		kfree(uti_rr);
+}
+
+static cpumask_t *
+uti_cpu_select(cpumask_t *cpumask)
+{
+	int i;
+	int mincpu;
+	unsigned int minrr;
+	unsigned int newval;
+	unsigned int retval;
+
+retry:
+	minrr = (unsigned int)-1;
+	mincpu = -1;
+	for_each_cpu(i, cpumask) {
+		int rr = uti_rr[i];
+		if (rr < minrr) {
+			mincpu = i;
+			minrr = rr;
+		}
+	}
+	newval = minrr + 1;
+	retval = __sync_val_compare_and_swap(uti_rr + mincpu, minrr, newval);
+	if (retval != minrr)
+		goto retry;
+
+	printk("sel cpu=%d rr=%d\n", mincpu, uti_rr[mincpu]);
+	for_each_cpu(i, cpumask) {
+		if (i != mincpu) {
+			cpumask_clear_cpu(i, cpumask);
+		}
+	}
+
+	return cpumask;
+}
+
+static long
+mcexec_uti_attr(ihk_os_t os, struct uti_attr_desc __user *arg)
+{
+	struct uti_attr_desc desc;
+	struct kuti_attr *kattr;
+	cpumask_t *cpuset;
+	struct mcctrl_usrdata *ud = ihk_host_os_get_usrdata(os);
+	ihk_device_t dev = ihk_os_to_dev(os);
+	struct cpu_topology *cpu_topo;
+	struct cpu_topology *target_cpu = NULL;
+	struct node_topology *node_topo;
+	struct ihk_cache_topology *lcache_topo;
+	struct ihk_node_topology *lnode_topo;
+	cpumask_t *wkmask;
+	int i;
+	int rc = 0;
+	int mask_size = cpumask_size();
+
+	if ((rc = uti_attr_init())) {
+		return rc;
+	}
+	if (copy_from_user(&desc, arg, sizeof desc))
+		return -EFAULT;
+	if (!(kattr = kmalloc(sizeof(struct kuti_attr), GFP_KERNEL)))
+		return -ENOMEM;
+	if (copy_from_user(kattr, (struct kuti_attr __user *)desc.attr,
+			   sizeof(struct kuti_attr))) {
+		kfree(kattr);
+		return -EFAULT;
+	}
+
+	if (((kattr->attr.flags & UTI_FLAG_SAME_L1) &&
+	     (kattr->attr.flags & UTI_FLAG_DIFFERENT_L1)) ||
+	    ((kattr->attr.flags & UTI_FLAG_SAME_L2) &&
+	     (kattr->attr.flags & UTI_FLAG_DIFFERENT_L2)) ||
+	    ((kattr->attr.flags & UTI_FLAG_SAME_L3) &&
+	     (kattr->attr.flags & UTI_FLAG_DIFFERENT_L3)) ||
+	    ((kattr->attr.flags & UTI_FLAG_SAME_NUMA_DOMAIN) &&
+	     (kattr->attr.flags & UTI_FLAG_DIFFERENT_NUMA_DOMAIN))) {
+		kfree(kattr);
+		return -EINVAL;
+	}
+
+	if (!(cpuset = kmalloc(mask_size * 2, GFP_KERNEL))) {
+		kfree(kattr);
+		return -ENOMEM;
+	}
+	wkmask = (cpumask_t *)(((char *)cpuset) + mask_size);
+
+	list_for_each_entry(cpu_topo, &ud->cpu_topology_list, chain) {
+		if (cpu_topo->mckernel_cpu_id == kattr->parent_cpuid) {
+			target_cpu = cpu_topo;
+		}
+	}
+
+	if (!target_cpu) {
+		kfree(kattr);
+		kfree(cpuset);
+		return -EINVAL;
+	}
+
+	memcpy(cpuset, cpu_active_mask, mask_size);
+
+	if (kattr->attr.flags & UTI_FLAG_NUMA_SET) {
+		nodemask_t *numaset = (nodemask_t *)&kattr->attr.numa_set[0];
+		memset(wkmask, '\0', mask_size);
+		for_each_node_mask(i, *numaset) {
+			list_for_each_entry(node_topo, &ud->node_topology_list,
+					    chain) {
+				if (node_topo->mckernel_numa_id == i) {
+					cpumask_or(wkmask, wkmask,
+						   &node_topo->saved->cpumap);
+					break;
+				}
+			}
+		}
+		cpumask_and(cpuset, cpuset, wkmask);
+	}
+
+	if ((kattr->attr.flags & UTI_FLAG_SAME_NUMA_DOMAIN) ||
+	    (kattr->attr.flags & UTI_FLAG_DIFFERENT_NUMA_DOMAIN)) {
+		memset(wkmask, '\0', mask_size);
+		for (i = 0; i < UTI_MAX_NUMA_DOMAINS; i++) {
+			lnode_topo = ihk_device_get_node_topology(dev, i);
+			if (!lnode_topo)
+				continue;
+			if (IS_ERR(lnode_topo))
+				continue;
+			if (cpu_isset(target_cpu->saved->cpu_number,
+				      lnode_topo->cpumap)) {
+				if (kattr->attr.flags &
+				    UTI_FLAG_SAME_NUMA_DOMAIN) {
+					cpumask_or(wkmask, wkmask,
+						   &lnode_topo->cpumap);
+				}
+			}
+			else {
+				if (kattr->attr.flags &
+				    UTI_FLAG_DIFFERENT_NUMA_DOMAIN) {
+					cpumask_or(wkmask, wkmask,
+						   &lnode_topo->cpumap);
+				}
+			}
+		}
+		cpumask_and(cpuset, cpuset, wkmask);
+	}
+
+	if (((kattr->attr.flags & UTI_FLAG_SAME_L1) ||
+	     (kattr->attr.flags & UTI_FLAG_DIFFERENT_L1)) &&
+	    (lcache_topo = cache_topo_search(target_cpu->saved, 1))) {
+		if (kattr->attr.flags & UTI_FLAG_SAME_L1) {
+			cpumask_and(cpuset, cpuset,
+				    &lcache_topo->shared_cpu_map);
+		}
+		else {
+			cpumask_complement(wkmask,
+					   &lcache_topo->shared_cpu_map);
+			cpumask_and(cpuset, cpuset, wkmask);
+		}
+	}
+
+	if (((kattr->attr.flags & UTI_FLAG_SAME_L2) ||
+	     (kattr->attr.flags & UTI_FLAG_DIFFERENT_L2)) &&
+	    (lcache_topo = cache_topo_search(target_cpu->saved, 2))) {
+		if (kattr->attr.flags & UTI_FLAG_SAME_L2) {
+			cpumask_and(cpuset, cpuset,
+				    &lcache_topo->shared_cpu_map);
+		}
+		else {
+			cpumask_complement(wkmask,
+					   &lcache_topo->shared_cpu_map);
+			cpumask_and(cpuset, cpuset, wkmask);
+		}
+	}
+
+	if (((kattr->attr.flags & UTI_FLAG_SAME_L3) ||
+	     (kattr->attr.flags & UTI_FLAG_DIFFERENT_L3)) &&
+	    (lcache_topo = cache_topo_search(target_cpu->saved, 3))) {
+		if (kattr->attr.flags & UTI_FLAG_SAME_L3) {
+			cpumask_and(cpuset, cpuset,
+				    &lcache_topo->shared_cpu_map);
+		}
+		else {
+			cpumask_complement(wkmask,
+					   &lcache_topo->shared_cpu_map);
+			cpumask_and(cpuset, cpuset, wkmask);
+		}
+	}
+
+	rc = cpumask_weight(cpuset);
+	if (!rc); /* do nothing */
+	else if (kattr->attr.flags & UTI_FLAG_EXCLUSIVE_CPU) {
+		struct sched_param sp;
+
+		setaffinity(0, uti_cpu_select(cpuset));
+		sp.sched_priority = 1;
+		setscheduler_nocheck(current, SCHED_FIFO, &sp);
+		rc = 1;
+	}
+	else if (kattr->attr.flags & UTI_FLAG_CPU_INTENSIVE) {
+		setaffinity(0, uti_cpu_select(cpuset));
+		rc = 1;
+	}
+	else if (kattr->attr.flags & UTI_FLAG_HIGH_PRIORITY) {
+		struct sched_param sp;
+
+		setaffinity(0, uti_cpu_select(cpuset));
+		sp.sched_priority = 1;
+		setscheduler_nocheck(current, SCHED_FIFO, &sp);
+		rc = 1;
+	}
+	else if (kattr->attr.flags & UTI_FLAG_NON_COOPERATIVE) {
+		setaffinity(0, uti_cpu_select(cpuset));
+		rc = 1;
+	}
+	else {
+		setaffinity(0, cpuset);
+	}
+
+	kfree(kattr);
+	kfree(cpuset);
+	return rc;
+}
+
 long
 mcexec_copy_from_mck(ihk_os_t os, unsigned long *arg)
 {
@@ -2423,6 +2709,9 @@ long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg,
 	case MCEXEC_UP_GET_NUM_POOL_THREADS:
 		return mcctrl_get_num_pool_threads(os);
 
+	case MCEXEC_UP_UTI_ATTR:
+		return mcexec_uti_attr(os, (struct uti_attr_desc __user *)arg);
+
 	case MCEXEC_UP_COPY_FROM_MCK:
 		return mcexec_copy_from_mck(os, (unsigned long *)arg);
 
diff --git a/executer/kernel/mcctrl/driver.c b/executer/kernel/mcctrl/driver.c
index 274ce00f..21ada913 100644
--- a/executer/kernel/mcctrl/driver.c
+++ b/executer/kernel/mcctrl/driver.c
@@ -43,6 +43,7 @@ extern void procfs_exit(int);
 
 extern void rus_page_hash_init(void);
 extern void rus_page_hash_put_pages(void);
+extern void uti_attr_finalize(void);
 
 extern void binfmt_mcexec_init(void);
 extern void binfmt_mcexec_exit(void);
@@ -87,6 +88,7 @@ static struct ihk_os_user_call_handler mcctrl_uchs[] = {
 	{ .request = MCEXEC_UP_SYSCALL_THREAD, .func = mcctrl_ioctl },
 	{ .request = MCEXEC_UP_TERMINATE_THREAD, .func = mcctrl_ioctl },
 	{ .request = MCEXEC_UP_GET_NUM_POOL_THREADS, .func = mcctrl_ioctl },
+	{ .request = MCEXEC_UP_UTI_ATTR, .func = mcctrl_ioctl },
 	{ .request = MCEXEC_UP_DEBUG_LOG, .func = mcctrl_ioctl },
 	{ .request = MCEXEC_UP_COPY_FROM_MCK, .func = mcctrl_ioctl },
 	{ .request = MCEXEC_UP_COPY_TO_MCK, .func = mcctrl_ioctl },
@@ -230,6 +232,7 @@ static void __exit
 mcctrl_exit(void)
 {
 	binfmt_mcexec_exit();
 	rus_page_hash_put_pages();
+	uti_attr_finalize();
 	printk("mcctrl: unregistered.\n");
 }
diff --git a/executer/user/mcexec.c b/executer/user/mcexec.c
index 8c1d9db5..1de09df8 100644
--- a/executer/user/mcexec.c
+++ b/executer/user/mcexec.c
@@ -179,40 +179,6 @@ struct kernel_termios {
 	cc_t c_cc[NCCS]; /* control characters */
 };
 
-#define UTI_FLAG_NUMA_SET (1ULL<<1) /* Indicates NUMA_SET is specified */
-
-#define UTI_FLAG_SAME_NUMA_DOMAIN (1ULL<<2)
-#define UTI_FLAG_DIFFERENT_NUMA_DOMAIN (1ULL<<3)
-
-#define UTI_FLAG_SAME_L1 (1ULL<<4)
-#define UTI_FLAG_SAME_L2 (1ULL<<5)
-#define UTI_FLAG_SAME_L3 (1ULL<<6)
-
-#define UTI_FLAG_DIFFERENT_L1 (1ULL<<7)
-#define UTI_FLAG_DIFFERENT_L2 (1ULL<<8)
-#define UTI_FLAG_DIFFERENT_L3 (1ULL<<9)
-
-#define UTI_FLAG_EXCLUSIVE_CPU (1ULL<<10)
-#define UTI_FLAG_CPU_INTENSIVE (1ULL<<11)
-#define UTI_FLAG_HIGH_PRIORITY (1ULL<<12)
-#define UTI_FLAG_NON_COOPERATIVE (1ULL<<13)
-
-/* Linux default value is used */
-#define UTI_MAX_NUMA_DOMAINS (1024)
-
-typedef struct uti_attr {
-	/* UTI_CPU_SET environmental variable is used to denote the preferred
-	   location of utility thread */
-	uint64_t numa_set[(UTI_MAX_NUMA_DOMAINS + sizeof(uint64_t) * 8 - 1) /
-			  (sizeof(uint64_t) * 8)];
-	uint64_t flags; /* Representing location and behavior hints by bitmap */
-} uti_attr_t;
-
-struct kuti_attr {
-	long parent_cpuid;
-	struct uti_attr attr;
-};
-
 struct thread_data_s;
 int main_loop(struct thread_data_s *);
 
@@ -2591,6 +2557,7 @@ util_thread_setaffinity(unsigned long pattr)
 {
 	struct kuti_attr kattr;
 	unsigned long args[3];
+	struct uti_attr_desc desc;
 
 	args[0] = (unsigned long)&kattr;
 	args[1] = pattr;
@@ -2599,9 +2566,8 @@ util_thread_setaffinity(unsigned long pattr)
 		return;
 	}
-
-
-
+	desc.attr = &kattr;
+	ioctl(fd, MCEXEC_UP_UTI_ATTR, &desc);
 }
 
 static long
diff --git a/kernel/syscall.c b/kernel/syscall.c
index c4a738ca..6f98562a 100644
--- a/kernel/syscall.c
+++ b/kernel/syscall.c
@@ -246,7 +246,8 @@ long do_syscall(struct syscall_request *req, int cpu, int pid)
 
 	ihk_mc_spinlock_lock(&(get_this_cpu_local_var()->runq_lock));
 	v = get_this_cpu_local_var();
-	if (v->flags & CPU_FLAG_NEED_RESCHED) {
+	if (v->flags & CPU_FLAG_NEED_RESCHED ||
+	    req->number == __NR_sched_setaffinity) {
 		do_schedule = 1;
 	}
 
@@ -2357,6 +2358,13 @@ retry_tid:
 	new->status = PS_RUNNING;
 	if (old->mod_clone == SPAWN_TO_REMOTE) {
 		new->mod_clone = SPAWNING_TO_REMOTE;
+		if (old->mod_clone_arg) {
+			new->mod_clone_arg = kmalloc(sizeof(struct uti_attr),
+						     IHK_MC_AP_NOWAIT);
+			if (new->mod_clone_arg)
+				memcpy(new->mod_clone_arg, old->mod_clone_arg,
+				       sizeof(struct uti_attr));
+		}
 	}
 	chain_thread(new);
 	if (!(clone_flags & CLONE_VM)) {
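
Usage note (illustrative, not part of the patch): on the mcexec side, util_thread_setaffinity() wraps the kuti_attr obtained from McKernel in a struct uti_attr_desc and hands it to the new MCEXEC_UP_UTI_ATTR ioctl; mcexec_uti_attr() then narrows cpu_active_mask according to the hint flags and applies sched_setaffinity() (plus SCHED_FIFO for UTI_FLAG_EXCLUSIVE_CPU and UTI_FLAG_HIGH_PRIORITY) to the calling thread. The sketch below shows how an application-side component might populate struct uti_attr before the attribute travels down this path; the helper names are hypothetical, and only the UTI_FLAG_* values and struct layout from uprotocol.h above are assumed.

#include <stdint.h>
#include <string.h>
#include "uprotocol.h"	/* struct uti_attr, UTI_FLAG_* (hypothetical include path) */

/* Hypothetical helper: request a CPU that shares the parent's L2 cache
 * and is used exclusively.  Combining the SAME and DIFFERENT flags for
 * the same resource is rejected by mcexec_uti_attr() with -EINVAL. */
static void uti_attr_same_l2_exclusive(struct uti_attr *attr)
{
	memset(attr, 0, sizeof(*attr));
	attr->flags = UTI_FLAG_SAME_L2 | UTI_FLAG_EXCLUSIVE_CPU;
}

/* Hypothetical helper: confine the utility thread to one NUMA domain.
 * numa_set is a bitmap of node ids, 64 bits per uint64_t word;
 * UTI_FLAG_NUMA_SET tells mcexec_uti_attr() to honor the bitmap. */
static void uti_attr_on_numa_node(struct uti_attr *attr, int node)
{
	memset(attr, 0, sizeof(*attr));
	attr->numa_set[node / 64] |= 1ULL << (node % 64);
	attr->flags = UTI_FLAG_NUMA_SET;
}

Either attribute would then reach the kernel side through the MCEXEC_UP_UTI_ATTR path shown in the mcexec.c hunk above.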