mcexec: -n: topology-aware partitioned execution

Balazs Gerofi
2016-12-10 16:27:57 +09:00
parent fdcf766337
commit 052b3f44ca
8 changed files with 239 additions and 11 deletions
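
Summary: this commit adds an -n <nr_processes> option to mcexec for partitioned execution (e.g., one partition per MPI rank). Each participating mcexec issues the new MCEXEC_UP_GET_CPUSET ioctl, and mcctrl divides the McKernel CPUs evenly among the nr_processes participants, building each CPU set along the hardware topology: cores sharing a cache with the previously assigned core are preferred, then cores on the same NUMA node, then any unused core. As a usage sketch (the launcher and program names are placeholders, not taken from this commit), four MPI ranks might be started as

mpirun -np 4 mcexec -n 4 ./mpi_app

so that each rank is pinned to its own topology-aware quarter of the CPUs.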

View File

@@ -42,6 +42,7 @@
#define MCEXEC_UP_GET_CRED 0x30a0290a
#define MCEXEC_UP_GET_CREDV 0x30a0290b
#define MCEXEC_UP_GET_NODES 0x30a0290c
#define MCEXEC_UP_GET_CPUSET 0x30a0290d
#define MCEXEC_UP_PREPARE_DMA 0x30a02910
#define MCEXEC_UP_FREE_DMA 0x30a02911
@@ -79,6 +80,13 @@ struct program_image_section {
#define SHELL_PATH_MAX_LEN 1024
#define MCK_RLIM_MAX 20

struct get_cpu_set_arg {
	int nr_processes;
	void *cpu_set;
	size_t cpu_set_size; // Size in bytes
	int *target_core;
};

#define PLD_CPU_SET_MAX_CPUS 1024
typedef unsigned long __cpu_set_unit;
#define PLD_CPU_SET_SIZE (PLD_CPU_SET_MAX_CPUS / (8 * sizeof(__cpu_set_unit)))
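
The cpu_set buffer exchanged via get_cpu_set_arg is a plain bitmap: PLD_CPU_SET_MAX_CPUS bits packed into __cpu_set_unit words. With 64-bit unsigned long, PLD_CPU_SET_SIZE evaluates to 1024 / (8 * 8) = 16 words, i.e. 128 bytes. A minimal sketch of how such a bitmap can be tested and populated (illustrative helpers only, not part of this commit):

/* Illustrative only: bit accessors for the PLD CPU-set bitmap above */
#define __PLD_UNIT_BITS (8 * sizeof(__cpu_set_unit))

static inline int pld_cpu_isset(int cpu, const __cpu_set_unit *set)
{
	/* Pick the word, then the bit within the word */
	return (int)((set[cpu / __PLD_UNIT_BITS] >> (cpu % __PLD_UNIT_BITS)) & 1);
}

static inline void pld_cpu_setbit(int cpu, __cpu_set_unit *set)
{
	set[cpu / __PLD_UNIT_BITS] |= ((__cpu_set_unit)1 << (cpu % __PLD_UNIT_BITS));
}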

View File

@@ -34,6 +34,7 @@
#include <linux/version.h>
#include <linux/semaphore.h>
#include <linux/interrupt.h>
#include <linux/cpumask.h>
#include <asm/uaccess.h>
#include <asm/delay.h>
#include <asm/io.h>
@@ -460,6 +461,183 @@ static long mcexec_get_nodes(ihk_os_t os)
	return usrdata->mem_info->n_numa_nodes;
}

extern int linux_numa_2_mckernel_numa(struct mcctrl_usrdata *udp, int numa_id);
extern int mckernel_cpu_2_linux_cpu(struct mcctrl_usrdata *udp, int cpu_id);

static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
{
	struct mcctrl_usrdata *udp = ihk_host_os_get_usrdata(os);
	struct mcctrl_part_exec *pe;
	struct get_cpu_set_arg req;
	struct cpu_topology *cpu_top, *cpu_top_i;
	struct cache_topology *cache_top;
	int cpu, cpus_assigned, cpus_to_assign, cpu_prev;
	int ret = 0;
	cpumask_t cpus_used;
	cpumask_t cpus_to_use;

	if (!udp) {
		return -EINVAL;
	}

	pe = &udp->part_exec;

	if (copy_from_user(&req, (void *)arg, sizeof(req))) {
		printk("%s: error copying user request\n", __FUNCTION__);
		return -EINVAL;
	}

	mutex_lock(&pe->lock);

	memcpy(&cpus_used, &pe->cpus_used, sizeof(cpumask_t));
	memset(&cpus_to_use, 0, sizeof(cpus_to_use));

	/* First process to enter CPU partitioning */
	if (pe->nr_processes == -1) {
		pe->nr_processes = req.nr_processes;
		pe->nr_processes_left = req.nr_processes;
		dprintk("%s: nr_processes: %d (partitioned exec starts)\n",
				__FUNCTION__,
				pe->nr_processes);
	}

	if (pe->nr_processes != req.nr_processes) {
		printk("%s: error: requested number of processes"
				" doesn't match current partitioned execution\n",
				__FUNCTION__);
		ret = -EINVAL;
		goto unlock_out;
	}

	--pe->nr_processes_left;
	dprintk("%s: nr_processes: %d, nr_processes_left: %d\n",
			__FUNCTION__,
			pe->nr_processes,
			pe->nr_processes_left);

	cpus_to_assign = udp->cpu_info->n_cpus / req.nr_processes;

	/* Find the first unused CPU */
	cpu = cpumask_next_zero(-1, &cpus_used);
	if (cpu >= udp->cpu_info->n_cpus) {
		printk("%s: error: no more CPUs available\n",
				__FUNCTION__);
		ret = -EINVAL;
		goto unlock_out;
	}

	cpu_set(cpu, cpus_used);
	cpu_set(cpu, cpus_to_use);
	cpu_prev = cpu;
	dprintk("%s: CPU %d assigned (first)\n", __FUNCTION__, cpu);

	for (cpus_assigned = 1; cpus_assigned < cpus_to_assign;
			++cpus_assigned) {
		int node;

		cpu_top = NULL;
		/* Find the topology object of the last core assigned */
		list_for_each_entry(cpu_top_i, &udp->cpu_topology_list, chain) {
			if (cpu_top_i->mckernel_cpu_id == cpu_prev) {
				cpu_top = cpu_top_i;
				break;
			}
		}

		if (!cpu_top) {
			printk("%s: error: couldn't find CPU topology info\n",
					__FUNCTION__);
			ret = -EINVAL;
			goto unlock_out;
		}

		/* Find a core sharing the same cache, iterating caches from
		 * the innermost level outwards */
		list_for_each_entry(cache_top, &cpu_top->cache_list, chain) {
			for_each_cpu(cpu, &cache_top->shared_cpu_map) {
				if (!cpu_isset(cpu, cpus_used)) {
					cpu_set(cpu, cpus_used);
					cpu_set(cpu, cpus_to_use);
					cpu_prev = cpu;
					dprintk("%s: CPU %d assigned (same cache L%lu)\n",
							__FUNCTION__, cpu,
							cache_top->saved->level);
					goto next_cpu;
				}
			}
		}

		/* No CPU? Find a core from the same NUMA node */
		node = linux_numa_2_mckernel_numa(udp,
				cpu_to_node(mckernel_cpu_2_linux_cpu(udp, cpu_prev)));

		for_each_cpu_not(cpu, &cpus_used) {
			/* Found one */
			if (node == linux_numa_2_mckernel_numa(udp,
					cpu_to_node(mckernel_cpu_2_linux_cpu(udp, cpu)))) {
				cpu_set(cpu, cpus_used);
				cpu_set(cpu, cpus_to_use);
				cpu_prev = cpu;
				dprintk("%s: CPU %d assigned (same NUMA)\n",
						__FUNCTION__, cpu);
				goto next_cpu;
			}
		}

		/* No CPU? Simply find the next unused one */
		cpu = cpumask_next_zero(-1, &cpus_used);
		if (cpu >= udp->cpu_info->n_cpus) {
			printk("%s: error: no more CPUs available\n",
					__FUNCTION__);
			ret = -EINVAL;
			goto unlock_out;
		}

		cpu_set(cpu, cpus_used);
		cpu_set(cpu, cpus_to_use);
		cpu_prev = cpu;
		dprintk("%s: CPU %d assigned (unused)\n",
				__FUNCTION__, cpu);

next_cpu:
		continue;
	}

	/* Found all cores, let the user know */
	if (copy_to_user(req.cpu_set, &cpus_to_use,
				(req.cpu_set_size < sizeof(cpus_to_use) ?
				 req.cpu_set_size : sizeof(cpus_to_use)))) {
		printk("%s: error copying mask to user\n", __FUNCTION__);
		ret = -EINVAL;
		goto unlock_out;
	}

	cpu = cpumask_next(-1, &cpus_to_use);
	if (copy_to_user(req.target_core, &cpu, sizeof(cpu))) {
		printk("%s: error copying target core to user\n",
				__FUNCTION__);
		ret = -EINVAL;
		goto unlock_out;
	}

	/* Commit used cores to the OS structure */
	memcpy(&pe->cpus_used, &cpus_used, sizeof(cpus_used));

	/* Reset if last process */
	if (pe->nr_processes_left == 0) {
		dprintk("%s: nr_processes: %d (partitioned exec ends)\n",
				__FUNCTION__,
				pe->nr_processes);
		pe->nr_processes = -1;
		memset(&pe->cpus_used, 0, sizeof(pe->cpus_used));
	}

	ret = 0;

unlock_out:
	mutex_unlock(&pe->lock);

	return ret;
}
int mcctrl_add_per_proc_data(struct mcctrl_usrdata *ud, int pid,
		struct mcctrl_per_proc_data *ppd)
{
@@ -1279,6 +1457,9 @@ long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg,
	case MCEXEC_UP_GET_NODES:
		return mcexec_get_nodes(os);

	case MCEXEC_UP_GET_CPUSET:
		return mcexec_get_cpuset(os, arg);

	case MCEXEC_UP_STRNCPY_FROM_USER:
		return mcexec_strncpy_from_user(os,
				(struct strncpy_from_user_desc *)arg);
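
In summary, mcexec_get_cpuset() grants each caller udp->cpu_info->n_cpus / nr_processes cores. A partition grows from its first unused core by the three-level preference visible in the dprintk tags: an unused core sharing a cache with the last assigned core (scanning cache levels innermost first), failing that an unused core on the same McKernel NUMA node, failing that the next unused core of any kind. With 64 McKernel CPUs and -n 4, for instance, each process receives 16 cores, co-located under shared caches whenever possible; the first core of the set is also reported back through req.target_core.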

View File

@@ -61,6 +61,7 @@ static struct ihk_os_user_call_handler mcctrl_uchs[] = {
	{ .request = MCEXEC_UP_SEND_SIGNAL, .func = mcctrl_ioctl },
	{ .request = MCEXEC_UP_GET_CPU, .func = mcctrl_ioctl },
	{ .request = MCEXEC_UP_GET_NODES, .func = mcctrl_ioctl },
	{ .request = MCEXEC_UP_GET_CPUSET, .func = mcctrl_ioctl },
	{ .request = MCEXEC_UP_STRNCPY_FROM_USER, .func = mcctrl_ioctl },
	{ .request = MCEXEC_UP_NEW_PROCESS, .func = mcctrl_ioctl },
	{ .request = MCEXEC_UP_PREPARE_DMA, .func = mcctrl_ioctl },

View File

@@ -298,6 +298,9 @@ int prepare_ikc_channels(ihk_os_t os)
	INIT_LIST_HEAD(&usrdata->cpu_topology_list);
	INIT_LIST_HEAD(&usrdata->node_topology_list);

	mutex_init(&usrdata->part_exec.lock);
	usrdata->part_exec.nr_processes = -1;

	return 0;
}

View File

@@ -254,6 +254,13 @@ struct node_topology {
	struct list_head chain;
};

struct mcctrl_part_exec {
	struct mutex lock;
	int nr_processes;
	int nr_processes_left;
	cpumask_t cpus_used;
};

#define CPU_LONGS (((NR_CPUS) + (BITS_PER_LONG) - 1) / (BITS_PER_LONG))

#define MCCTRL_PER_PROC_DATA_HASH_SHIFT 7
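
struct mcctrl_part_exec is the per-OS bookkeeping for one partitioned-execution round: lock serializes concurrent MCEXEC_UP_GET_CPUSET calls, nr_processes == -1 (set at initialization in prepare_ikc_channels() and restored once the last participant has fetched its set) means no round is in flight, and cpus_used accumulates the cores already handed out. The first caller's -n value therefore fixes the participant count that every later caller must match.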
@@ -284,6 +291,7 @@ struct mcctrl_usrdata {
	nodemask_t numa_online;
	struct list_head cpu_topology_list;
	struct list_head node_topology_list;
	struct mcctrl_part_exec part_exec;
};

struct mcctrl_signal {

View File

@@ -197,19 +197,19 @@ void free_topology_info(ihk_os_t os)
/*
 * CPU and NUMA node mapping conversion functions.
 */
int mckernel_cpu_2_linux_cpu(struct mcctrl_usrdata *udp, int cpu_id)
{
	return (cpu_id < udp->cpu_info->n_cpus) ?
		udp->cpu_info->mapping[cpu_id] : -1;
}

int mckernel_cpu_2_hw_id(struct mcctrl_usrdata *udp, int cpu_id)
{
	return (cpu_id < udp->cpu_info->n_cpus) ?
		udp->cpu_info->hw_ids[cpu_id] : -1;
}

int linux_cpu_2_mckernel_cpu(struct mcctrl_usrdata *udp, int cpu_id)
{
	int i;
@@ -222,7 +222,7 @@ static int linux_cpu_2_mckernel_cpu(struct mcctrl_usrdata *udp, int cpu_id)
}

#if 0
int hw_id_2_mckernel_cpu(struct mcctrl_usrdata *udp, int hw_id)
{
	int i;
@@ -235,7 +235,7 @@ static int hw_id_2_mckernel_cpu(struct mcctrl_usrdata *udp, int hw_id)
	return -1;
}

int hw_id_2_linux_cpu(struct mcctrl_usrdata *udp, int hw_id)
{
	int i;
@@ -248,7 +248,7 @@ static int hw_id_2_linux_cpu(struct mcctrl_usrdata *udp, int hw_id)
	return -1;
}

int linux_cpu_2_hw_id(struct mcctrl_usrdata *udp, int cpu)
{
	int mckernel_cpu = linux_cpu_2_mckernel_cpu(udp, cpu);
@@ -257,13 +257,13 @@ static int linux_cpu_2_hw_id(struct mcctrl_usrdata *udp, int cpu)
}
#endif

int mckernel_numa_2_linux_numa(struct mcctrl_usrdata *udp, int numa_id)
{
	return (numa_id < udp->mem_info->n_numa_nodes) ?
		udp->mem_info->numa_mapping[numa_id] : -1;
}

int linux_numa_2_mckernel_numa(struct mcctrl_usrdata *udp, int numa_id)
{
	int i;
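
The only change in the hunks above is the dropped static qualifier (the old signatures remain visible in the hunk headers): the CPU/NUMA mapping helpers are now exported so that mcexec_get_cpuset() can reach linux_numa_2_mckernel_numa() and mckernel_cpu_2_linux_cpu() through the extern declarations added in the control code.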

View File

@@ -153,6 +153,9 @@ static const char rlimit_stack_envname[] = "MCKERNEL_RLIMIT_STACK";
static int ischild;
static int enable_vdso = 1;

/* Partitioned execution (e.g., for MPI) */
static int nr_processes = 0;

struct fork_sync {
	pid_t pid;
	int status;
@@ -1102,7 +1105,7 @@ static int reduce_stack(struct rlimit *orig_rlim, char *argv[])

void print_usage(char **argv)
{
	fprintf(stderr, "Usage: %s [-c target_core] [-n nr_partitions] [<mcos-id>] (program) [args...]\n", argv[0]);
}

void init_sigaction(void)
@@ -1329,12 +1332,16 @@ int main(int argc, char **argv)
	}

	/* Parse options ("+" denotes stop at the first non-option) */
	while ((opt = getopt_long(argc, argv, "+c:n:", mcexec_options, NULL)) != -1) {
		switch (opt) {
		case 'c':
			target_core = atoi(optarg);
			break;

		case 'n':
			nr_processes = atoi(optarg);
			break;

		case 0: /* long opt */
			break;
@@ -1599,6 +1606,24 @@
		exit(1);
	}

	/* Partitioned execution, obtain CPU set */
	if (nr_processes > 0) {
		struct get_cpu_set_arg cpu_set_arg;

		cpu_set_arg.cpu_set = (void *)&desc->cpu_set;
		cpu_set_arg.cpu_set_size = sizeof(desc->cpu_set);
		cpu_set_arg.nr_processes = nr_processes;
		cpu_set_arg.target_core = &target_core;

		if (ioctl(fd, MCEXEC_UP_GET_CPUSET, (void *)&cpu_set_arg) != 0) {
			perror("getting CPU set for partitioned execution");
			close(fd);
			return 1;
		}

		desc->cpu = target_core;
	}

	if (ioctl(fd, MCEXEC_UP_PREPARE_IMAGE, (unsigned long)desc) != 0) {
		perror("prepare");
		close(fd);
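
On the user side the flow is thus: -n records the partition count, MCEXEC_UP_GET_CPUSET fills desc->cpu_set (consumed later by MCEXEC_UP_PREPARE_IMAGE) and rewrites target_core, and desc->cpu = target_core starts the process on the first core of its partition, overriding any core requested with -c. A launched program can double-check where it landed with the standard Linux affinity API (a generic sketch, independent of this commit):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;
	int cpu;

	/* Query the affinity mask set up for this process */
	if (sched_getaffinity(0, sizeof(set), &set) != 0) {
		perror("sched_getaffinity");
		return 1;
	}

	for (cpu = 0; cpu < CPU_SETSIZE; ++cpu) {
		if (CPU_ISSET(cpu, &set)) {
			printf("CPU %d is in our affinity mask\n", cpu);
		}
	}

	return 0;
}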

View File

@@ -265,6 +265,8 @@ struct thread *create_thread(unsigned long user_pc,
			goto err;
		}

		dkprintf("%s: pid: %d, CPU: %d\n",
				__FUNCTION__, proc->pid, cpu);
		CPU_SET(cpu, &thread->cpu_set);
		cpu_set_empty = 0;
	}