mcexec: -n: topology-aware partitioned execution

Balazs Gerofi
2016-12-10 16:27:57 +09:00
parent fdcf766337
commit 052b3f44ca
8 changed files with 239 additions and 11 deletions
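
Summary: this commit adds an -n <nr_processes> option to mcexec for partitioned execution (e.g., one partition per MPI rank). Each participating mcexec issues the new MCEXEC_UP_GET_CPUSET ioctl, and mcctrl divides the McKernel CPUs evenly among the nr_processes participants, building each CPU set along the hardware topology: cores sharing a cache with the previously assigned core are preferred, then cores on the same NUMA node, then any unused core. As a usage sketch (the launcher and program names are placeholders, not taken from this commit), four MPI ranks might be started as

mpirun -np 4 mcexec -n 4 ./mpi_app

so that each rank is pinned to its own topology-aware quarter of the CPUs.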

View File

@@ -42,6 +42,7 @@
#define MCEXEC_UP_GET_CRED 0x30a0290a
#define MCEXEC_UP_GET_CREDV 0x30a0290b
#define MCEXEC_UP_GET_NODES 0x30a0290c
#define MCEXEC_UP_GET_CPUSET 0x30a0290d
#define MCEXEC_UP_PREPARE_DMA 0x30a02910
#define MCEXEC_UP_FREE_DMA 0x30a02911
@@ -79,6 +80,13 @@ struct program_image_section {
#define SHELL_PATH_MAX_LEN 1024
#define MCK_RLIM_MAX 20

struct get_cpu_set_arg {
	int nr_processes;
	void *cpu_set;
	size_t cpu_set_size; // Size in bytes
	int *target_core;
};

#define PLD_CPU_SET_MAX_CPUS 1024
typedef unsigned long __cpu_set_unit;
#define PLD_CPU_SET_SIZE (PLD_CPU_SET_MAX_CPUS / (8 * sizeof(__cpu_set_unit)))
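
The cpu_set buffer exchanged via get_cpu_set_arg is a plain bitmap: PLD_CPU_SET_MAX_CPUS bits packed into __cpu_set_unit words. With 64-bit unsigned long, PLD_CPU_SET_SIZE evaluates to 1024 / (8 * 8) = 16 words, i.e. 128 bytes. A minimal sketch of how such a bitmap can be tested and populated (illustrative helpers only, not part of this commit):

/* Illustrative only: bit accessors for the PLD CPU-set bitmap above */
#define __PLD_UNIT_BITS (8 * sizeof(__cpu_set_unit))

static inline int pld_cpu_isset(int cpu, const __cpu_set_unit *set)
{
	/* Pick the word, then the bit within the word */
	return (int)((set[cpu / __PLD_UNIT_BITS] >> (cpu % __PLD_UNIT_BITS)) & 1);
}

static inline void pld_cpu_setbit(int cpu, __cpu_set_unit *set)
{
	set[cpu / __PLD_UNIT_BITS] |= ((__cpu_set_unit)1 << (cpu % __PLD_UNIT_BITS));
}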

View File

@@ -34,6 +34,7 @@
#include <linux/version.h>
#include <linux/semaphore.h>
#include <linux/interrupt.h>
#include <linux/cpumask.h>
#include <asm/uaccess.h>
#include <asm/delay.h>
#include <asm/io.h>
@@ -460,6 +461,183 @@ static long mcexec_get_nodes(ihk_os_t os)
	return usrdata->mem_info->n_numa_nodes;
}

extern int linux_numa_2_mckernel_numa(struct mcctrl_usrdata *udp, int numa_id);
extern int mckernel_cpu_2_linux_cpu(struct mcctrl_usrdata *udp, int cpu_id);

static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
{
	struct mcctrl_usrdata *udp = ihk_host_os_get_usrdata(os);
	struct mcctrl_part_exec *pe;
	struct get_cpu_set_arg req;
	struct cpu_topology *cpu_top, *cpu_top_i;
	struct cache_topology *cache_top;
	int cpu, cpus_assigned, cpus_to_assign, cpu_prev;
	int ret = 0;
	cpumask_t cpus_used;
	cpumask_t cpus_to_use;

	if (!udp) {
		return -EINVAL;
	}

	pe = &udp->part_exec;

	if (copy_from_user(&req, (void *)arg, sizeof(req))) {
		printk("%s: error copying user request\n", __FUNCTION__);
		return -EINVAL;
	}

	mutex_lock(&pe->lock);

	memcpy(&cpus_used, &pe->cpus_used, sizeof(cpumask_t));
	memset(&cpus_to_use, 0, sizeof(cpus_to_use));

	/* First process to enter CPU partitioning */
	if (pe->nr_processes == -1) {
		pe->nr_processes = req.nr_processes;
		pe->nr_processes_left = req.nr_processes;
		dprintk("%s: nr_processes: %d (partitioned exec starts)\n",
				__FUNCTION__,
				pe->nr_processes);
	}

	if (pe->nr_processes != req.nr_processes) {
		printk("%s: error: requested number of processes"
				" doesn't match current partitioned execution\n",
				__FUNCTION__);
		ret = -EINVAL;
		goto unlock_out;
	}

	--pe->nr_processes_left;
	dprintk("%s: nr_processes: %d, nr_processes_left: %d\n",
			__FUNCTION__,
			pe->nr_processes,
			pe->nr_processes_left);

	cpus_to_assign = udp->cpu_info->n_cpus / req.nr_processes;

	/* Find the first unused CPU */
	cpu = cpumask_next_zero(-1, &cpus_used);
	if (cpu >= udp->cpu_info->n_cpus) {
		printk("%s: error: no more CPUs available\n",
				__FUNCTION__);
		ret = -EINVAL;
		goto unlock_out;
	}

	cpu_set(cpu, cpus_used);
	cpu_set(cpu, cpus_to_use);
	cpu_prev = cpu;
	dprintk("%s: CPU %d assigned (first)\n", __FUNCTION__, cpu);

	for (cpus_assigned = 1; cpus_assigned < cpus_to_assign;
			++cpus_assigned) {
		int node;

		cpu_top = NULL;
		/* Find the topology object of the last core assigned */
		list_for_each_entry(cpu_top_i, &udp->cpu_topology_list, chain) {
			if (cpu_top_i->mckernel_cpu_id == cpu_prev) {
				cpu_top = cpu_top_i;
				break;
			}
		}

		if (!cpu_top) {
			printk("%s: error: couldn't find CPU topology info\n",
					__FUNCTION__);
			ret = -EINVAL;
			goto unlock_out;
		}

		/* Find a core sharing the same cache, iterating caches from
		 * the innermost level outwards */
		list_for_each_entry(cache_top, &cpu_top->cache_list, chain) {
			for_each_cpu(cpu, &cache_top->shared_cpu_map) {
				if (!cpu_isset(cpu, cpus_used)) {
					cpu_set(cpu, cpus_used);
					cpu_set(cpu, cpus_to_use);
					cpu_prev = cpu;
					dprintk("%s: CPU %d assigned (same cache L%lu)\n",
							__FUNCTION__, cpu,
							cache_top->saved->level);
					goto next_cpu;
				}
			}
		}

		/* No CPU? Find a core from the same NUMA node */
		node = linux_numa_2_mckernel_numa(udp,
				cpu_to_node(mckernel_cpu_2_linux_cpu(udp, cpu_prev)));

		for_each_cpu_not(cpu, &cpus_used) {
			/* Found one */
			if (node == linux_numa_2_mckernel_numa(udp,
					cpu_to_node(mckernel_cpu_2_linux_cpu(udp, cpu)))) {
				cpu_set(cpu, cpus_used);
				cpu_set(cpu, cpus_to_use);
				cpu_prev = cpu;
				dprintk("%s: CPU %d assigned (same NUMA)\n",
						__FUNCTION__, cpu);
				goto next_cpu;
			}
		}

		/* No CPU? Simply find the next unused one */
		cpu = cpumask_next_zero(-1, &cpus_used);
		if (cpu >= udp->cpu_info->n_cpus) {
			printk("%s: error: no more CPUs available\n",
					__FUNCTION__);
			ret = -EINVAL;
			goto unlock_out;
		}

		cpu_set(cpu, cpus_used);
		cpu_set(cpu, cpus_to_use);
		cpu_prev = cpu;
		dprintk("%s: CPU %d assigned (unused)\n",
				__FUNCTION__, cpu);

next_cpu:
		continue;
	}

	/* Found all cores, let the user know */
	if (copy_to_user(req.cpu_set, &cpus_to_use,
				(req.cpu_set_size < sizeof(cpus_to_use) ?
				 req.cpu_set_size : sizeof(cpus_to_use)))) {
		printk("%s: error copying mask to user\n", __FUNCTION__);
		ret = -EINVAL;
		goto unlock_out;
	}

	cpu = cpumask_next(-1, &cpus_to_use);
	if (copy_to_user(req.target_core, &cpu, sizeof(cpu))) {
		printk("%s: error copying target core to user\n",
				__FUNCTION__);
		ret = -EINVAL;
		goto unlock_out;
	}

	/* Commit used cores to the OS structure */
	memcpy(&pe->cpus_used, &cpus_used, sizeof(cpus_used));

	/* Reset if last process */
	if (pe->nr_processes_left == 0) {
		dprintk("%s: nr_processes: %d (partitioned exec ends)\n",
				__FUNCTION__,
				pe->nr_processes);
		pe->nr_processes = -1;
		memset(&pe->cpus_used, 0, sizeof(pe->cpus_used));
	}

	ret = 0;

unlock_out:
	mutex_unlock(&pe->lock);

	return ret;
}
int mcctrl_add_per_proc_data(struct mcctrl_usrdata *ud, int pid,
		struct mcctrl_per_proc_data *ppd)
{
@@ -1279,6 +1457,9 @@ long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg,
	case MCEXEC_UP_GET_NODES:
		return mcexec_get_nodes(os);

	case MCEXEC_UP_GET_CPUSET:
		return mcexec_get_cpuset(os, arg);

	case MCEXEC_UP_STRNCPY_FROM_USER:
		return mcexec_strncpy_from_user(os,
				(struct strncpy_from_user_desc *)arg);
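
In summary, mcexec_get_cpuset() grants each caller udp->cpu_info->n_cpus / nr_processes cores. A partition grows from its first unused core by the three-level preference visible in the dprintk tags: an unused core sharing a cache with the last assigned core (scanning cache levels innermost first), failing that an unused core on the same McKernel NUMA node, failing that the next unused core of any kind. With 64 McKernel CPUs and -n 4, for instance, each process receives 16 cores, co-located under shared caches whenever possible; the first core of the set is also reported back through req.target_core.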

View File

@@ -61,6 +61,7 @@ static struct ihk_os_user_call_handler mcctrl_uchs[] = {
	{ .request = MCEXEC_UP_SEND_SIGNAL, .func = mcctrl_ioctl },
	{ .request = MCEXEC_UP_GET_CPU, .func = mcctrl_ioctl },
	{ .request = MCEXEC_UP_GET_NODES, .func = mcctrl_ioctl },
	{ .request = MCEXEC_UP_GET_CPUSET, .func = mcctrl_ioctl },
	{ .request = MCEXEC_UP_STRNCPY_FROM_USER, .func = mcctrl_ioctl },
	{ .request = MCEXEC_UP_NEW_PROCESS, .func = mcctrl_ioctl },
	{ .request = MCEXEC_UP_PREPARE_DMA, .func = mcctrl_ioctl },

View File

@@ -298,6 +298,9 @@ int prepare_ikc_channels(ihk_os_t os)
	INIT_LIST_HEAD(&usrdata->cpu_topology_list);
	INIT_LIST_HEAD(&usrdata->node_topology_list);

	mutex_init(&usrdata->part_exec.lock);
	usrdata->part_exec.nr_processes = -1;

	return 0;
}

View File

@@ -254,6 +254,13 @@ struct node_topology {
	struct list_head chain;
};

struct mcctrl_part_exec {
	struct mutex lock;
	int nr_processes;
	int nr_processes_left;
	cpumask_t cpus_used;
};

#define CPU_LONGS (((NR_CPUS) + (BITS_PER_LONG) - 1) / (BITS_PER_LONG))

#define MCCTRL_PER_PROC_DATA_HASH_SHIFT 7
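
struct mcctrl_part_exec is the per-OS bookkeeping for one partitioned-execution round: lock serializes concurrent MCEXEC_UP_GET_CPUSET calls, nr_processes == -1 (set at initialization in prepare_ikc_channels() and restored once the last participant has fetched its set) means no round is in flight, and cpus_used accumulates the cores already handed out. The first caller's -n value therefore fixes the participant count that every later caller must match.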
@@ -284,6 +291,7 @@ struct mcctrl_usrdata {
	nodemask_t numa_online;
	struct list_head cpu_topology_list;
	struct list_head node_topology_list;
	struct mcctrl_part_exec part_exec;
};

struct mcctrl_signal {

View File

@@ -197,19 +197,19 @@ void free_topology_info(ihk_os_t os)
/*
 * CPU and NUMA node mapping conversion functions.
 */
int mckernel_cpu_2_linux_cpu(struct mcctrl_usrdata *udp, int cpu_id)
{
	return (cpu_id < udp->cpu_info->n_cpus) ?
		udp->cpu_info->mapping[cpu_id] : -1;
}

int mckernel_cpu_2_hw_id(struct mcctrl_usrdata *udp, int cpu_id)
{
	return (cpu_id < udp->cpu_info->n_cpus) ?
		udp->cpu_info->hw_ids[cpu_id] : -1;
}

int linux_cpu_2_mckernel_cpu(struct mcctrl_usrdata *udp, int cpu_id)
{
	int i;
@@ -222,7 +222,7 @@ static int linux_cpu_2_mckernel_cpu(struct mcctrl_usrdata *udp, int cpu_id)
}

#if 0
int hw_id_2_mckernel_cpu(struct mcctrl_usrdata *udp, int hw_id)
{
	int i;
@@ -235,7 +235,7 @@ static int hw_id_2_mckernel_cpu(struct mcctrl_usrdata *udp, int hw_id)
	return -1;
}

int hw_id_2_linux_cpu(struct mcctrl_usrdata *udp, int hw_id)
{
	int i;
@@ -248,7 +248,7 @@ static int hw_id_2_linux_cpu(struct mcctrl_usrdata *udp, int hw_id)
	return -1;
}

int linux_cpu_2_hw_id(struct mcctrl_usrdata *udp, int cpu)
{
	int mckernel_cpu = linux_cpu_2_mckernel_cpu(udp, cpu);
@@ -257,13 +257,13 @@ static int linux_cpu_2_hw_id(struct mcctrl_usrdata *udp, int cpu)
}
#endif

int mckernel_numa_2_linux_numa(struct mcctrl_usrdata *udp, int numa_id)
{
	return (numa_id < udp->mem_info->n_numa_nodes) ?
		udp->mem_info->numa_mapping[numa_id] : -1;
}

int linux_numa_2_mckernel_numa(struct mcctrl_usrdata *udp, int numa_id)
{
	int i;
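
The only change in the hunks above is the dropped static qualifier (the old signatures remain visible in the hunk headers): the CPU/NUMA mapping helpers are now exported so that mcexec_get_cpuset() can reach linux_numa_2_mckernel_numa() and mckernel_cpu_2_linux_cpu() through the extern declarations added in the control code.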

View File

@@ -153,6 +153,9 @@ static const char rlimit_stack_envname[] = "MCKERNEL_RLIMIT_STACK";
static int ischild;
static int enable_vdso = 1;

/* Partitioned execution (e.g., for MPI) */
static int nr_processes = 0;

struct fork_sync {
	pid_t pid;
	int status;
@@ -1102,7 +1105,7 @@ static int reduce_stack(struct rlimit *orig_rlim, char *argv[])

void print_usage(char **argv)
{
	fprintf(stderr, "Usage: %s [-c target_core] [-n nr_partitions] [<mcos-id>] (program) [args...]\n", argv[0]);
}

void init_sigaction(void)
@@ -1329,12 +1332,16 @@ int main(int argc, char **argv)
	}

	/* Parse options ("+" denotes stop at the first non-option) */
	while ((opt = getopt_long(argc, argv, "+c:n:", mcexec_options, NULL)) != -1) {
		switch (opt) {
		case 'c':
			target_core = atoi(optarg);
			break;

		case 'n':
			nr_processes = atoi(optarg);
			break;

		case 0: /* long opt */
			break;
@@ -1599,6 +1606,24 @@
		exit(1);
	}

	/* Partitioned execution, obtain CPU set */
	if (nr_processes > 0) {
		struct get_cpu_set_arg cpu_set_arg;

		cpu_set_arg.cpu_set = (void *)&desc->cpu_set;
		cpu_set_arg.cpu_set_size = sizeof(desc->cpu_set);
		cpu_set_arg.nr_processes = nr_processes;
		cpu_set_arg.target_core = &target_core;

		if (ioctl(fd, MCEXEC_UP_GET_CPUSET, (void *)&cpu_set_arg) != 0) {
			perror("getting CPU set for partitioned execution");
			close(fd);
			return 1;
		}

		desc->cpu = target_core;
	}

	if (ioctl(fd, MCEXEC_UP_PREPARE_IMAGE, (unsigned long)desc) != 0) {
		perror("prepare");
		close(fd);
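
On the user side the flow is thus: -n records the partition count, MCEXEC_UP_GET_CPUSET fills desc->cpu_set (consumed later by MCEXEC_UP_PREPARE_IMAGE) and rewrites target_core, and desc->cpu = target_core starts the process on the first core of its partition, overriding any core requested with -c. A launched program can double-check where it landed with the standard Linux affinity API (a generic sketch, independent of this commit):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;
	int cpu;

	/* Query the affinity mask set up for this process */
	if (sched_getaffinity(0, sizeof(set), &set) != 0) {
		perror("sched_getaffinity");
		return 1;
	}

	for (cpu = 0; cpu < CPU_SETSIZE; ++cpu) {
		if (CPU_ISSET(cpu, &set)) {
			printf("CPU %d is in our affinity mask\n", cpu);
		}
	}

	return 0;
}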

View File

@@ -265,6 +265,8 @@ struct thread *create_thread(unsigned long user_pc,
			goto err;
		}

		dkprintf("%s: pid: %d, CPU: %d\n",
				__FUNCTION__, proc->pid, cpu);
		CPU_SET(cpu, &thread->cpu_set);
		cpu_set_empty = 0;
	}