mcexec: detect mismatch of mcexec -n and mpirun -ppn
Change-Id: I0ce1b2d48cda10713920cb88692e107b8c4d3bab Refs: #929
This commit is contained in:
committed by
Masamichi Takagi
parent
45bc6a617a
commit
d1d93d90cc
@ -587,13 +587,14 @@ extern int mckernel_cpu_2_linux_cpu(struct mcctrl_usrdata *udp, int cpu_id);
|
||||
static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
|
||||
{
|
||||
struct mcctrl_usrdata *udp = ihk_host_os_get_usrdata(os);
|
||||
struct mcctrl_part_exec *pe;
|
||||
struct mcctrl_part_exec *pe = NULL, *pe_itr;
|
||||
struct get_cpu_set_arg req;
|
||||
struct mcctrl_cpu_topology *cpu_top, *cpu_top_i;
|
||||
struct cache_topology *cache_top;
|
||||
int cpu, cpus_assigned, cpus_to_assign, cpu_prev;
|
||||
int ret = 0;
|
||||
int mcexec_linux_numa;
|
||||
int pe_list_len = 0;
|
||||
cpumask_t *mcexec_cpu_set = NULL;
|
||||
cpumask_t *cpus_used = NULL;
|
||||
cpumask_t *cpus_to_use = NULL;
|
||||
@ -614,7 +615,7 @@ static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
|
||||
}
|
||||
|
||||
if (copy_from_user(&req, (void *)arg, sizeof(req))) {
|
||||
printk("%s: error copying user request\n", __FUNCTION__);
|
||||
pr_err("%s: error copying user request\n", __func__);
|
||||
ret = -EINVAL;
|
||||
goto put_out;
|
||||
}
|
||||
@ -691,18 +692,48 @@ static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
|
||||
goto put_out;
|
||||
}
|
||||
|
||||
pe = &udp->part_exec;
|
||||
mutex_lock(&udp->part_exec_lock);
|
||||
/* Find part_exec having same node_proxy */
|
||||
list_for_each_entry_reverse(pe_itr, &udp->part_exec_list, chain) {
|
||||
pe_list_len++;
|
||||
if (pe_itr->node_proxy_pid == req.ppid) {
|
||||
pe = pe_itr;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
mutex_lock(&pe->lock);
|
||||
if (!pe) {
|
||||
/* First process to enter CPU partitioning */
|
||||
pr_debug("%s: pe_list_len:%d\n", __func__, pe_list_len);
|
||||
if (pe_list_len >= PE_LIST_MAXLEN) {
|
||||
/* delete head entry of pe_list */
|
||||
pe_itr = list_first_entry(&udp->part_exec_list,
|
||||
struct mcctrl_part_exec, chain);
|
||||
list_del(&pe_itr->chain);
|
||||
kfree(pe_itr);
|
||||
}
|
||||
|
||||
/* First process to enter CPU partitioning */
|
||||
if (pe->nr_processes == -1) {
|
||||
pe = kzalloc(sizeof(struct mcctrl_part_exec), GFP_KERNEL);
|
||||
if (!pe) {
|
||||
mutex_unlock(&udp->part_exec_lock);
|
||||
ret = -ENOMEM;
|
||||
goto put_out;
|
||||
}
|
||||
/* Init part_exec */
|
||||
mutex_init(&pe->lock);
|
||||
INIT_LIST_HEAD(&pe->pli_list);
|
||||
pe->nr_processes = req.nr_processes;
|
||||
pe->nr_processes_left = req.nr_processes;
|
||||
pe->nr_processes_joined = 0;
|
||||
pe->node_proxy_pid = req.ppid;
|
||||
|
||||
list_add_tail(&pe->chain, &udp->part_exec_list);
|
||||
dprintk("%s: nr_processes: %d (partitioned exec starts)\n",
|
||||
__FUNCTION__,
|
||||
pe->nr_processes);
|
||||
__func__, pe->nr_processes);
|
||||
}
|
||||
mutex_unlock(&udp->part_exec_lock);
|
||||
|
||||
mutex_lock(&pe->lock);
|
||||
|
||||
if (pe->nr_processes != req.nr_processes) {
|
||||
printk("%s: error: requested number of processes"
|
||||
@ -712,7 +743,15 @@ static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
|
||||
goto put_and_unlock_out;
|
||||
}
|
||||
|
||||
if (pe->nr_processes_joined >= pe->nr_processes) {
|
||||
printk("%s: too many processes have joined to the group of %d\n",
|
||||
__func__, req.ppid);
|
||||
ret = -EINVAL;
|
||||
goto put_and_unlock_out;
|
||||
}
|
||||
|
||||
--pe->nr_processes_left;
|
||||
++pe->nr_processes_joined;
|
||||
dprintk("%s: nr_processes: %d, nr_processes_left: %d\n",
|
||||
__FUNCTION__,
|
||||
pe->nr_processes,
|
||||
@ -798,8 +837,6 @@ static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
|
||||
wake_up_interruptible(&pli_next->pli_wq);
|
||||
}
|
||||
|
||||
/* Reset process counter to start state */
|
||||
pe->nr_processes = -1;
|
||||
ret = -ETIMEDOUT;
|
||||
goto put_and_unlock_out;
|
||||
}
|
||||
@ -1047,16 +1084,8 @@ next_cpu:
|
||||
/* Commit used cores to OS structure */
|
||||
memcpy(&pe->cpus_used, cpus_used, sizeof(*cpus_used));
|
||||
|
||||
/* Reset if last process */
|
||||
if (pe->nr_processes_left == 0) {
|
||||
dprintk("%s: nr_processes: %d (partitioned exec ends)\n",
|
||||
__FUNCTION__,
|
||||
pe->nr_processes);
|
||||
pe->nr_processes = -1;
|
||||
memset(&pe->cpus_used, 0, sizeof(pe->cpus_used));
|
||||
}
|
||||
/* Otherwise wake up next process in list */
|
||||
else {
|
||||
/* If not last process, wake up next process in list */
|
||||
if (pe->nr_processes_left != 0) {
|
||||
++pe->process_rank;
|
||||
pli_next = list_first_entry(&pe->pli_list,
|
||||
struct process_list_item, list);
|
||||
|
||||
@ -518,6 +518,7 @@ int prepare_ikc_channels(ihk_os_t os)
|
||||
|
||||
init_waitqueue_head(&usrdata->wq_procfs);
|
||||
mutex_init(&usrdata->reserve_lock);
|
||||
mutex_init(&usrdata->part_exec_lock);
|
||||
|
||||
for (i = 0; i < MCCTRL_PER_PROC_DATA_HASH_SIZE; ++i) {
|
||||
INIT_LIST_HEAD(&usrdata->per_proc_data_hash[i]);
|
||||
@ -526,10 +527,8 @@ int prepare_ikc_channels(ihk_os_t os)
|
||||
|
||||
INIT_LIST_HEAD(&usrdata->cpu_topology_list);
|
||||
INIT_LIST_HEAD(&usrdata->node_topology_list);
|
||||
INIT_LIST_HEAD(&usrdata->part_exec_list);
|
||||
|
||||
mutex_init(&usrdata->part_exec.lock);
|
||||
INIT_LIST_HEAD(&usrdata->part_exec.pli_list);
|
||||
usrdata->part_exec.nr_processes = -1;
|
||||
INIT_LIST_HEAD(&usrdata->wakeup_descs_list);
|
||||
spin_lock_init(&usrdata->wakeup_descs_lock);
|
||||
|
||||
@ -585,6 +584,18 @@ void destroy_ikc_channels(ihk_os_t os)
|
||||
|
||||
kfree(usrdata->channels);
|
||||
kfree(usrdata->ikc2linux);
|
||||
|
||||
mutex_lock(&usrdata->part_exec_lock);
|
||||
while (!list_empty(&usrdata->part_exec_list)) {
|
||||
struct mcctrl_part_exec *pe;
|
||||
|
||||
pe = list_first_entry(&usrdata->part_exec_list,
|
||||
struct mcctrl_part_exec, chain);
|
||||
list_del(&pe->chain);
|
||||
kfree(pe);
|
||||
}
|
||||
mutex_unlock(&usrdata->part_exec_lock);
|
||||
|
||||
kfree(usrdata);
|
||||
}
|
||||
|
||||
|
||||
@ -324,13 +324,20 @@ struct process_list_item {
|
||||
wait_queue_head_t pli_wq;
|
||||
};
|
||||
|
||||
#define PE_LIST_MAXLEN 5
|
||||
|
||||
struct mcctrl_part_exec {
|
||||
struct mutex lock;
|
||||
int nr_processes;
|
||||
/* number of processes to let in / out the synchronization point */
|
||||
int nr_processes_left;
|
||||
/* number of processes which have joined the partition */
|
||||
int nr_processes_joined;
|
||||
int process_rank;
|
||||
pid_t node_proxy_pid;
|
||||
cpumask_t cpus_used;
|
||||
struct list_head pli_list;
|
||||
struct list_head chain;
|
||||
};
|
||||
|
||||
#define CPU_LONGS (((NR_CPUS) + (BITS_PER_LONG) - 1) / (BITS_PER_LONG))
|
||||
@ -353,6 +360,7 @@ struct mcctrl_usrdata {
|
||||
int job_pos;
|
||||
int mcctrl_dma_abort;
|
||||
struct mutex reserve_lock;
|
||||
struct mutex part_exec_lock;
|
||||
unsigned long last_thread_exec;
|
||||
wait_queue_head_t wq_procfs;
|
||||
struct list_head per_proc_data_hash[MCCTRL_PER_PROC_DATA_HASH_SIZE];
|
||||
@ -368,7 +376,7 @@ struct mcctrl_usrdata {
|
||||
nodemask_t numa_online;
|
||||
struct list_head cpu_topology_list;
|
||||
struct list_head node_topology_list;
|
||||
struct mcctrl_part_exec part_exec;
|
||||
struct list_head part_exec_list;
|
||||
int perf_event_num;
|
||||
};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user