Compare commits

..

2 Commits

Author SHA1 Message Date
7c0e624b13 spec: prerelase 0.8 for testing mcexec -n issue
Change-Id: Ie54f7bc74097c8390f75ddbd0d6e58a8ea87ea7c
2020-07-21 13:31:45 +09:00
0b66bab992 Revert "mcexec: detect mismatch of mcexec -n and mpirun -ppn"
This reverts commit 1d135492c3.

Conflicts:
	executer/kernel/mcctrl/control.c

Change-Id: I224cced408aa4b77691a153c5e1d2fdf8043fa04
2020-07-21 13:08:21 +09:00
13 changed files with 29 additions and 448 deletions

View File

@ -10,7 +10,7 @@ project(mckernel C ASM)
set(MCKERNEL_VERSION "1.7.0")
# See "Fedora Packaging Guidlines -- Versioning"
set(MCKERNEL_RELEASE "0.7")
set(MCKERNEL_RELEASE "0.8")
set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/modules)
# for rpmbuild

View File

@ -94,7 +94,6 @@ struct get_cpu_set_arg {
char *req_cpu_list; // Requested by user-space
int req_cpu_list_len; // Lenght of request string
int *process_rank;
pid_t ppid;
void *cpu_set;
size_t cpu_set_size; // Size in bytes
int *target_core;

View File

@ -587,14 +587,13 @@ extern int mckernel_cpu_2_linux_cpu(struct mcctrl_usrdata *udp, int cpu_id);
static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
{
struct mcctrl_usrdata *udp = ihk_host_os_get_usrdata(os);
struct mcctrl_part_exec *pe = NULL, *pe_itr;
struct mcctrl_part_exec *pe;
struct get_cpu_set_arg req;
struct mcctrl_cpu_topology *cpu_top, *cpu_top_i;
struct cache_topology *cache_top;
int cpu, cpus_assigned, cpus_to_assign, cpu_prev;
int ret = 0;
int mcexec_linux_numa;
int pe_list_len = 0;
cpumask_t *mcexec_cpu_set = NULL;
cpumask_t *cpus_used = NULL;
cpumask_t *cpus_to_use = NULL;
@ -615,7 +614,7 @@ static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
}
if (copy_from_user(&req, (void *)arg, sizeof(req))) {
pr_err("%s: error copying user request\n", __func__);
printk("%s: error copying user request\n", __FUNCTION__);
ret = -EINVAL;
goto put_out;
}
@ -692,49 +691,19 @@ static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
goto put_out;
}
mutex_lock(&udp->part_exec_lock);
/* Find part_exec having same node_proxy */
list_for_each_entry_reverse(pe_itr, &udp->part_exec_list, chain) {
pe_list_len++;
if (pe_itr->node_proxy_pid == req.ppid) {
pe = pe_itr;
break;
}
}
if (!pe) {
/* First process to enter CPU partitioning */
pr_debug("%s: pe_list_len:%d\n", __func__, pe_list_len);
if (pe_list_len >= PE_LIST_MAXLEN) {
/* delete head entry of pe_list */
pe_itr = list_first_entry(&udp->part_exec_list,
struct mcctrl_part_exec, chain);
list_del(&pe_itr->chain);
kfree(pe_itr);
}
pe = kzalloc(sizeof(struct mcctrl_part_exec), GFP_KERNEL);
if (!pe) {
mutex_unlock(&udp->part_exec_lock);
ret = -ENOMEM;
goto put_out;
}
/* Init part_exec */
mutex_init(&pe->lock);
INIT_LIST_HEAD(&pe->pli_list);
pe->nr_processes = req.nr_processes;
pe->nr_processes_left = req.nr_processes;
pe->nr_processes_joined = 0;
pe->node_proxy_pid = req.ppid;
list_add_tail(&pe->chain, &udp->part_exec_list);
dprintk("%s: nr_processes: %d (partitioned exec starts)\n",
__func__, pe->nr_processes);
}
mutex_unlock(&udp->part_exec_lock);
pe = &udp->part_exec;
mutex_lock(&pe->lock);
/* First process to enter CPU partitioning */
if (pe->nr_processes == -1) {
pe->nr_processes = req.nr_processes;
pe->nr_processes_left = req.nr_processes;
dprintk("%s: nr_processes: %d (partitioned exec starts)\n",
__FUNCTION__,
pe->nr_processes);
}
if (pe->nr_processes != req.nr_processes) {
printk("%s: error: requested number of processes"
" doesn't match current partitioned execution\n",
@ -743,15 +712,7 @@ static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
goto put_and_unlock_out;
}
if (pe->nr_processes_joined >= pe->nr_processes) {
printk("%s: too many processes have joined to the group of %d\n",
__func__, req.ppid);
ret = -EINVAL;
goto put_and_unlock_out;
}
--pe->nr_processes_left;
++pe->nr_processes_joined;
dprintk("%s: nr_processes: %d, nr_processes_left: %d\n",
__FUNCTION__,
pe->nr_processes,
@ -837,6 +798,8 @@ static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
wake_up_interruptible(&pli_next->pli_wq);
}
/* Reset process counter to start state */
pe->nr_processes = -1;
ret = -ETIMEDOUT;
goto put_and_unlock_out;
}
@ -1084,8 +1047,16 @@ next_cpu:
/* Commit used cores to OS structure */
memcpy(&pe->cpus_used, cpus_used, sizeof(*cpus_used));
/* If not last process, wake up next process in list */
if (pe->nr_processes_left != 0) {
/* Reset if last process */
if (pe->nr_processes_left == 0) {
dprintk("%s: nr_processes: %d (partitioned exec ends)\n",
__FUNCTION__,
pe->nr_processes);
pe->nr_processes = -1;
memset(&pe->cpus_used, 0, sizeof(pe->cpus_used));
}
/* Otherwise wake up next process in list */
else {
++pe->process_rank;
pli_next = list_first_entry(&pe->pli_list,
struct process_list_item, list);

View File

@ -513,7 +513,6 @@ int prepare_ikc_channels(ihk_os_t os)
init_waitqueue_head(&usrdata->wq_procfs);
mutex_init(&usrdata->reserve_lock);
mutex_init(&usrdata->part_exec_lock);
for (i = 0; i < MCCTRL_PER_PROC_DATA_HASH_SIZE; ++i) {
INIT_LIST_HEAD(&usrdata->per_proc_data_hash[i]);
@ -522,8 +521,10 @@ int prepare_ikc_channels(ihk_os_t os)
INIT_LIST_HEAD(&usrdata->cpu_topology_list);
INIT_LIST_HEAD(&usrdata->node_topology_list);
INIT_LIST_HEAD(&usrdata->part_exec_list);
mutex_init(&usrdata->part_exec.lock);
INIT_LIST_HEAD(&usrdata->part_exec.pli_list);
usrdata->part_exec.nr_processes = -1;
INIT_LIST_HEAD(&usrdata->wakeup_descs_list);
spin_lock_init(&usrdata->wakeup_descs_lock);
@ -579,18 +580,6 @@ void destroy_ikc_channels(ihk_os_t os)
kfree(usrdata->channels);
kfree(usrdata->ikc2linux);
mutex_lock(&usrdata->part_exec_lock);
while (!list_empty(&usrdata->part_exec_list)) {
struct mcctrl_part_exec *pe;
pe = list_first_entry(&usrdata->part_exec_list,
struct mcctrl_part_exec, chain);
list_del(&pe->chain);
kfree(pe);
}
mutex_unlock(&usrdata->part_exec_lock);
kfree(usrdata);
}

View File

@ -324,20 +324,13 @@ struct process_list_item {
wait_queue_head_t pli_wq;
};
#define PE_LIST_MAXLEN 5
struct mcctrl_part_exec {
struct mutex lock;
int nr_processes;
/* number of processes to let in / out the synchronization point */
int nr_processes_left;
/* number of processes which have joined the partition */
int nr_processes_joined;
int process_rank;
pid_t node_proxy_pid;
cpumask_t cpus_used;
struct list_head pli_list;
struct list_head chain;
};
#define CPU_LONGS (((NR_CPUS) + (BITS_PER_LONG) - 1) / (BITS_PER_LONG))
@ -360,7 +353,6 @@ struct mcctrl_usrdata {
int job_pos;
int mcctrl_dma_abort;
struct mutex reserve_lock;
struct mutex part_exec_lock;
unsigned long last_thread_exec;
wait_queue_head_t wq_procfs;
struct list_head per_proc_data_hash[MCCTRL_PER_PROC_DATA_HASH_SIZE];
@ -376,7 +368,7 @@ struct mcctrl_usrdata {
nodemask_t numa_online;
struct list_head cpu_topology_list;
struct list_head node_topology_list;
struct list_head part_exec_list;
struct mcctrl_part_exec part_exec;
int perf_event_num;
};

View File

@ -2547,7 +2547,6 @@ int main(int argc, char **argv)
cpu_set_arg.cpu_set = (void *)&desc->cpu_set;
cpu_set_arg.cpu_set_size = sizeof(desc->cpu_set);
cpu_set_arg.nr_processes = nr_processes;
cpu_set_arg.ppid = getppid();
cpu_set_arg.target_core = &target_core;
cpu_set_arg.process_rank = &process_rank;
cpu_set_arg.mcexec_linux_numa = &mcexec_linux_numa;

View File

@ -1,145 +0,0 @@
#/bin/sh
USELTP=0
USEOSTEST=0
. ../../common.sh
issue="929"
tid=01
tname=`printf "C${issue}T%02d" ${tid}`
echo "*** ${tname} start *******************************"
TEST_CMD="mpirun -f ./hostfile -ppn 5 ${MCEXEC} -n 5 ./test_prog.sh"
echo ${TEST_CMD}
${TEST_CMD} &> ${tname}.txt
mpi_ret=$?
cat ./${tname}.txt
started_num=`grep 'test_prog is started' ./${tname}.txt | wc -l`
if [ ${mpi_ret} -eq 0 -a ${started_num} -eq 5 ]; then
echo "*** ${tname} PASSED ******************************"
else
echo "*** ${tname} FAILED ******************************"
fi
let tid++
echo ""
tname=`printf "C${issue}T%02d" ${tid}`
echo "*** ${tname} start *******************************"
TEST_CMD="mpirun -f ./hostfile -ppn 5 ${MCEXEC} -n 3 ./test_prog.sh"
echo ${TEST_CMD}
${TEST_CMD} &> ${tname}.txt
mpi_ret=$?
cat ./${tname}.txt
started_num=`grep 'test_prog is started' ./${tname}.txt | wc -l`
if [ ${mpi_ret} -ne 0 -a ${started_num} -eq 3 ]; then
echo "*** ${tname} PASSED ******************************"
else
echo "*** ${tname} FAILED ******************************"
fi
let tid++
echo ""
tname=`printf "C${issue}T%02d" ${tid}`
echo "*** ${tname} start *******************************"
TEST_CMD="mpirun -f ./hostfile -ppn 3 ${MCEXEC} -n 5 ./test_prog.sh"
echo ${TEST_CMD}
${TEST_CMD} &> ${tname}.txt
mpi_ret=$?
cat ./${tname}.txt
started_num=`grep 'test_prog is started' ./${tname}.txt | wc -l`
if [ ${mpi_ret} -ne 0 -a ${started_num} -eq 0 ]; then
echo "*** ${tname} PASSED ******************************"
else
echo "*** ${tname} FAILED ******************************"
fi
let tid++
echo ""
tname=`printf "C${issue}T%02d" ${tid}`
echo "*** ${tname} start *******************************"
TEST_CMD="mpirun -f ./hostfile -ppn 6 ${MCEXEC} -n 3 ./test_prog.sh"
echo ${TEST_CMD}
${TEST_CMD} &> ${tname}.txt
mpi_ret=$?
cat ./${tname}.txt
started_num=`grep 'test_prog is started' ./${tname}.txt | wc -l`
if [ ${mpi_ret} -ne 0 -a ${started_num} -eq 3 ]; then
echo "*** ${tname} PASSED ******************************"
else
echo "*** ${tname} FAILED ******************************"
fi
let tid++
echo ""
tname=`printf "C${issue}T%02d" ${tid}`
echo "*** ${tname} start *******************************"
TEST_CMD="mpirun -f ./hostfile -ppn 250 ${MCEXEC} -n 250 ./test_prog.sh"
echo ${TEST_CMD}
${TEST_CMD} &> ${tname}.txt
mpi_ret=$?
head -n 10 ./${tname}.txt
echo "..."
started_num=`grep 'test_prog is started' ./${tname}.txt | wc -l`
if [ ${mpi_ret} -ne 0 -a ${started_num} -eq 0 ]; then
echo "*** ${tname} PASSED ******************************"
else
echo "*** ${tname} FAILED ******************************"
fi
let tid++
echo ""
tname=`printf "C${issue}T%02d" ${tid}`
echo "*** ${tname} start *******************************"
ng=0
TEST_CMD="mpirun -f ./hostfile -ppn 5 ${MCEXEC} -n 5 ./test_prog.sh"
echo "** reboot mcrernel for check pe_list_len"
mcreboot
echo "** enable debug message in mcexec_get_cpuset"
sudo sh -c "echo -n 'func mcexec_get_cpuset +p' > /sys/kernel/debug/dynamic_debug/control"
echo ${TEST_CMD}
for i in `seq 1 20`
do
${TEST_CMD} &> ${tname}.txt
mpi_ret=$?
started_num=`grep 'test_prog is started' ./${tname}.txt | wc -l`
if [ ${mpi_ret} -eq 0 -a ${started_num} -eq 5 ]; then
echo "[OK] exec: $i"
else
echo "[NG] exec: $i"
let ng++
fi
done
echo "** check pe_list_len"
dmesg --notime | grep "mcexec_get_cpuset: pe_list" | tail -n 20 | cut -f 2-3 -d ':' > ./pe_list_len.txt
cat ./pe_list_len.txt | while read line
do
len=`echo ${line} | cut -f 2 -d ':'`
if [ ${len} -ge 0 -a ${len} -le 5 ]; then
echo "[OK] ${line}"
else
echo "[NG] ${line}"
let ng++
fi
done
echo "** disable debug message in mcexec_get_cpuset"
sudo sh -c "echo -n 'func mcexec_get_cpuset -p' > /sys/kernel/debug/dynamic_debug/control"
if [ ${ng} -eq 0 ]; then
echo "*** ${tname} PASSED ******************************"
else
echo "*** ${tname} FAILED ******************************"
fi
let tid++
echo ""

View File

@ -1,11 +0,0 @@
CFLAGS=-g
LDFLAGS=
TARGET=
all: $(TARGET)
test: all
./C929.sh
clean:
rm -f $(TARGET) *.o *.txt

View File

@ -1,36 +0,0 @@
【Issue#929 動作確認】
□ テスト内容
1. mpirunで指定する-ppnと、mcexecで指定する-n の指定状況ごとに
想定どおりの動作となることを確認
C929T01:
-ppn == -n の場合に、プログラムが実行され、mpirunが成功する
C929T02:
-ppn > -n の場合に、プログラムの一部が実行され、mpirunが失敗する
C929T03:
-ppn < -n の場合に、プログラムが実行されず、mpirunが失敗する
C929T04:
-ppn が -n の整数倍である場合に、プログラムの一部が実行され、mpirunが失敗する
C929T05:
-ppn と -n がMcKernelに割り当てたCPU数よりも大きい場合に、
プログラムが実行されず、mpirunが失敗する
C929T06:
-ppn == -n での正常実行を20回連続で行った場合に、
プログラムが実行され、mpirunが成功する
また、mcctrlで管理しているpart_exec_list の要素数が5を超えない
□ 実行手順
$ make test
McKernelのインストール先や、OSTEST, LTPの配置場所は、
$HOME/.mck_test_config を参照している
.mck_test_config は、McKernelをビルドした際に生成されるmck_test_config.sample ファイルを
$HOMEにコピーし、適宜編集する
□ 実行結果
x86_64_result.log aarch64_result.log 参照。
すべての項目をPASSしていることを確認。

View File

@ -1,99 +0,0 @@
*** C929T01 start *******************************
mpirun -f ./hostfile -ppn 5 /home/satoken/ihk+mckernel/bin/mcexec -n 5 ./test_prog.sh
test_prog is started.
test_prog is started.
test_prog is started.
test_prog is started.
test_prog is started.
*** C929T01 PASSED ******************************
*** C929T02 start *******************************
mpirun -f ./hostfile -ppn 5 /home/satoken/ihk+mckernel/bin/mcexec -n 3 ./test_prog.sh
getting CPU set for partitioned execution: Invalid argument
getting CPU set for partitioned execution: Invalid argument
test_prog is started.
test_prog is started.
test_prog is started.
*** C929T02 PASSED ******************************
*** C929T03 start *******************************
mpirun -f ./hostfile -ppn 3 /home/satoken/ihk+mckernel/bin/mcexec -n 5 ./test_prog.sh
getting CPU set for partitioned execution: Connection timed out
getting CPU set for partitioned execution: Connection timed out
getting CPU set for partitioned execution: Connection timed out
*** C929T03 PASSED ******************************
*** C929T04 start *******************************
mpirun -f ./hostfile -ppn 6 /home/satoken/ihk+mckernel/bin/mcexec -n 3 ./test_prog.sh
getting CPU set for partitioned execution: Invalid argument
getting CPU set for partitioned execution: Invalid argument
getting CPU set for partitioned execution: Invalid argument
test_prog is started.
test_prog is started.
test_prog is started.
*** C929T04 PASSED ******************************
*** C929T05 start *******************************
mpirun -f ./hostfile -ppn 250 /home/satoken/ihk+mckernel/bin/mcexec -n 250 ./test_prog.sh
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
...
*** C929T05 PASSED ******************************
*** C929T06 start *******************************
** reboot mcrernel for check pe_list_len
mcreboot.sh -c 1-6,29-34 -m 50G@0,50G@1 -r 1-6:0+29-34:28 -O ... done
** enable debug message in mcexec_get_cpuset
mpirun -f ./hostfile -ppn 5 /home/satoken/ihk+mckernel/bin/mcexec -n 5 ./test_prog.sh
[OK] exec: 1
[OK] exec: 2
[OK] exec: 3
[OK] exec: 4
[OK] exec: 5
[OK] exec: 6
[OK] exec: 7
[OK] exec: 8
[OK] exec: 9
[OK] exec: 10
[OK] exec: 11
[OK] exec: 12
[OK] exec: 13
[OK] exec: 14
[OK] exec: 15
[OK] exec: 16
[OK] exec: 17
[OK] exec: 18
[OK] exec: 19
[OK] exec: 20
** check pe_list_len
[OK] pe_list_len:0
[OK] pe_list_len:1
[OK] pe_list_len:2
[OK] pe_list_len:3
[OK] pe_list_len:4
[OK] pe_list_len:5
[OK] pe_list_len:5
[OK] pe_list_len:5
[OK] pe_list_len:5
[OK] pe_list_len:5
[OK] pe_list_len:5
[OK] pe_list_len:5
[OK] pe_list_len:5
[OK] pe_list_len:5
[OK] pe_list_len:5
[OK] pe_list_len:5
[OK] pe_list_len:5
[OK] pe_list_len:5
[OK] pe_list_len:5
[OK] pe_list_len:5
** disable debug message in mcexec_get_cpuset
*** C929T06 PASSED ******************************

View File

@ -1 +0,0 @@
localhost

View File

@ -1,3 +0,0 @@
#!/bin/sh
echo "test_prog is started."

View File

@ -1,74 +0,0 @@
*** C929T01 start *******************************
mpirun -f ./hostfile -ppn 5 /home/satoken/ihk+mckernel/bin/mcexec -n 5 ./test_prog.sh
test_prog is started.
test_prog is started.
test_prog is started.
test_prog is started.
test_prog is started.
*** C929T01 PASSED ******************************
*** C929T02 start *******************************
mpirun -f ./hostfile -ppn 5 /home/satoken/ihk+mckernel/bin/mcexec -n 3 ./test_prog.sh
getting CPU set for partitioned execution: Invalid argument
getting CPU set for partitioned execution: Invalid argument
test_prog is started.
test_prog is started.
test_prog is started.
*** C929T02 PASSED ******************************
*** C929T03 start *******************************
mpirun -f ./hostfile -ppn 3 /home/satoken/ihk+mckernel/bin/mcexec -n 5 ./test_prog.sh
getting CPU set for partitioned execution: Connection timed out
getting CPU set for partitioned execution: Connection timed out
getting CPU set for partitioned execution: Connection timed out
*** C929T03 PASSED ******************************
*** C929T04 start *******************************
mpirun -f ./hostfile -ppn 6 /home/satoken/ihk+mckernel/bin/mcexec -n 3 ./test_prog.sh
getting CPU set for partitioned execution: Invalid argument
getting CPU set for partitioned execution: Invalid argument
getting CPU set for partitioned execution: Invalid argument
test_prog is started.
test_prog is started.
test_prog is started.
*** C929T04 PASSED ******************************
*** C929T05 start *******************************
mpirun -f ./hostfile -ppn 250 /home/satoken/ihk+mckernel/bin/mcexec -n 250 ./test_prog.sh
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
error: nr_processes can't exceed nr. of CPUs
...
*** C929T05 PASSED ******************************
*** C929T06 start *******************************
mpirun -f ./hostfile -ppn 5 /home/satoken/ihk+mckernel/bin/mcexec -n 5 ./test_prog.sh
[OK] exec: 1
[OK] exec: 2
[OK] exec: 3
[OK] exec: 4
[OK] exec: 5
[OK] exec: 6
[OK] exec: 7
[OK] exec: 8
[OK] exec: 9
[OK] exec: 10
[OK] exec: 11
[OK] exec: 12
[OK] exec: 13
[OK] exec: 14
[OK] exec: 15
[OK] exec: 16
[OK] exec: 17
[OK] exec: 18
[OK] exec: 19
[OK] exec: 20
*** C929T06 PASSED ******************************