diff --git a/.gitignore b/.gitignore index c304f0c6..ab70c188 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +*~ *.o *.elf *.bin diff --git a/test/uti/CT01.c b/test/uti/CT01.c new file mode 100644 index 00000000..5a209b47 --- /dev/null +++ b/test/uti/CT01.c @@ -0,0 +1,137 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +pthread_mutex_t mutex1; +pthread_cond_t cond1; +pthread_mutex_t mutex2; +pthread_cond_t cond2; +char *m; +int flag1, flag2; + +int sigst; +pthread_t thr; +/* SIGSEGV handler. With sigst == 1 the fault is the expected one (main read the page after util_thread unmapped it): report OK, join the helper thread and exit(0). Any other SIGSEGV prints BAD SIGSEGV and exits with 1. */ +void +sigsegv(int s) +{ + if (sigst == 1) { + fprintf(stderr, "CT01007 munmap OK (SIGSEGV)\n"); + pthread_join(thr, NULL); + fprintf(stderr, "CT01008 exit(pthread_join) OK\n"); + fprintf(stderr, "CT01009 futex (pthread_mutex/pthread_cond) OK\n"); + fprintf(stderr, "CT01010 END\n"); + exit(0); + } + printf("BAD SIGSEGV\n"); + exit(1); +} +/* Helper thread: expects syscall(732) to return -1, maps one 4096-byte page into the global m and stores "mmap OK" in it, wakes main through flag1/cond1, waits for flag2/cond2, unmaps the page and wakes main again. Always returns NULL; exits the process with 1 on any failed step. */ +void * +util_thread(void *arg) +{ + int rc; + + rc = syscall(732); + if (rc == -1) + fprintf(stderr, "CT01003 running on Linux OK\n"); + else { + fprintf(stderr, "CT01003 running on McKernel NG\n"); + exit(1); + } + errno = 0; + m = mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (m != (void *)-1) { + fprintf(stderr, "CT01004 mmap OK\n"); + } + else { + fprintf(stderr, "CT01004 mmap NG errno=%d\n", errno); + exit(1); + } + strcpy(m, "mmap OK"); + pthread_mutex_lock(&mutex1); + flag1 = 1; /* step 1 done: the page is mapped and filled */ + pthread_cond_signal(&cond1); + pthread_mutex_unlock(&mutex1); + + pthread_mutex_lock(&mutex2); + while(!flag2) { + pthread_cond_wait(&cond2, &mutex2); + } + flag2 = 0; + pthread_mutex_unlock(&mutex2); + rc = munmap(m, 4096); + if (rc == 0) { + fprintf(stderr, "CT01006 munmap OK\n"); + } + else { + fprintf(stderr, "CT01006 munmap NG errno=%d\n", errno); + exit(1); + } + + pthread_mutex_lock(&mutex1); + flag1 = 1; /* step 2 done: the page is unmapped */ + pthread_cond_signal(&cond1); + pthread_mutex_unlock(&mutex1); + return NULL; +} + +int +main(int argc, char **argv) +{ + int rc; + + signal(SIGSEGV,
sigsegv); + pthread_mutex_init(&mutex1, NULL); + pthread_cond_init(&cond1, NULL); + pthread_mutex_init(&mutex2, NULL); + pthread_cond_init(&cond2, NULL); + + fprintf(stderr, "CT01001 mmap/munmap/futex/exit START\n"); + rc = syscall(731, 1, NULL); /* syscall 731 is named util_indicate_clone in the message below; a failure is only logged */ + if (rc) { + fprintf(stderr, "util_indicate_clone rc=%d, errno=%d\n", rc, errno); + fflush(stderr); + } + rc = pthread_create(&thr, NULL, util_thread, NULL); + if(rc){ + fprintf(stderr, "pthread_create: %d\n", rc); + exit(1); + } + fprintf(stderr, "CT01002 pthread_create OK\n"); + pthread_mutex_lock(&mutex1); /* wait until util_thread has mapped and filled m */ + while(!flag1) { + pthread_cond_wait(&cond1, &mutex1); + } + flag1 = 0; + pthread_mutex_unlock(&mutex1); + + fprintf(stderr, "CT01005 %s\n", m); + pthread_mutex_lock(&mutex2); /* let util_thread go on to munmap */ + flag2 = 1; + pthread_cond_signal(&cond2); + pthread_mutex_unlock(&mutex2); + + pthread_mutex_lock(&mutex1); /* wait until util_thread reports the munmap */ + while(!flag1) { + pthread_cond_wait(&cond1, &mutex1); + } + flag1 = 0; + pthread_mutex_unlock(&mutex1); + + sigst = 1; /* from here on the sigsegv handler treats a fault as the expected result */ + fprintf(stderr, "%s\n", m); /* reads m after it was unmapped; getting past this line means no fault was raised */ + fprintf(stderr, "CT01007 munmap NG\n"); + pthread_join(thr, NULL); + fprintf(stderr, "CT01008 exit(pthread_join) OK\n"); + fprintf(stderr, "CT01009 futex (pthread_mutex/pthread_cond) OK\n"); + fprintf(stderr, "CT01010 END\n"); + exit(0); +} diff --git a/test/uti/CT01.sh b/test/uti/CT01.sh new file mode 100755 index 00000000..7756e8cb --- /dev/null +++ b/test/uti/CT01.sh @@ -0,0 +1,86 @@ +#!/usr/bin/bash + +#!/usr/bin/bash -x + +MYHOME=$HOME +UTI_TOP=${MYHOME}/project/os/mckernel/test/uti + +MCK=${MYHOME}/project/os/install +unset DISABLE_UTI + +cmdline="./CT01" + +stop=0 +reboot=0 +go=0 + +mck=0 +nloops=1 + +while getopts srgac:n:mdl: OPT +do + case ${OPT} in + s) stop=1 + ;; + r) reboot=1 + ;; + g) go=1 + ;; + a) async=1 + ;; + c) cmdline=$OPTARG + ;; + n) ndoubles=$OPTARG + ;; + m) + mck=1 + ;; + d) export DISABLE_UTI=1 + ;; + l) nloops=$OPTARG + ;; + *) echo "invalid option -${OPT}" >&2 + exit 1 + esac +done + +if [ ${mck} -eq 1 ]; then + MCEXEC="${MCK}/bin/mcexec" +else + MCEXEC=
+fi + +if [ ${stop} -eq 1 ]; then + if [ ${mck} -eq 1 ]; then + sudo ${MCK}/sbin/mcstop+release.sh + else + : + fi +fi + +if [ ${reboot} -eq 1 ]; then + if [ ${mck} -eq 1 ]; then + if hostname | grep ofp &>/dev/null; then + sudo ${MCK}/sbin/mcreboot.sh -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1 + else + sudo ${MCK}/sbin/mcreboot.sh -c 1-15,65-79,129-143,193-207,17-31,81-95,145-159,209-223,33-47,97-111,161-175,225-239,49-63,113-127,177-191,241-255 -r 1-15:0+65-79:64+129-143:128+193-207:192+17-31:16+81-95:80+145-159:144+209-223:208+33-47:32+97-111:96+161-175:160+225-239:224+49-63:48+113-127:112+177-191:176+241-255:240 -m 12G@0,12G@1,12G@2,12G@3,3920M@4,3920M@5,3920M@6,3920M@7 + fi + else + : + fi +fi + +if [ ${go} -eq 1 ]; then + cd ${UTI_TOP} || exit 1 # never build or run from the wrong directory + make $cmdline + for i in `seq 1 ${nloops}`; do + ${MCK}/bin/mcexec --enable-uti $cmdline # NOTE(review): MCEXEC computed from -m is never used; mcexec is always invoked here - confirm intent + wait + echo =====; + echo $i; + echo =====; i=$((i+1)); + done +fi + + + diff --git a/test/uti/CT02.c b/test/uti/CT02.c new file mode 100644 index 00000000..fdfe51d2 --- /dev/null +++ b/test/uti/CT02.c @@ -0,0 +1,162 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int flag1; +pthread_mutex_t mutex1; +pthread_cond_t cond1; + +int flag2; +pthread_mutex_t mutex2; +pthread_cond_t cond2; +char *m; + +int sigst; +pthread_t thr; + +void +sigsegv(int s) +{ + if (sigst == 1) { +
fprintf(stderr, "CT02007 mremap OK (SIGSEGV)\n"); + pthread_mutex_lock(&mutex2); /* release util_thread so that it can munmap and finish */ + flag2 = 1; + pthread_cond_signal(&cond2); + pthread_mutex_unlock(&mutex2); + pthread_join(thr, NULL); + fprintf(stderr, "CT02009 pthread_join OK\n"); + fprintf(stderr, "CT02010 END\n"); + exit(0); + } + printf("BAD SIGSEGV\n"); + exit(1); +} +/* Helper thread: expects syscall(732) to return -1, maps 8192 bytes into the global m and stores "mmap OK" at offset 4096, then (paced by main through flag1/cond1 and flag2/cond2) shrinks the mapping to 4096 bytes with mremap and finally unmaps it. Always returns NULL; exits the process with 1 on any failed step. */ +void * +util_thread(void *arg) +{ + int rc; + char *n; + + rc = syscall(732); + if (rc == -1) + fprintf(stderr, "CT02003 get_system OK\n"); + else { + fprintf(stderr, "CT02003 get_system NG get_system=%d\n", rc); + exit(1); + } + errno = 0; + m = mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (m != (void *)-1) { + fprintf(stderr, "CT02004 mmap OK\n"); + } + else { + fprintf(stderr, "CT02004 mmap NG errno=%d\n", errno); + exit(1); + } + strcpy(m + 4096, "mmap OK"); /* the marker sits in the second 4096-byte half, the part mremap removes below */ + pthread_mutex_lock(&mutex1); + flag1 = 1; + pthread_cond_signal(&cond1); + pthread_mutex_unlock(&mutex1); + pthread_mutex_lock(&mutex2); + while (!flag2) { + pthread_cond_wait(&cond2, &mutex2); + } + flag2 = 0; + pthread_mutex_unlock(&mutex2); + n = mremap(m, 8192, 4096, 0); /* shrink in place; a moved mapping is treated as a test stop */ + if (n == m) { + fprintf(stderr, "CT02006 mremap OK\n"); + } + else if (n != (void *)-1){ + fprintf(stderr, "CT02006 mremap remapped, test stop\n"); + exit(1); + } + else { + fprintf(stderr, "CT02006 mremap NG errno=%d\n", errno); + exit(1); + } + pthread_mutex_lock(&mutex1); + flag1 = 1; + pthread_cond_signal(&cond1); + pthread_mutex_unlock(&mutex1); + + pthread_mutex_lock(&mutex2); + while (!flag2) { + pthread_cond_wait(&cond2, &mutex2); + } + flag2 = 0; + pthread_mutex_unlock(&mutex2); + rc = munmap(m, 4096); + if (rc == 0) { + fprintf(stderr, "CT02008 munmap OK\n"); + } + else { + fprintf(stderr, "CT02008 munmap NG errno=%d\n", errno); + exit(1); + } + return NULL; +} + +int +main(int argc, char **argv) +{ + int rc; + + signal(SIGSEGV, sigsegv); + pthread_mutex_init(&mutex1, NULL); + pthread_cond_init(&cond1, NULL); + pthread_mutex_init(&mutex2, NULL); +
pthread_cond_init(&cond2, NULL); + + fprintf(stderr, "CT02001 mremap START\n"); + rc = syscall(731, 1, NULL); /* syscall 731 is named util_indicate_clone in the message below; a failure is only logged */ + if (rc) { + fprintf(stderr, "util_indicate_clone rc=%d, errno=%d\n", rc, errno); + fflush(stderr); + } + rc = pthread_create(&thr, NULL, util_thread, NULL); + if(rc){ + fprintf(stderr, "pthread_create: %d\n", rc); + exit(1); + } + fprintf(stderr, "CT02002 pthread_create OK\n"); + pthread_mutex_lock(&mutex1); /* wait for the mapping to be created and filled */ + while (!flag1) { + pthread_cond_wait(&cond1, &mutex1); + } + flag1 = 0; + pthread_mutex_unlock(&mutex1); + fprintf(stderr, "CT02005 %s\n", m + 4096); + + pthread_mutex_lock(&mutex2); /* let util_thread call mremap */ + flag2 = 1; + pthread_cond_signal(&cond2); + pthread_mutex_unlock(&mutex2); + + pthread_mutex_lock(&mutex1); /* wait for the mremap to be reported */ + while (!flag1) { + pthread_cond_wait(&cond1, &mutex1); + } + flag1 = 0; + pthread_mutex_unlock(&mutex1); + sigst = 1; /* a fault from here on is the expected result */ + fprintf(stderr, "%s\n", m + 4096); /* reads past the shrunken 4096-byte mapping; getting past this line means no fault was raised */ + fprintf(stderr, "CT02007 mremap NG\n"); + pthread_mutex_lock(&mutex2); + flag2 = 1; + pthread_cond_signal(&cond2); + pthread_mutex_unlock(&mutex2); + pthread_join(thr, NULL); + fprintf(stderr, "CT02009 pthread_join OK\n"); + fprintf(stderr, "CT02010 END\n"); + exit(0); +} diff --git a/test/uti/CT03.c b/test/uti/CT03.c new file mode 100644 index 00000000..6f79b2d5 --- /dev/null +++ b/test/uti/CT03.c @@ -0,0 +1,171 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int flag1; +pthread_mutex_t mutex1; +pthread_cond_t cond1; + +int flag2; +pthread_mutex_t mutex2; +pthread_cond_t cond2; + +char *m; + +int sigst; +pthread_t thr; +/* SIGSEGV handler. With sigst == 1 the fault is the expected one: report OK, release util_thread through flag2/cond2, join it and exit(0). Any other SIGSEGV prints BAD SIGSEGV and exits with 1. */ +void +sigsegv(int s) +{ + if (sigst == 1) { + fprintf(stderr, "CT03007 mprotect OK (SIGSEGV)\n"); + + pthread_mutex_lock(&mutex2); + flag2 = 1; + pthread_cond_signal(&cond2); + pthread_mutex_unlock(&mutex2); + + pthread_join(thr, NULL); + fprintf(stderr, "CT03009 pthread_join OK\n"); + fprintf(stderr, "CT03010 END\n"); + exit(0); + } + printf("BAD SIGSEGV\n"); + exit(1); +} + +void * +util_thread(void *arg) +{ + int rc;
+ + rc = syscall(732); /* syscall 732 is named get_system in the messages below; -1 is the expected value here */ + if (rc == -1) + fprintf(stderr, "CT03003 get_system OK\n"); + else { + fprintf(stderr, "CT03003 get_system NG get_system=%d\n", rc); + exit(1); + } + errno = 0; + m = mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (m != (void *)-1) { + fprintf(stderr, "CT03004 mmap OK\n"); + } + else { + fprintf(stderr, "CT03004 mmap NG errno=%d\n", errno); + exit(1); + } + strcpy(m, "mmap OK"); + + pthread_mutex_lock(&mutex1); /* tell main the page is mapped and filled */ + flag1 = 1; + pthread_cond_signal(&cond1); + pthread_mutex_unlock(&mutex1); + + pthread_mutex_lock(&mutex2); + while (!flag2) { + pthread_cond_wait(&cond2, &mutex2); + } + flag2 = 0; + pthread_mutex_unlock(&mutex2); + + rc = mprotect(m, 4096, PROT_READ); /* make the page read-only; main's later strcpy into it is the probe */ + if (rc == 0) { + fprintf(stderr, "CT03006 mprotect OK\n"); + } + else { + fprintf(stderr, "CT03006 mprotect NG errno=%d\n", errno); + exit(1); + } + pthread_mutex_lock(&mutex1); /* tell main the protection change is done */ + flag1 = 1; + pthread_cond_signal(&cond1); + pthread_mutex_unlock(&mutex1); + + pthread_mutex_lock(&mutex2); + while (!flag2) { + pthread_cond_wait(&cond2, &mutex2); + } + flag2 = 0; + pthread_mutex_unlock(&mutex2); + + rc = munmap(m, 4096); + if (rc == 0) { + fprintf(stderr, "CT03008 munmap OK\n"); + } + else { + fprintf(stderr, "CT03008 munmap NG errno=%d\n", errno); + exit(1); + } + return NULL; +} + +int +main(int argc, char **argv) +{ + int rc; + + signal(SIGSEGV, sigsegv); + pthread_mutex_init(&mutex1, NULL); + pthread_cond_init(&cond1, NULL); + pthread_mutex_init(&mutex2, NULL); + pthread_cond_init(&cond2, NULL); + + fprintf(stderr, "CT03001 mprotect START\n"); + rc = syscall(731, 1, NULL); /* syscall 731 is named util_indicate_clone in the message below; a failure is only logged */ + if (rc) { + fprintf(stderr, "util_indicate_clone rc=%d, errno=%d\n", rc, errno); + fflush(stderr); + } + rc = pthread_create(&thr, NULL, util_thread, NULL); + if(rc){ + fprintf(stderr, "pthread_create: %d\n", rc); + exit(1); + } + fprintf(stderr, "CT03002 pthread_create OK\n"); + + pthread_mutex_lock(&mutex1); + while (!flag1) { + pthread_cond_wait(&cond1, &mutex1); + } + flag1
= 0; + pthread_mutex_unlock(&mutex1); + + fprintf(stderr, "CT03005 %s\n", m); + + pthread_mutex_lock(&mutex2); /* let util_thread call mprotect */ + flag2 = 1; + pthread_cond_signal(&cond2); + pthread_mutex_unlock(&mutex2); + + + pthread_mutex_lock(&mutex1); /* wait for the mprotect to be reported */ + while (!flag1) { + pthread_cond_wait(&cond1, &mutex1); + } + flag1 = 0; + pthread_mutex_unlock(&mutex1); + + sigst = 1; /* a fault from here on is the expected result */ + strcpy(m, "mprotect NG"); /* writes to the page util_thread made PROT_READ; getting past this line means no fault was raised */ + fprintf(stderr, "%s\n", m); + fprintf(stderr, "CT03007 mprotect NG\n"); + + pthread_mutex_lock(&mutex2); + flag2 = 1; + pthread_cond_signal(&cond2); + pthread_mutex_unlock(&mutex2); + + pthread_join(thr, NULL); + fprintf(stderr, "CT03009 pthread_join OK\n"); + fprintf(stderr, "CT03010 END\n"); + exit(0); +} diff --git a/test/uti/CT04.c b/test/uti/CT04.c new file mode 100644 index 00000000..7ecd17b0 --- /dev/null +++ b/test/uti/CT04.c @@ -0,0 +1,106 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int flag1; +pthread_mutex_t mutex1; +pthread_cond_t cond1; + +int flag2; +pthread_mutex_t mutex2; +pthread_cond_t cond2; + +char *a; +char *b; +char *c; + +/* Helper thread: expects syscall(732) to return -1, records the current break in a, grows it by 4096 bytes and stores "sbrk OK" at a, then waits for main and compares its own sbrk(0) with the value main stored in c. Always returns NULL. */ +void * +util_thread(void *arg) +{ + int rc; + + rc = syscall(732); + if (rc == -1) + fprintf(stderr, "CT04003 get_system OK\n"); + else { + fprintf(stderr, "CT04003 get_system NG get_system=%d\n", rc); + exit(1); + } + errno = 0; + a = sbrk(0); + fprintf(stderr, "CT04004 sbrk OK\n"); + b = sbrk(4096); /* NOTE(review): the result is not checked before a is written to - confirm a failed sbrk cannot occur here */ + strcpy(a, "sbrk OK"); + + pthread_mutex_lock(&mutex1); + flag1 = 1; + pthread_cond_signal(&cond1); + pthread_mutex_unlock(&mutex1); + + pthread_mutex_lock(&mutex2); + while(!flag2) { + pthread_cond_wait(&cond2, &mutex2); + } + flag2 = 0; + pthread_mutex_unlock(&mutex2); + + b = sbrk(0); /* compare this thread's view of the break with the one main sampled into c */ + if (c == b) { + fprintf(stderr, "CT04006 sbrk OK\n"); + } + else { + fprintf(stderr, "CT04006 sbrk NG %p != %p\n", c, b); + } + return NULL; +} + +int +main(int argc, char **argv) +{ + pthread_t thr; + int rc; + + pthread_mutex_init(&mutex1, NULL); + pthread_cond_init(&cond1, NULL); +
pthread_mutex_init(&mutex2, NULL); + pthread_cond_init(&cond2, NULL); + + fprintf(stderr, "CT04001 brk START\n"); + rc = syscall(731, 1, NULL); /* syscall 731 is named util_indicate_clone in the message below; a failure is only logged */ + if (rc) { + fprintf(stderr, "util_indicate_clone rc=%d, errno=%d\n", rc, errno); + fflush(stderr); + } + rc = pthread_create(&thr, NULL, util_thread, NULL); + if(rc){ + fprintf(stderr, "pthread_create: %d\n", rc); + exit(1); + } + fprintf(stderr, "CT04002 pthread_create OK\n"); + + pthread_mutex_lock(&mutex1); /* wait until util_thread has grown the break and written through a */ + while(!flag1) { + pthread_cond_wait(&cond1, &mutex1); + } + flag1 = 0; + pthread_mutex_unlock(&mutex1); + fprintf(stderr, "CT04005 %s\n", a); + + c = sbrk(0); /* main's view of the break, sampled before util_thread is released */ + pthread_mutex_lock(&mutex2); + flag2 = 1; + pthread_cond_signal(&cond2); + pthread_mutex_unlock(&mutex2); + pthread_join(thr, NULL); + fprintf(stderr, "CT04007 pthread_join OK\n"); + fprintf(stderr, "CT04008 END\n"); + exit(0); +} diff --git a/test/uti/CT05.c b/test/uti/CT05.c new file mode 100644 index 00000000..ad5d4918 --- /dev/null +++ b/test/uti/CT05.c @@ -0,0 +1,67 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +/* Thread body: expects syscall(732) to return 0, records gettid, calls syscall(730) (named util_migrate_inter_kernel in the messages), then expects syscall(732) to return -1 and gettid to be unchanged. Always returns NULL. */ +void * +util_thread(void *arg) +{ + int rc; + int tid; + + rc = syscall(732); + if (rc == 0) + fprintf(stderr, "CT05003 get_system OK\n"); + else { + fprintf(stderr, "CT05003 get_system NG get_system=%d\n", rc); + exit(1); + } + tid = syscall(SYS_gettid); + fprintf(stderr, "CT05004 gettid OK %d\n", tid); + rc = syscall(730); /* a failure is reported but the test continues */ + if (rc == 0) { + fprintf(stderr, "CT05005 util_migrate_inter_kernel OK\n"); + } + else { + fprintf(stderr, "CT05005 util_migrate_inter_kernel NG rc=%d errno=%d\n", rc, errno); + } + rc = syscall(732); + if (rc == -1) + fprintf(stderr, "CT05006 get_system OK\n"); + else { + fprintf(stderr, "CT05006 get_system NG get_system=%d\n", rc); + exit(1); + } + if ((rc = syscall(SYS_gettid)) == tid) { + fprintf(stderr, "CT05007 gettid OK %d\n", tid); + } + else { + fprintf(stderr, "CT05007 gettid NG %d\n", rc); + } + return NULL; +} + +int +main(int argc, char
**argv) +{ + pthread_t thr; + int rc; + + fprintf(stderr, "CT05001 gettid START\n"); + rc = pthread_create(&thr, NULL, util_thread, NULL); + if(rc){ + fprintf(stderr, "pthread_create: %d\n", rc); + exit(1); + } + fprintf(stderr, "CT05002 pthread_create OK\n"); + pthread_join(thr, NULL); + fprintf(stderr, "CT05008 pthread_join OK\n"); + fprintf(stderr, "CT05009 END\n"); + exit(0); +} diff --git a/test/uti/CT06.c b/test/uti/CT06.c new file mode 100644 index 00000000..61d1d238 --- /dev/null +++ b/test/uti/CT06.c @@ -0,0 +1,79 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +/* Helper thread: expects syscall(732) to return -1, then ends the whole process with exit_group(99); the return statement is only reached if that syscall comes back. */ +void * +util_thread(void *arg) +{ + long rc; + + rc = syscall(732); + if (rc == -1) + fprintf(stderr, "CT06003 get_system OK\n"); + else { + fprintf(stderr, "CT06003 get_system NG get_system=%ld\n", rc); + exit(1); + } + + syscall(SYS_exit_group, 99); + + return NULL; +} +/* Forks: the parent waits for the child and passes only if it exited normally with status 99; the child creates util_thread and reports NG if pthread_join ever returns. */ +int +main(int argc, char **argv) +{ + int rc; + pthread_t thr; + int st; + pid_t pid; + + fprintf(stderr, "CT06001 syscall error START\n"); + + pid = fork(); + if (pid) { + if (pid == -1) { + perror("fork"); + exit(1); + } + while ((rc = waitpid(pid, &st, 0)) == -1 && errno == EINTR); /* retry when interrupted by a signal */ + if (rc == -1) { + fprintf(stderr, "CT06004 exit_group NG rc=%d errno=%d\n", rc, errno); + exit(1); + } + if (!WIFEXITED(st)) { + fprintf(stderr, "CT06004 exit_group NG st=%08x\n", st); + exit(1); + } + if (WEXITSTATUS(st) != 99) { + fprintf(stderr, "CT06004 exit_group NG st=%d\n", WEXITSTATUS(st)); + exit(1); + } + fprintf(stderr, "CT06004 exit_group OK\n"); + exit(0); + } + + rc = syscall(731, 1, NULL); + if (rc) { + fprintf(stderr, "util_indicate_clone rc=%d, errno=%d\n", rc, errno); + fflush(stderr); + } + rc = pthread_create(&thr, NULL, util_thread, NULL); + if(rc){ + fprintf(stderr, "pthread_create: %d\n", rc); + exit(1); + } + fprintf(stderr, "CT06002 pthread_create OK\n"); + + pthread_join(thr, NULL); + fprintf(stderr, "CT06004 pthread_join NG\n"); + fprintf(stderr,
"CT06004 END\n"); + exit(0); +} diff --git a/test/uti/CT07.c b/test/uti/CT07.c new file mode 100644 index 00000000..9eff04ca --- /dev/null +++ b/test/uti/CT07.c @@ -0,0 +1,86 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +/* Helper thread: expects syscall(732) to return -1, then expects clone, fork and execve issued through syscall() to fail with -1/ENOSYS; each result is reported as OK or NG. Always returns NULL. */ +void * +util_thread(void *arg) +{ + long rc; + rc = syscall(732); + if (rc == -1) + fprintf(stderr, "CT07003 get_system OK\n"); + else { + fprintf(stderr, "CT07003 get_system NG get_system=%ld\n", rc); + exit(1); + } + + rc = syscall(SYS_clone); + if (rc == -1 && errno == ENOSYS) { + fprintf(stderr, "CT07004 clone OK\n"); + } + else { + fprintf(stderr, "CT07004 clone NG rc=%ld errno=%d\n", rc, errno); + } + + rc = syscall(SYS_fork); + if (rc == -1 && errno == ENOSYS) { + fprintf(stderr, "CT07005 fork OK\n"); + } + else { + fprintf(stderr, "CT07005 fork NG rc=%ld errno=%d\n", rc, errno); + } + +#if 0 /* It looks like syscall_intercept can't hook vfork */ + rc = syscall(SYS_vfork); + //rc = vfork(); + fprintf(stderr, "CT07006 vfork rc=%ld,errno=%d\n", rc, errno); + if (rc == -1 && errno == ENOSYS) { + fprintf(stderr, "CT07006 vfork OK\n"); + } + else { + fprintf(stderr, "CT07006 vfork NG rc=%ld errno=%d\n", rc, errno); + } +#endif + + rc = syscall(SYS_execve); + if (rc == -1 && errno == ENOSYS) { + fprintf(stderr, "CT07007 execve OK\n"); + } + else { + fprintf(stderr, "CT07007 execve NG rc=%ld errno=%d\n", rc, errno); + } + return NULL; +} + +int +main(int argc, char **argv) +{ + int rc; + pthread_t thr; + + fprintf(stderr, "CT07001 syscall error START\n"); + rc = syscall(731, 1, NULL); /* syscall 731 is named util_indicate_clone in the message below; a failure is only logged */ + if (rc) { + fprintf(stderr, "util_indicate_clone rc=%d, errno=%d\n", rc, errno); + fflush(stderr); + } + rc = pthread_create(&thr, NULL, util_thread, NULL); + if(rc){ + fprintf(stderr, "pthread_create: %d\n", rc); + exit(1); + } + fprintf(stderr, "CT07002 pthread_create OK\n"); + + pthread_join(thr, NULL); + fprintf(stderr, "CT07008 pthread_join OK\n"); + fprintf(stderr, "CT07010 END\n");
+ exit(0); +} diff --git a/test/uti/CT08.c b/test/uti/CT08.c new file mode 100644 index 00000000..8ce75e2e --- /dev/null +++ b/test/uti/CT08.c @@ -0,0 +1,165 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define UTI_FLAG_NUMA_SET (1ULL<<1) /* Indicates NUMA_SET is specified */ + +#define UTI_FLAG_SAME_NUMA_DOMAIN (1ULL<<2) +#define UTI_FLAG_DIFFERENT_NUMA_DOMAIN (1ULL<<3) + +#define UTI_FLAG_SAME_L1 (1ULL<<4) +#define UTI_FLAG_SAME_L2 (1ULL<<5) +#define UTI_FLAG_SAME_L3 (1ULL<<6) + +#define UTI_FLAG_DIFFERENT_L1 (1ULL<<7) +#define UTI_FLAG_DIFFERENT_L2 (1ULL<<8) +#define UTI_FLAG_DIFFERENT_L3 (1ULL<<9) + +#define UTI_FLAG_EXCLUSIVE_CPU (1ULL<<10) +#define UTI_FLAG_CPU_INTENSIVE (1ULL<<11) +#define UTI_FLAG_HIGH_PRIORITY (1ULL<<12) +#define UTI_FLAG_NON_COOPERATIVE (1ULL<<13) + +/* Linux default value is used */ +#define UTI_MAX_NUMA_DOMAINS (1024) + +typedef struct uti_attr { + /* UTI_CPU_SET environmental variable is used to denote the preferred + location of utility thread */ + uint64_t numa_set[(UTI_MAX_NUMA_DOMAINS + sizeof(uint64_t) * 8 - 1) / + (sizeof(uint64_t) * 8)]; /* one bit per NUMA domain, rounded up to whole 64-bit words */ + uint64_t flags; /* Representing location and behavior hints by bitmap */ +} uti_attr_t; +/* Prints the first long-sized word of the calling thread's affinity mask in hex together with its scheduling policy. NOTE(review): the sched_getaffinity result is not checked, so cpuset may be printed unset on failure - confirm. */ +void +print_sched() +{ + cpu_set_t cpuset; + int sched; + + sched_getaffinity(0, sizeof cpuset, &cpuset); + sched = sched_getscheduler(0); + fprintf(stderr, "\tsched cpu=%16lx sched=%d\n", *(long *)&cpuset, sched); +} +/* Thread body: reports the placement the new thread ended up with. */ +void * +util_thread(void *arg) +{ + print_sched(); + return NULL; +} +/* Runs one case: prints msg, passes attr to syscall 731 (named util_indicate_clone in the message below; a failure is only logged), then creates util_thread and joins it. Exits with 1 if the thread cannot be created. */ +void +thread_test(uti_attr_t *attr, char *msg) +{ + pthread_t thr; + int rc; + + fprintf(stderr, "%s\n", msg); + rc = syscall(731, 1, attr); + if (rc) { + fprintf(stderr, "util_indicate_clone rc=%d, errno=%d\n", rc, errno); + fflush(stderr); + } + rc = pthread_create(&thr, NULL, util_thread, NULL); + if(rc){ + fprintf(stderr, "pthread_create: %d\n", rc); + exit(1); + } + pthread_join(thr, NULL); +} + +int +main(int argc, char **argv) +{
+ uti_attr_t attr; /* cleared with memset before every case so no hint carries over to the next one */ + + memset(&attr, '\0', sizeof attr); + attr.numa_set[0] = 2; // NUMA domain == 1 + attr.flags = UTI_FLAG_NUMA_SET; + thread_test(&attr, "CT08001 UTI_FLAG_NUMA_SET"); + + memset(&attr, '\0', sizeof attr); + attr.numa_set[0] = 2; + attr.flags = UTI_FLAG_NUMA_SET | UTI_FLAG_EXCLUSIVE_CPU; + thread_test(&attr, "CT08002 UTI_FLAG_NUMA_SET|UTI_FLAG_EXCLUSIVE_CPU"); + + memset(&attr, '\0', sizeof attr); + attr.numa_set[0] = 2; + attr.flags = UTI_FLAG_NUMA_SET | UTI_FLAG_EXCLUSIVE_CPU; /* same request as CT08002, issued a second time */ + thread_test(&attr, "CT08003 UTI_FLAG_NUMA_SET|UTI_FLAG_EXCLUSIVE_CPU(2)"); + + memset(&attr, '\0', sizeof attr); + attr.flags = UTI_FLAG_SAME_NUMA_DOMAIN; + thread_test(&attr, "CT08004 UTI_FLAG_SAME_NUMA_DOMAIN"); + + memset(&attr, '\0', sizeof attr); + attr.flags = UTI_FLAG_SAME_NUMA_DOMAIN | UTI_FLAG_CPU_INTENSIVE; + thread_test(&attr, "CT08005 UTI_FLAG_SAME_NUMA_DOMAIN|UTI_FLAG_CPU_INTENSIVE"); + + memset(&attr, '\0', sizeof attr); + attr.flags = UTI_FLAG_DIFFERENT_NUMA_DOMAIN; + thread_test(&attr, "CT08006 UTI_FLAG_DIFFERENT_NUMA_DOMAIN"); + + memset(&attr, '\0', sizeof attr); + attr.flags = UTI_FLAG_DIFFERENT_NUMA_DOMAIN | UTI_FLAG_HIGH_PRIORITY; + thread_test(&attr, "CT08007 UTI_FLAG_DIFFERENT_NUMA_DOMAIN|UTI_FLAG_HIGH_PRIORITY"); + + memset(&attr, '\0', sizeof attr); + attr.flags = UTI_FLAG_SAME_L1; + thread_test(&attr, "CT08008 UTI_FLAG_SAME_L1"); + + memset(&attr, '\0', sizeof attr); + attr.flags = UTI_FLAG_SAME_L1 | UTI_FLAG_NON_COOPERATIVE; + thread_test(&attr, "CT08009 UTI_FLAG_SAME_L1|UTI_FLAG_NON_COOPERATIVE"); + + memset(&attr, '\0', sizeof attr); + attr.flags = UTI_FLAG_SAME_L2; + thread_test(&attr, "CT08010 UTI_FLAG_SAME_L2"); + + memset(&attr, '\0', sizeof attr); + attr.flags = UTI_FLAG_SAME_L2 | UTI_FLAG_CPU_INTENSIVE; + thread_test(&attr, "CT08011 UTI_FLAG_SAME_L2|UTI_FLAG_CPU_INTENSIVE"); + + memset(&attr, '\0', sizeof attr); + attr.flags = UTI_FLAG_SAME_L3; + thread_test(&attr, "CT08012 UTI_FLAG_SAME_L3"); + + memset(&attr, '\0', sizeof attr); +
attr.flags = UTI_FLAG_SAME_L3 | UTI_FLAG_CPU_INTENSIVE; + thread_test(&attr, "CT08013 UTI_FLAG_SAME_L3|UTI_FLAG_CPU_INTENSIVE"); + + memset(&attr, '\0', sizeof attr); + attr.flags = UTI_FLAG_DIFFERENT_L1; + thread_test(&attr, "CT08014 UTI_FLAG_DIFFERENT_L1"); + + memset(&attr, '\0', sizeof attr); + attr.flags = UTI_FLAG_DIFFERENT_L1 | UTI_FLAG_CPU_INTENSIVE; + thread_test(&attr, "CT08015 UTI_FLAG_DIFFERENT_L1|UTI_FLAG_CPU_INTENSIVE"); + + memset(&attr, '\0', sizeof attr); + attr.flags = UTI_FLAG_DIFFERENT_L2; + thread_test(&attr, "CT08016 UTI_FLAG_DIFFERENT_L2"); + + memset(&attr, '\0', sizeof attr); + attr.flags = UTI_FLAG_DIFFERENT_L2 | UTI_FLAG_CPU_INTENSIVE; + thread_test(&attr, "CT08017 UTI_FLAG_DIFFERENT_L2|UTI_FLAG_CPU_INTENSIVE"); + + memset(&attr, '\0', sizeof attr); + attr.flags = UTI_FLAG_DIFFERENT_L3; + thread_test(&attr, "CT08018 UTI_FLAG_DIFFERENT_L3"); + + memset(&attr, '\0', sizeof attr); + attr.flags = UTI_FLAG_DIFFERENT_L3 | UTI_FLAG_CPU_INTENSIVE; + thread_test(&attr, "CT08019 UTI_FLAG_DIFFERENT_L3|UTI_FLAG_CPU_INTENSIVE"); + + exit(0); +} diff --git a/test/uti/CT09.c b/test/uti/CT09.c new file mode 100644 index 00000000..b8bed45b --- /dev/null +++ b/test/uti/CT09.c @@ -0,0 +1,278 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG + +#ifdef DEBUG /* dprintf/eprintf format into a bounded 1024-byte buffer and print it to stdout prefixed with the calling function name; they expand to one statement without a trailing semicolon so they are safe inside if/else */ +#define dprintf(...) \ + do { \ + char msg[1024]; \ + snprintf(msg, sizeof(msg), __VA_ARGS__); \ + fprintf(stdout, "%s,%s", __FUNCTION__, msg); \ + } while (0) +#define eprintf(...) \ + do { \ + char msg[1024]; \ + snprintf(msg, sizeof(msg), __VA_ARGS__); \ + fprintf(stdout, "%s,%s", __FUNCTION__, msg); \ + } while (0) +#else +#define dprintf(...) do { } while (0) +#define eprintf(...)
do { } while (0) +#endif + +#define TS2NS(sec, nsec) ((unsigned long)(sec) * 1000000000ULL + (unsigned long)(nsec)) +#define CALC_DELAY (98600) /* 98.6 usec */ +#define RTS_DELAY (1000) /* 1 usec, CPU time for sending Request-to-Send packet */ +#define NIC_DELAY (3000) /* 5 usec, RTS packet propagation time + RDMA-read on the responder side + CPU time for sending DONE packet + DONE packet network propagation time. NOTE(review): the value is 3000 nsec while this comment says 5 usec - confirm which is intended */ +#define POLL_DELAY ( 200) /* 0.2 usec, CPU time for checking DRAM event queue */ +#define COMPL_DELAY ( 200) /* 0.2 usec, CPU time for updates MPI_Request */ +#define NSPIN 1 +static inline void FIXED_SIZE_WORK(unsigned long *ptr) { /* one unit of busy work: the enabled asm counts rcx from 0 to 100; ptr is only used by the disabled variant */ +#if 0 + asm volatile("movq %0, %%rax\n\t" + "addq $1, %%rax\n\t" + "movq %%rax, %0\n\t" + : "+rm" (*ptr) + : + : "rax", "cc", "memory"); +#endif + asm volatile( + "movq $0, %%rcx\n\t" + "1:\t" + "addq $1, %%rcx\n\t" + "cmpq $99, %%rcx\n\t" + "jle 1b\n\t" + : + : + : "rcx", "cc"); +} + +static inline void BULK_FSW(unsigned long n, unsigned long *ptr) { /* runs FIXED_SIZE_WORK n times */ + int j; + for (j = 0; j < (n); j++) { + FIXED_SIZE_WORK(ptr); + } +} + +pthread_mutex_t ep_lock; /* Ownership of channel instance */ + +struct thr_arg { + int bar_count; /* Barrier before entering loop */ + pthread_mutex_t bar_lock; + pthread_cond_t bar_cond; + pthread_t pthread; + unsigned long mem; /* Per-thread storage */ +}; + +struct thr_arg thr_args; + +unsigned long mem; /* Per-thread storage */ +volatile int nevents; +volatile int terminate; +int wps = 1; /* work per sec */ +double nspw; /* nsec per work */ + +#define N_INIT 10000000 + +void fwq_init(unsigned long *mem) { /* calibration: times N_INIT work units on the thread CPU clock and stores the nanoseconds per unit in nspw */ + struct timespec start, end; + unsigned long nsec; + int i; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + BULK_FSW(N_INIT, mem); + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + nsec = (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec)); + nspw = nsec / (double)N_INIT; + printf("nsec=%ld, nspw=%f\n", nsec, nspw); +} + +void fwq(long delay_nsec, unsigned long* mem) { /* busy-works for about delay_nsec nanoseconds, using the nspw calibration */ + if
(delay_nsec < 0) { + printf("%s: delay_nsec<0\n", __FUNCTION__); + } + //printf("delay_nsec=%ld,count=%f\n", delay_nsec, delay_nsec / nspw); + BULK_FSW(delay_nsec / nspw, mem); +} + +void mydelay(long delay_nsec, long *mem) { /* spins on the thread CPU clock, doing one work unit per check, until more than delay_nsec nanoseconds have elapsed */ + struct timespec start, end; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + + while (1) { + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + if (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec) > delay_nsec) { + break; + } + FIXED_SIZE_WORK((unsigned long *)mem); /* the helper takes unsigned long *; the cast keeps this function's signature unchanged */ + } +} + +void *progress_fn(void *_arg) { /* progress thread: reports where it runs, meets main at the two-party barrier in *_arg, then polls under ep_lock, clearing nevents when an event is seen, until terminate is set; returns NULL */ + struct thr_arg *arg = (struct thr_arg *)_arg; + int rc; + int spin_count = 0; + int i; + + rc = syscall(732); + if (rc == -1) + fprintf(stdout, "CT09100 progress_fn running on Linux OK\n"); + else { + fprintf(stdout, "CT09100 progress_fn running on McKernel NG\n"); + } + + printf("tid=%d,bar_count=%d\n", (int)syscall(__NR_gettid), arg->bar_count); + + pthread_mutex_lock(&arg->bar_lock); + arg->bar_count++; + if (arg->bar_count == 2) { + if ((rc = pthread_cond_broadcast(&arg->bar_cond))) { + printf("pthread_cond_broadcast failed,rc=%d\n", rc); + } + } + while (arg->bar_count != 2) { + if ((rc = pthread_cond_wait(&arg->bar_cond, &arg->bar_lock))) { + printf("pthread_cond_wait failed,rc=%d\n", rc); + } + } + pthread_mutex_unlock(&arg->bar_lock); + + printf("after barrier\n"); + + /* Start progress */ + pthread_mutex_lock(&ep_lock); + while(1) { + if (terminate) { + break; + } + + fwq(POLL_DELAY, &arg->mem); + + /* Event found */ + if (nevents > 0) { + fwq(COMPL_DELAY, &arg->mem); /* Simulate MPI protocol response */ + nevents = 0; + } + + spin_count++; + if (spin_count >= NSPIN) { + spin_count = 0; + pthread_mutex_unlock(&ep_lock); + sched_yield(); + pthread_mutex_lock(&ep_lock); + } + } + return NULL; /* NOTE(review): the loop is left with ep_lock still held and it is never released - confirm this is intended */ +} + +int main(int argc, char **argv) { + int rc; + int i; + char *uti_str; + int uti_val; + struct timespec start, end; + int disable_progress; + + fprintf(stdout, "CT09001 MPI progress thread skelton START\n"); + + rc = syscall(732); + if
(rc == -1) + fprintf(stdout, "CT09002 main running on Linux INFO\n"); + else { + fprintf(stdout, "CT09002 main running on McKernel INFO\n"); + } + + fwq_init(&mem); + pthread_mutex_init(&ep_lock, NULL); + + thr_args.bar_count = 0; + pthread_cond_init(&thr_args.bar_cond, NULL); + pthread_mutex_init(&thr_args.bar_lock, NULL); + + disable_progress = (argc > 1 && strcmp(argv[1], "-d") == 0) ? 1 : 0; /* -d: run without the progress thread */ + + if (disable_progress) { + goto skip1; + } + + uti_str = getenv("DISABLE_UTI"); + uti_val = uti_str ? atoi(uti_str) : 0; /* unset or 0 means syscall 731 is attempted */ + if (!uti_val) { + rc = syscall(731, 1, NULL); + if (rc) { + fprintf(stdout, "CT09003 INFO: uti not available (rc=%d)\n", rc); + } else { + fprintf(stdout, "CT09003 INFO: uti available\n"); + } + } else { + fprintf(stdout, "CT09003 INFO: uti disabled\n"); + } + + rc = pthread_create(&thr_args.pthread, NULL, progress_fn, &thr_args); + if (rc){ + fprintf(stdout, "pthread_create: %d\n", rc); + exit(1); + } + pthread_mutex_lock(&thr_args.bar_lock); /* two-party barrier with progress_fn */ + thr_args.bar_count++; + if (thr_args.bar_count == 2) { + if ((rc = pthread_cond_broadcast(&thr_args.bar_cond))) { + printf("pthread_cond_broadcast failed,rc=%d\n", rc); + } + } + while (thr_args.bar_count != 2) { + if ((rc = pthread_cond_wait(&thr_args.bar_cond, &thr_args.bar_lock))) { + printf("pthread_cond_wait failed,rc=%d\n", rc); + } + } + pthread_mutex_unlock(&thr_args.bar_lock); + + fprintf(stdout, "CT09004 pthread_create OK\n"); + skip1: + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + for (i = 0; i < 10000; i++) { /* It takes 1 sec */ + if(!disable_progress) { + + /* Acquire endpoint and send request-to-send packet */ + pthread_mutex_lock(&ep_lock); + fwq(RTS_DELAY, &mem); + pthread_mutex_unlock(&ep_lock); + + /* Start calculation */ + + /* Generate event on behalf of responder */ + fwq(NIC_DELAY, &mem); + nevents++; /* NOTE(review): nevents is a volatile int also written by progress_fn, not an atomic - confirm this is acceptable for the test */ + + fwq(CALC_DELAY - NIC_DELAY, &mem); /* Overlap remainder */ + + /* Wait until async thread consumes the event */ + while (nevents > 0) { + FIXED_SIZE_WORK(&mem); + } + } else {
+ /* No overlap case */ + fwq(RTS_DELAY + CALC_DELAY + POLL_DELAY + COMPL_DELAY, &mem); + } + } + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + + if(!disable_progress) { + terminate = 1; /* ask progress_fn to leave its polling loop */ + + pthread_join(thr_args.pthread, NULL); + } + fprintf(stderr, "total %llu nsec\n", (unsigned long long)(TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec))); /* the TS2NS arithmetic is done in unsigned long long, so print it as such */ + fprintf(stdout, "CT09006 END\n"); + + + exit(0); +} diff --git a/test/uti/CT09.sh b/test/uti/CT09.sh new file mode 100755 index 00000000..b28b6d5e --- /dev/null +++ b/test/uti/CT09.sh @@ -0,0 +1,51 @@ +#!/usr/bin/bash +MYHOME="/work/gg10/e29005" +MCK="${MYHOME}/project/os/install" +MCEXEC= +export DISABLE_UTI=0 + +stop=0 +reset=0 +go=0 +nodes="c[8194]" + +while getopts srgmd OPT +do + case ${OPT} in + s) stop=1 + ;; + r) reset=1 + ;; + g) go=1 + ;; + m) MCEXEC="${MCK}/bin/mcexec" + ;; + d) export DISABLE_UTI=1 + ;; + *) echo "invalid option -${OPT}" >&2 + exit 1 + esac +done + +if [ ${stop} -eq 1 ]; then + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w ${nodes} \ + sudo mount /work + + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w ${nodes} \ + sudo ${MCK}/sbin/mcstop+release.sh +fi + +if [ ${reset} -eq 1 ]; then + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w ${nodes} \ + sudo mount /work + + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w ${nodes} \ + sudo ${MCK}/sbin/mcreboot.sh -c 2-17,20-35,36-51,52-67 -r 2-5:0+6-9:1+10-13:68+14-17:69+20-23:136+24-27:137+28-31:204+32-35:205+36-39:18+40-43:19+44-47:86+48-51:87+52-55:154+56-59:155+60-63:222+64-67:223 -m 32G@0,12G@1 +fi + +if [ ${go} -eq 1 ]; then + > ./log + for i in {1..10}; do (${MCEXEC} --enable-uti ./CT09 1>/dev/null 2>> ./log); done + #${MCEXEC} ./CT09 + perl CT11.pl < ./log +fi diff --git a/test/uti/CT10.c b/test/uti/CT10.c new file mode 100644 index 00000000..763e8a53 --- /dev/null +++ b/test/uti/CT10.c @@ -0,0 +1,103 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +pthread_mutex_t mutex1; +pthread_cond_t cond1;
+pthread_mutex_t mutex2; +pthread_cond_t cond2; +char *m; +int flag1, flag2; + +int sigst; +pthread_t thr; + +void * +util_thread(void *arg) +{ + int rc; + + rc = syscall(732); + if (rc == -1) + fprintf(stderr, "CT10100 running on Linux OK\n"); + else { + fprintf(stderr, "CT10100 running on Linux NG (%d)\n", rc); + } + errno = 0; + + pthread_mutex_lock(&mutex1); + flag1 = 1; + pthread_cond_signal(&cond1); + pthread_mutex_unlock(&mutex1); + + pthread_mutex_lock(&mutex2); + while(!flag2) { + pthread_cond_wait(&cond2, &mutex2); + } + flag2 = 0; + pthread_mutex_unlock(&mutex2); + + pthread_mutex_lock(&mutex1); + flag1 = 1; + pthread_cond_signal(&cond1); + pthread_mutex_unlock(&mutex1); + return NULL; +} + +int +main(int argc, char **argv) +{ + int rc; + + pthread_mutex_init(&mutex1, NULL); + pthread_cond_init(&cond1, NULL); + pthread_mutex_init(&mutex2, NULL); + pthread_cond_init(&cond2, NULL); + + fprintf(stderr, "CT10001 futex START\n"); +#if 1 + rc = syscall(731, 1, NULL); + if (rc) { + fprintf(stderr, "util_indicate_clone rc=%d, errno=%d\n", rc, errno); + fflush(stderr); + } +#endif + rc = pthread_create(&thr, NULL, util_thread, NULL); + if(rc){ + fprintf(stderr, "pthread_create: %d\n", rc); + exit(1); + } + fprintf(stderr, "CT10002 pthread_create OK\n"); + + pthread_mutex_lock(&mutex1); + while(!flag1) { + pthread_cond_wait(&cond1, &mutex1); + } + flag1 = 0; + pthread_mutex_unlock(&mutex1); + + pthread_mutex_lock(&mutex2); + flag2 = 1; + pthread_cond_signal(&cond2); + pthread_mutex_unlock(&mutex2); + + pthread_mutex_lock(&mutex1); + while(!flag1) { + pthread_cond_wait(&cond1, &mutex1); + } + flag1 = 0; + pthread_mutex_unlock(&mutex1); + + pthread_join(thr, NULL); + fprintf(stderr, "CT10003 pthread_join OK\n"); + + fprintf(stderr, "CT10004 END\n"); + exit(0); +} diff --git a/test/uti/CT11.c b/test/uti/CT11.c new file mode 100644 index 00000000..04d77e3e --- /dev/null +++ b/test/uti/CT11.c @@ -0,0 +1,275 @@ +#define _GNU_SOURCE +#include +#include +#include 
#define NLOOP 10
#define TS2NS(sec, nsec) ((unsigned long)(sec) * 1000000000ULL + (unsigned long)(nsec))
#define SZCHUNK 4096
/* Size each SZCHUNK mapping is grown to by the mremap measurement. */
#define SZREMAP 8192
#define IHK_DEVICE_CREATE_OS 0x112900
#define IHK_DEVICE_DESTROY_OS 0x112901

pthread_mutex_t mutex;
pthread_cond_t cond;
int sem;
int futex_flag;
pthread_t thr;

/* One measured system call: its number and the label used in the report. */
struct syscall {
	int number;
	const char *name;
};

struct syscall syscalls[] = {
	{ .number = __NR_getuid, .name = "getuid" },
	{ .number = __NR_ioctl, .name = "ioctl" },
	{ .number = __NR_futex, .name = "futex" },
	{ .number = __NR_mmap, .name = "mmap" },
	{ .number = __NR_munmap, .name = "munmap" },
	{ .number = __NR_brk, .name = "brk" },
	{ .number = __NR_gettid, .name = "gettid" },
	{ .number = __NR_mprotect, .name = "mprotect" },
	{ .number = __NR_mremap, .name = "mremap" },
	{ .number = __NR_open, .name = "open" },
	{ .number = __NR_read, .name = "read" },
	{ .number = __NR_write, .name = "write" }
};

/*
 * Thread body.  For every entry of syscalls[] it runs a setup step, issues
 * the call NLOOP times, prints "<name> <avg> nsec" on stderr (thread CPU
 * time divided by NLOOP) and releases whatever the setup/loop created.
 * Afterwards it blocks until main() sets sem.
 *
 * NOTE(review): the start timestamp is taken before the setup step, so the
 * setup cost is part of the reported figure; left as in the original.
 *
 * Exits the process on allocation failure or when /dev/hello or ./file
 * cannot be opened.  Always returns NULL.
 */
void *util_thread(void *arg) {
	const size_t nsyscalls = sizeof(syscalls) / sizeof(syscalls[0]);
	size_t i;
	int j;
	int rc;
	int fds[NLOOP];
	void *mems[NLOOP];
	void *memremaps[NLOOP];
	void *brk_cur = NULL;
	char *buf;
	struct timespec start, end;
	long nsec;

	/* Zeroed so the write measurement never emits uninitialized heap. */
	buf = calloc(NLOOP, SZCHUNK);
	if (buf == NULL) {
		fprintf(stderr, "calloc failed: %s\n", strerror(errno));
		exit(1);
	}

	rc = syscall(732);
	if (rc == -1)
		fprintf(stdout, "[INFO] Child is running on Liux\n");
	else {
		fprintf(stdout, "[INFO] Child is running on McKernel\n");
	}
	errno = 0;

	for (i = 0; i < nsyscalls; i++) {
		clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);

		/* Setup */
		switch (syscalls[i].number) {
		case __NR_brk:
			brk_cur = sbrk(0);
			break;
		case __NR_mprotect:
			if ((mems[0] = mmap(0, SZCHUNK, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)) == MAP_FAILED) {
				fprintf(stderr, "mmap failed: %s\n", strerror(errno));
			}
			/*
			 * Only mems[0] is used and released for mprotect; falling
			 * through into the NLOOP-mapping setup below would
			 * overwrite mems[0] and leak the extra mappings.
			 */
			break;
		case __NR_munmap:
		case __NR_mremap:
			for (j = 0; j < NLOOP; j++) {
				if ((mems[j] = mmap(0, SZCHUNK, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)) == MAP_FAILED) {
					fprintf(stderr, "mmap failed: %s\n", strerror(errno));
				}
			}
			break;
		case __NR_ioctl:
			if ((fds[0] = open("/dev/hello", O_RDWR)) < 0) {
				fprintf(stderr, "ioctl, open failed: %s\n", strerror(errno));
				exit(1);
			}
			break;
		case __NR_read:
		case __NR_write:
			if ((fds[0] = open("./file", O_RDWR)) < 0) {
				fprintf(stderr, "write, open failed: %s\n", strerror(errno));
				exit(1);
			}
			break;
		default:
			break;
		}

		/* Measured calls */
		for (j = 0; j < NLOOP; j++) {
			switch (syscalls[i].number) {
			case __NR_gettid:
				if ((rc = syscall(syscalls[i].number)) < 0) {
					fprintf(stderr, "%s failed: %s\n", syscalls[i].name, strerror(errno));
				}
				break;
			case __NR_futex:
				futex_flag = 1;
				if ((rc = syscall(__NR_futex, &futex_flag, FUTEX_WAKE, 1, NULL, NULL, 0)) < 0) {
					fprintf(stderr, "%s failed: %s\n", syscalls[i].name, strerror(errno));
				}
				break;
			case __NR_brk:
				if ((rc = brk(brk_cur)) < 0) {
					fprintf(stderr, "%s failed: %s\n", syscalls[i].name, strerror(errno));
				}
				break;
			case __NR_mmap:
				if ((mems[j] = mmap(0, SZCHUNK, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)) == MAP_FAILED) {
					fprintf(stderr, "%s failed: %s\n", syscalls[i].name, strerror(errno));
				}
				break;
			case __NR_munmap:
				if ((rc = munmap(mems[j], SZCHUNK)) < 0) {
					fprintf(stderr, "%s failed: %s\n", syscalls[i].name, strerror(errno));
				}
				break;
			case __NR_mprotect:
				if ((rc = mprotect(mems[0], SZCHUNK, PROT_READ)) < 0) {
					fprintf(stderr, "%s failed: %s\n", syscalls[i].name, strerror(errno));
				}
				break;
			case __NR_mremap:
				if ((memremaps[j] = mremap(mems[j], SZCHUNK, SZREMAP, MREMAP_MAYMOVE)) == MAP_FAILED) {
					fprintf(stderr, "%s failed: %s\n", syscalls[i].name, strerror(errno));
				}
				break;
			case __NR_getuid: {
				/*
				 * Keep the raw signed result: uid_t is unsigned, so
				 * a "< 0" test on it could never fire.
				 */
				long ret = syscall(syscalls[i].number);

				if (ret < 0) {
					fprintf(stderr, "%s failed: uid=%d,%s\n", syscalls[i].name, (int)ret, strerror(errno));
				}
				break;
			}
			case __NR_open:
				if ((fds[j] = open("./file", O_RDONLY)) < 0) {
					fprintf(stderr, "%s ./file failed: %s\n", syscalls[i].name, strerror(errno));
				}
				break;
			case __NR_ioctl:
				if ((rc = syscall(syscalls[i].number, fds[0], 0, 0)) < 0) {
					fprintf(stderr, "%s failed: %s\n", syscalls[i].name, strerror(errno));
				}
				break;
			case __NR_read:
				if ((rc = read(fds[0], buf + j * SZCHUNK, SZCHUNK)) < 0) {
					fprintf(stderr, "%s failed: %s\n", syscalls[i].name, strerror(errno));
				}
				break;
			case __NR_write:
				if ((rc = write(fds[0], buf + j * SZCHUNK, SZCHUNK)) < 0) {
					fprintf(stderr, "%s failed: rc=%d,%s\n", syscalls[i].name, rc, strerror(errno));
				}
				break;
			default:
				break;
			}
		}
		clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
		nsec = (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec));
		fprintf(stderr, "%s %ld nsec\n", syscalls[i].name, nsec / NLOOP);

		/* Teardown */
		switch (syscalls[i].number) {
		case __NR_mmap:
			for (j = 0; j < NLOOP; j++) {
				if ((rc = munmap(mems[j], SZCHUNK)) < 0) {
					fprintf(stderr, "munmap failed: %s\n", strerror(errno));
				}
			}
			break;
		case __NR_mprotect:
			if ((rc = munmap(mems[0], SZCHUNK)) < 0) {
				fprintf(stderr, "munmap failed: %s\n", strerror(errno));
			}
			break;
		case __NR_mremap:
			for (j = 0; j < NLOOP; j++) {
				/*
				 * A successful mremap left a SZREMAP-byte region at
				 * memremaps[j]; a failed one left the original
				 * SZCHUNK-byte mapping at mems[j] in place.
				 */
				int remapped = (memremaps[j] != MAP_FAILED);
				void *addr = remapped ? memremaps[j] : mems[j];
				size_t len = remapped ? SZREMAP : SZCHUNK;

				if ((rc = munmap(addr, len)) < 0) {
					fprintf(stderr, "munmap failed: %s\n", strerror(errno));
				}
			}
			break;
		case __NR_open:
			for (j = 0; j < NLOOP; j++) {
				if ((rc = close(fds[j])) < 0) {
					fprintf(stderr, "close failed: %s\n", strerror(errno));
				}
			}
			break;
		case __NR_ioctl:
		case __NR_read:
		case __NR_write:
			if ((rc = close(fds[0])) < 0) {
				fprintf(stderr, "close failed: %s\n", strerror(errno));
			}
			break;
		default:
			break;
		}
	}

	free(buf);

	/* Wait for main() to release us. */
	pthread_mutex_lock(&mutex);
	while (!sem) {
		pthread_cond_wait(&cond, &mutex);
	}
	sem = 0;
	pthread_mutex_unlock(&mutex);

	return NULL;
}

/*
 * Entry point.  "-I <n>" with a non-zero n skips syscall 731 (reported as
 * "uti disabled").  Starts util_thread, sleeps on futex_flag until the
 * thread's futex measurement sets it, then releases the thread through
 * sem/cond and joins it.
 */
int
main(int argc, char **argv)
{
	int rc;
	int disable_syscall_intercept = 0;
	int opt;

	while ((opt = getopt(argc, argv, "+I:")) != -1) {
		switch (opt) {
		case 'I':
			disable_syscall_intercept = atoi(optarg);
			break;
		default: /* '?' */
			printf("unknown option %c\n", optopt);
			exit(1);
		}
	}

	/* Explicit init instead of relying on zeroed static storage. */
	pthread_mutex_init(&mutex, NULL);
	pthread_cond_init(&cond, NULL);

	if (disable_syscall_intercept == 0) {
		rc = syscall(731, 1, NULL);
		if (rc) {
			fprintf(stdout, "CT11002 INFO: uti not available (rc=%d)\n", rc);
		} else {
			fprintf(stdout, "CT11002 INFO: uti available\n");
		}
	} else {
		/* rc is not set on this path, so it must not be passed here. */
		fprintf(stdout, "CT11002 INFO: uti disabled\n");
	}

	rc = pthread_create(&thr, NULL, util_thread, NULL);
	if (rc) {
		fprintf(stderr, "pthread_create: %d\n", rc);
		exit(1);
	}
	fprintf(stdout, "CT11003 pthread_create OK\n");

	while (!futex_flag) {
		rc = syscall(__NR_futex, &futex_flag, FUTEX_WAIT, 0, NULL, NULL, 0);
		/*
		 * EAGAIN means futex_flag was already non-zero when the kernel
		 * looked at it and EINTR means a signal arrived; neither is a
		 * failure of the wait itself.
		 */
		if (rc == -1 && errno != EAGAIN && errno != EINTR) {
			fprintf(stderr, "CT11101 FUTEX_WAIT ERROR: %s\n", strerror(errno));
		}
	}

	pthread_mutex_lock(&mutex);
	sem = 1;
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&mutex);
	pthread_join(thr, NULL);

	/* NOTE(review): id says CT10 although this is CT11 - confirm with the harness. */
	fprintf(stdout, "CT10005 END\n");
	exit(0);
}
"\n"; +} diff --git a/test/uti/CT11.sh b/test/uti/CT11.sh new file mode 100755 index 00000000..80a48625 --- /dev/null +++ b/test/uti/CT11.sh @@ -0,0 +1,110 @@ +#!/usr/bin/bash + +MYHOME=$HOME + +MCK="${MYHOME}/project/os/install" + +stop=0 +reset=0 +go=0 +measure=0 + +mck=0 +disable_syscall_intercept=0 +nloops=1 + +while getopts srgmI:l:M OPT +do + case ${OPT} in + s) stop=1 + ;; + r) reset=1 + ;; + g) go=1 + ;; + m) mck=1 + ;; + I) disable_syscall_intercept=$OPTARG + ;; + l) nloops=$OPTARG + ;; + M) measure=1 + ;; + *) echo "invalid option -${OPT}" >&2 + exit 1 + esac +done + +if [ ${mck} -eq 1 ]; then + MCEXEC="${MCK}/bin/mcexec" + if [ ${disable_syscall_intercept} -eq 0 ]; then + mcexecopt="--enable-uti" + else + mcexecopt= + fi +else + MCEXEC= + mcexecopt= +fi + +if [ ${stop} -eq 1 ]; then +# sudo mount /work + + sudo ${MCK}/sbin/mcstop+release.sh +fi + +if [ ${reset} -eq 1 ]; then + if hostname | grep ofp &>/dev/null; then + #sudo mount /work + : + fi + + if hostname | grep ofp &>/dev/null; then + sudo ${MCK}/sbin/mcreboot.sh -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1 + elif hostname | grep koala &>/dev/null; then + sudo ${MCK}/sbin/mcreboot.sh -c 1-15,65-79,129-143,193-207,17-31,81-95,145-159,209-223,33-47,97-111,161-175,225-239,49-63,113-127,177-191,241-255 -r 
/*
 * One unit of work: load *ptr, add one, store it back.  On x86-64 this is
 * spelled out as three instructions so the amount of work per call does
 * not depend on compiler optimization; elsewhere a volatile increment is
 * used so the file still builds.
 */
static inline void fixed_size_work(unsigned long *ptr) {
#if defined(__x86_64__)
	/* __asm__ rather than asm: the latter is not a keyword in ISO modes. */
	__asm__ __volatile__("movq %0, %%rax\n\t"
			     "addq $1, %%rax\n\t"
			     "movq %%rax, %0\n\t"
			     : "+rm" (*ptr)
			     :
			     : "rax", "cc", "memory");
#else
	*(volatile unsigned long *)ptr += 1;
#endif
}

/*
 * Run fixed_size_work() n times on *ptr, i.e. add n to *ptr.
 *
 * The counter has the same type as n: with an int counter any request
 * above INT_MAX would overflow the counter before the loop could end.
 */
static inline void delay_loop(unsigned long n, unsigned long *ptr) {
	unsigned long j;

	for (j = 0; j < n; j++) {
		fixed_size_work(ptr);
	}
}
int passed = 0, sem = 0;
pthread_t thr;

/*
 * Thread body: logs the result of syscall 732 (-1 is reported as
 * "running on Linux CPU OK"), spins until main() sets passed, sleeps
 * 100 ms and then issues FUTEX_WAKE on sem, expecting to wake exactly one
 * waiter.
 */
void *util_thread(void *arg) {
	int ret = syscall(732);

	if (ret != -1) {
		fprintf(stderr, "CT13100 running on Linux CPU NG (%d)\n", ret);
	} else {
		fprintf(stderr, "CT13100 running on Linux CPU OK\n");
	}

	while (passed == 0) {
		__asm__ __volatile__("pause" ::: "memory");
	}
	usleep(100000); /* debug messages via serial takes 0.05 sec */

	ret = syscall(__NR_futex, &sem, FUTEX_WAKE, 1, NULL, NULL, 0);
	if (ret == 1) {
		fprintf(stderr, "CT13101 FUTEX_WAKE OK\n");
	} else {
		fprintf(stderr, "CT13101 FUTEX_WAKE NG (%d,%s)\n", ret, strerror(errno));
	}

	return NULL;
}

/*
 * Entry point: issues syscall 731 (reported as "util_indicate_clone" on
 * failure), starts util_thread, sets passed and blocks in FUTEX_WAIT on
 * sem until the thread wakes it, then joins the thread.
 */
int main(int argc, char **argv)
{
	int ret;

	fprintf(stderr, "CT13001 futex START\n");

	ret = syscall(731, 1, NULL);
	if (ret != 0) {
		fprintf(stderr, "util_indicate_clone rc=%d, errno=%d\n", ret, errno);
		fflush(stderr);
	}

	ret = pthread_create(&thr, NULL, util_thread, NULL);
	if (ret != 0) {
		fprintf(stderr, "pthread_create: %d\n", ret);
		exit(1);
	}
	fprintf(stderr, "CT13002 pthread_create OK\n");

	passed = 1;

	ret = syscall(__NR_futex, &sem, FUTEX_WAIT, 0, NULL, NULL, 0);
	if (ret == 0) {
		fprintf(stderr, "CT13003 FUTEX_WAIT OK\n");
	} else {
		fprintf(stderr, "CT13003 FUTEX_WAIT NG (%s)\n", strerror(errno));
	}

	pthread_join(thr, NULL);
	fprintf(stderr, "CT13004 pthread_join OK\n");

	fprintf(stderr, "CT13005 END\n");
	exit(0);
}
/*
 * One unit of work: load *ptr, add one, store it back.  On x86-64 this is
 * spelled out as three instructions so the amount of work per call does
 * not depend on compiler optimization; elsewhere a volatile increment is
 * used so the file still builds.
 */
static inline void FIXED_SIZE_WORK(unsigned long *ptr) {
#if defined(__x86_64__)
	/* __asm__ rather than asm: the latter is not a keyword in ISO modes. */
	__asm__ __volatile__("movq %0, %%rax\n\t"
			     "addq $1, %%rax\n\t"
			     "movq %%rax, %0\n\t"
			     : "+rm" (*ptr)
			     :
			     : "rax", "cc", "memory");
#else
	*(volatile unsigned long *)ptr += 1;
#endif
}

/*
 * Run FIXED_SIZE_WORK() n times on *ptr, i.e. add n to *ptr.
 *
 * The counter has the same type as n: with an int counter any request
 * above INT_MAX iterations would overflow the counter before the loop
 * could end.
 */
static inline void BULK_FSW(unsigned long n, unsigned long *ptr) {
	unsigned long j;

	for (j = 0; j < n; j++) {
		FIXED_SIZE_WORK(ptr);
	}
}
+ } + + rc = pthread_create(&thr, NULL, util_thread, NULL); + if(rc){ + fprintf(stderr, "pthread_create: %d\n", rc); + exit(1); + } + fprintf(stderr, "CT14003 pthread_create OK\n"); + + pthread_mutex_lock(&mutex); + if (!owned) { + fprintf(stderr, "CT14004 lock first OK\n"); + } else { + fprintf(stderr, "CT14004 lock first NG\n"); + } + owned = 1; + fwq(2000 * 1000 * 1000UL, &mem); /* Need 2 sec to make child sleep */ + pthread_mutex_unlock(&mutex); + + pthread_join(thr, NULL); + fprintf(stderr, "CT14005 pthread_join OK\n"); + + fprintf(stderr, "CT14006 END\n"); + exit(0); +} diff --git a/test/uti/CT15.c b/test/uti/CT15.c new file mode 100644 index 00000000..3c6306b0 --- /dev/null +++ b/test/uti/CT15.c @@ -0,0 +1,121 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +pthread_mutex_t mutex; +int owned; +pthread_t thr; + +#define TS2NS(sec, nsec) ((unsigned long)(sec) * 1000000000ULL + (unsigned long)(nsec)) +static inline void FIXED_SIZE_WORK(unsigned long *ptr) { + asm volatile("movq %0, %%rax\n\t" + "addq $1, %%rax\n\t" \ + "movq %%rax, %0\n\t" \ + : "+rm" (*ptr) \ + : \ + : "rax", "cc", "memory"); \ +} + +static inline void BULK_FSW(unsigned long n, unsigned long *ptr) { + int j; + for (j = 0; j < (n); j++) { + FIXED_SIZE_WORK(ptr); + } +} + +double nspw; /* nsec per work */ + +#define N_INIT 10000000 + +void fwq_init(unsigned long *mem) { + struct timespec start, end; + unsigned long nsec; + int i; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + BULK_FSW(N_INIT, mem); + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + nsec = (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec)); + nspw = nsec / (double)N_INIT; + printf("nsec=%ld, nspw=%f\n", nsec, nspw); +} + +void fwq(unsigned long delay_nsec, unsigned long* mem) { + //printf("delay_nsec=%ld,count=%f\n", delay_nsec, delay_nsec / nspw); + BULK_FSW(delay_nsec / nspw, mem); +} + +void * +util_thread(void *arg) +{ + int rc; + 
unsigned long mem; + + rc = syscall(732); + if (rc == -1) + fprintf(stderr, "CT14100 running on Linux OK\n"); + else { + fprintf(stderr, "CT14100 running on Linux NG (%d)\n", rc); + } + errno = 0; + + pthread_mutex_lock(&mutex); + if (!owned) { + fprintf(stderr, "CT14101 lock first OK\n"); + } else { + fprintf(stderr, "CT14101 lock first NG\n"); + } + owned = 1; + fwq(2000 * 1000 * 1000UL, &mem); /* Need 2 sec to make parent sleep */ + pthread_mutex_unlock(&mutex); + + return NULL; +} + +int main(int argc, char **argv) { + int rc; + unsigned long mem; + + pthread_mutex_init(&mutex, NULL); + fwq_init(&mem); + + fprintf(stderr, "CT14001 futex START\n"); + + rc = syscall(731, 1, NULL); + if (rc) { + fprintf(stderr, "CT14002 util_indicate_clone NG (rc=%d, errno=%d)\n", rc, errno); + fflush(stderr); + } else { + fprintf(stderr, "CT14002 util_indicate_clone OK\n"); + } + + rc = pthread_create(&thr, NULL, util_thread, NULL); + if(rc){ + fprintf(stderr, "pthread_create: %d\n", rc); + exit(1); + } + fprintf(stderr, "CT14003 pthread_create OK\n"); + + fwq(500 * 1000 * 1000UL, &mem); /* Sending debug messages through serial takes 0.05 sec */ + + pthread_mutex_lock(&mutex); + if (owned) { + fprintf(stderr, "CT14004 lock second OK\n"); + } else { + fprintf(stderr, "CT14004 lock second NG\n"); + } + owned = 1; + pthread_mutex_unlock(&mutex); + + pthread_join(thr, NULL); + fprintf(stderr, "CT14005 pthread_join OK\n"); + + fprintf(stderr, "CT14006 END\n"); + exit(0); +} diff --git a/test/uti/CT16.c b/test/uti/CT16.c new file mode 100644 index 00000000..ec29ccda --- /dev/null +++ b/test/uti/CT16.c @@ -0,0 +1,83 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +pthread_mutex_t mutex; +pthread_cond_t cond; +int passed, flag; +pthread_t thr; + +void * +util_thread(void *arg) +{ + int rc; + unsigned long mem; + + rc = syscall(732); + if (rc == -1) + fprintf(stderr, "CT16101 running on Linux OK\n"); + else { + 
fprintf(stderr, "CT16101 running on Linux NG (%d)\n", rc); + } + errno = 0; + + passed = 1; + pthread_mutex_lock(&mutex); + while(!flag) { + pthread_cond_wait(&cond, &mutex); + } + flag = 0; + pthread_mutex_unlock(&mutex); + + fprintf(stderr, "CT16102 return from pthread_cond_wait() OK\n"); + + return NULL; +} + +int main(int argc, char **argv) { + int rc; + unsigned long mem; + + pthread_mutex_init(&mutex, NULL); + pthread_cond_init(&cond, NULL); + + fprintf(stderr, "CT16001 futex START\n"); + + rc = syscall(731, 1, NULL); + if (rc) { + fprintf(stderr, "CT16002 util_indicate_clone NG (rc=%d, errno=%d)\n", rc, errno); + fflush(stderr); + } else { + fprintf(stderr, "CT16002 util_indicate_clone OK\n"); + } + + rc = pthread_create(&thr, NULL, util_thread, NULL); + if(rc){ + fprintf(stderr, "pthread_create: %d\n", rc); + exit(1); + } + fprintf(stderr, "CT16003 pthread_create OK\n"); + + while (!passed) { + asm volatile("pause" ::: "memory"); + } + usleep(100 * 1000UL); /* Send debug message through serial takes 0.05 sec */ + + pthread_mutex_lock(&mutex); + flag = 1; + pthread_cond_signal(&cond); + pthread_mutex_unlock(&mutex); + + pthread_join(thr, NULL); + fprintf(stderr, "CT16004 pthread_join OK\n"); + + fprintf(stderr, "CT16005 END\n"); + exit(0); +} diff --git a/test/uti/CT17.c b/test/uti/CT17.c new file mode 100644 index 00000000..795a002a --- /dev/null +++ b/test/uti/CT17.c @@ -0,0 +1,81 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +pthread_mutex_t mutex; +pthread_cond_t cond; +int passed, flag; +pthread_t thr; + +void *util_thread(void *arg) { + int rc; + unsigned long mem; + + rc = syscall(732); + if (rc == -1) + fprintf(stderr, "CT17100 running on Linux OK\n"); + else { + fprintf(stderr, "CT17100 running on Linux NG (%d)\n", rc); + } + + while (!passed) { + asm volatile("pause" ::: "memory"); + } + usleep(100 * 1000UL); /* Send debug message through serial takes 0.05 sec */ + + 
pthread_mutex_lock(&mutex); + flag = 1; + pthread_cond_signal(&cond); + pthread_mutex_unlock(&mutex); + + return NULL; +} + +int main(int argc, char **argv) { + int rc; + unsigned long mem; + + pthread_mutex_init(&mutex, NULL); + pthread_cond_init(&cond, NULL); + + fprintf(stderr, "CT17001 futex START\n"); + + rc = syscall(731, 1, NULL); + if (rc) { + fprintf(stderr, "CT17002 util_indicate_clone NG (rc=%d, errno=%d)\n", rc, errno); + fflush(stderr); + } else { + fprintf(stderr, "CT17002 util_indicate_clone OK\n"); + } + + rc = pthread_create(&thr, NULL, util_thread, NULL); + if(rc){ + fprintf(stderr, "pthread_create: %d\n", rc); + exit(1); + } + fprintf(stderr, "CT17003 pthread_create OK\n"); + + passed = 1; + pthread_mutex_lock(&mutex); + fprintf(stderr, "CT17004 lock on %p OK\n", &mutex); + while(!flag) { + pthread_cond_wait(&cond, &mutex); + fprintf(stderr, "CT17005 wake on %p OK\n", &cond); + } + flag = 0; + + pthread_mutex_unlock(&mutex); + + pthread_join(thr, NULL); + fprintf(stderr, "CT17006 pthread_join OK\n"); + + fprintf(stderr, "CT17007 END\n"); + exit(0); +} diff --git a/test/uti/CT18.c b/test/uti/CT18.c new file mode 100644 index 00000000..9ef11b78 --- /dev/null +++ b/test/uti/CT18.c @@ -0,0 +1,111 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For SYS_xxx definitions */ +#include +#include +#include + +int passed, sem, flag; +pthread_t thr; +#define TS2NS(sec, nsec) ((unsigned long)(sec) * 1000000000ULL + (unsigned long)(nsec)) + +void *util_thread(void *arg) { + int rc; + struct timespec start, timeout, end; + unsigned long elapsed; + + rc = syscall(732); + if (rc == -1) + fprintf(stderr, "CT18101 running on Linux CPU OK\n"); + else { + fprintf(stderr, "CT18101 running on Linux CPU NG (%d)\n", rc); + } + + passed = 1; + + rc = clock_gettime(CLOCK_REALTIME, &start); + if (rc != 0) { + fprintf(stderr, "clock_gettime failed\n"); + return NULL; + } + fprintf(stderr, 
"start=%ld.%09ld\n", start.tv_sec, start.tv_nsec); + + timeout.tv_sec = start.tv_sec; + timeout.tv_nsec = start.tv_nsec + 800UL * 1000 * 1000; + if (timeout.tv_nsec > 1000UL * 1000 * 1000) { + timeout.tv_sec += 1; + timeout.tv_nsec -= 1000UL * 1000* 1000; + } + rc = syscall(__NR_futex, &sem, FUTEX_WAIT_BITSET | FUTEX_CLOCK_REALTIME, 0, &timeout, NULL, 0x12345678); + fprintf(stderr, "op=%x\n", FUTEX_WAIT_BITSET | FUTEX_CLOCK_REALTIME); + + rc = clock_gettime(CLOCK_REALTIME, &end); + if (rc != 0) { + fprintf(stderr, "clock_gettime failed\n"); + return NULL; + } + fprintf(stderr, "end=%ld.%09ld\n", end.tv_sec, end.tv_nsec); + + if (rc != 0) { + fprintf(stderr, "CT18102 FUTEX_WAIT NG (%s)\n", strerror(errno)); + } else { + fprintf(stderr, "CT18102 FUTEX_WAIT OK\n"); + } + + elapsed = TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec); + if (flag == 0 || elapsed < 800UL * 1000 * 1000 + 80UL * 1000 * 1000) { + fprintf(stderr, "CT18103 timeout OK\n"); + } else { + fprintf(stderr, "CT18103 timeout NG (%lx)\n", elapsed); + } + + return NULL; +} + +int +main(int argc, char **argv) +{ + int rc; + + fprintf(stderr, "CT18001 futex START\n"); + rc = syscall(731, 1, NULL); + if (rc) { + fprintf(stderr, "util_indicate_clone rc=%d, errno=%d\n", rc, errno); + fflush(stderr); + } + + rc = pthread_create(&thr, NULL, util_thread, NULL); + if (rc){ + fprintf(stderr, "pthread_create: %d\n", rc); + exit(1); + } + fprintf(stderr, "CT18002 pthread_create OK\n"); + + retry: + while (!passed) { + asm volatile("pause" ::: "memory"); + } + usleep(800 * 1000UL * 10); + + flag = 1; + rc = syscall(__NR_futex, &sem, FUTEX_WAKE_BITSET, 1, NULL, NULL, 0x12345678); + if (rc != 0) { + fprintf(stderr, "CT18003 FUTEX_WAKE missing the waiter NG (%d,%s)\n", rc, strerror(errno)); + } else { + fprintf(stderr, "CT18003 FUTEX_WAKE missing the waiter OK\n"); + } + + pthread_join(thr, NULL); + fprintf(stderr, "CT18004 pthread_join OK\n"); + + fprintf(stderr, "CT18005 END\n"); + exit(0); +} diff 
--git a/test/uti/CT19.c b/test/uti/CT19.c new file mode 100644 index 00000000..0d90168d --- /dev/null +++ b/test/uti/CT19.c @@ -0,0 +1,112 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For SYS_xxx definitions */ +#include +#include +#include + +int passed, sem, flag; +pthread_t thr; +#define TS2NS(sec, nsec) ((unsigned long)(sec) * 1000000000ULL + (unsigned long)(nsec)) + +void *util_thread(void *arg) { + int rc; + struct timespec start, timeout, end; + unsigned long elapsed; + + rc = syscall(732); + if (rc == -1) + fprintf(stderr, "CT19100 running on Linux CPU OK\n"); + else { + fprintf(stderr, "CT19100 running on Linux CPU NG (%d)\n", rc); + } + + passed = 1; + + rc = clock_gettime(CLOCK_MONOTONIC, &start); + if (rc != 0) { + fprintf(stderr, "clock_gettime failed\n"); + return NULL; + } + fprintf(stderr, "start=%ld.%09ld\n", start.tv_sec, start.tv_nsec); + + timeout.tv_sec = start.tv_sec; + timeout.tv_nsec = start.tv_nsec + 800UL * 1000 * 1000; + if (timeout.tv_nsec > 1000UL * 1000 * 1000) { + timeout.tv_sec += 1; + timeout.tv_nsec -= 1000UL * 1000* 1000; + } + /* timeout - clock_gettime(CLOCK_MONOTONIC) */ + rc = syscall(__NR_futex, &sem, FUTEX_WAIT_BITSET, 0, &timeout, NULL, 0x12345678); + fprintf(stderr, "op=%x\n", FUTEX_WAIT_BITSET); + + rc = clock_gettime(CLOCK_MONOTONIC, &end); + if (rc != 0) { + fprintf(stderr, "clock_gettime failed\n"); + return NULL; + } + fprintf(stderr, "end=%ld.%09ld\n", end.tv_sec, end.tv_nsec); + + if (rc != 0) { + fprintf(stderr, "CT19101 FUTEX_WAIT NG (%s)\n", strerror(errno)); + } else { + fprintf(stderr, "CT19101 FUTEX_WAIT OK\n"); + } + + elapsed = TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec); + if (flag == 0 || elapsed < 800UL * 1000 * 1000 + 80UL * 1000 * 1000) { + fprintf(stderr, "CT19102 timeout OK\n"); + } else { + fprintf(stderr, "CT19101 timeout NG\n"); + } + + return NULL; +} + +int +main(int argc, char **argv) +{ + 
int rc; + + fprintf(stderr, "CT19001 futex START\n"); + rc = syscall(731, 1, NULL); + if (rc) { + fprintf(stderr, "util_indicate_clone rc=%d, errno=%d\n", rc, errno); + fflush(stderr); + } + + rc = pthread_create(&thr, NULL, util_thread, NULL); + if (rc){ + fprintf(stderr, "pthread_create: %d\n", rc); + exit(1); + } + fprintf(stderr, "CT19002 pthread_create OK\n"); + + retry: + while (!passed) { + asm volatile("pause" ::: "memory"); + } + usleep(2000 * 1000UL); + + flag = 1; + rc = syscall(__NR_futex, &sem, FUTEX_WAKE_BITSET, 1, NULL, NULL, 0x12345678); + if (rc != 0) { + fprintf(stderr, "CT19003 FUTEX_WAKE missing the waiter NG (%d,%s)\n", rc, strerror(errno)); + } else { + fprintf(stderr, "CT19003 FUTEX_WAKE missing the waiter OK\n"); + } + + pthread_join(thr, NULL); + fprintf(stderr, "CT19004 pthread_join OK\n"); + + fprintf(stderr, "CT19005 END\n"); + exit(0); +} diff --git a/test/uti/CT20.c b/test/uti/CT20.c new file mode 100644 index 00000000..42bdc973 --- /dev/null +++ b/test/uti/CT20.c @@ -0,0 +1,106 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For SYS_xxx definitions */ +#include +#include +#include + +int passed, sem, flag; +pthread_t thr; +#define TS2NS(sec, nsec) ((unsigned long)(sec) * 1000000000ULL + (unsigned long)(nsec)) + +void *util_thread(void *arg) { + int rc; + struct timespec start, timeout, end; + unsigned long elapsed; + + rc = syscall(732); + if (rc == -1) + fprintf(stderr, "CT20100 running on Linux CPU OK\n"); + else { + fprintf(stderr, "CT20100 running on Linux CPU NG (%d)\n", rc); + } + + passed = 1; + + rc = clock_gettime(CLOCK_REALTIME, &start); + if (rc != 0) { + fprintf(stderr, "clock_gettime failed\n"); + return NULL; + } + fprintf(stderr, "start=%ld.%09ld\n", start.tv_sec, start.tv_nsec); + + timeout.tv_sec = 0; + timeout.tv_nsec = 800ULL * 1000 * 1000; + rc = syscall(__NR_futex, &sem, FUTEX_WAIT, 0, &timeout, NULL, 0); + + rc = 
clock_gettime(CLOCK_REALTIME, &end); + if (rc != 0) { + fprintf(stderr, "clock_gettime failed\n"); + return NULL; + } + fprintf(stderr, "end=%ld.%09ld\n", end.tv_sec, end.tv_nsec); + + if (rc != 0) { + fprintf(stderr, "CT20101 FUTEX_WAIT NG (%s)\n", strerror(errno)); + } else { + fprintf(stderr, "CT20101 FUTEX_WAIT OK\n"); + } + + elapsed = TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec); + if (flag == 0 || elapsed < 800UL * 1000 * 1000 + 80UL * 1000 * 1000) { + fprintf(stderr, "CT20102 timeout OK\n"); + } else { + fprintf(stderr, "CT20101 timeout NG\n"); + } + + return NULL; +} + +int +main(int argc, char **argv) +{ + int rc; + + fprintf(stderr, "CT20001 futex START\n"); + rc = syscall(731, 1, NULL); + if (rc) { + fprintf(stderr, "util_indicate_clone rc=%d, errno=%d\n", rc, errno); + fflush(stderr); + } + + rc = pthread_create(&thr, NULL, util_thread, NULL); + if (rc){ + fprintf(stderr, "pthread_create: %d\n", rc); + exit(1); + } + fprintf(stderr, "CT20002 pthread_create OK\n"); + + retry: + while (!passed) { + asm volatile("pause" ::: "memory"); + } + usleep(2000 * 1000UL); + + flag = 1; + rc = syscall(__NR_futex, &sem, FUTEX_WAKE, 1, NULL, NULL, 0); + if (rc != 0) { + fprintf(stderr, "CT20003 FUTEX_WAKE missing the waiter NG (%d,%s)\n", rc, strerror(errno)); + } else { + fprintf(stderr, "CT20003 FUTEX_WAKE missing the waiter OK\n"); + } + + pthread_join(thr, NULL); + fprintf(stderr, "CT20004 pthread_join OK\n"); + + fprintf(stderr, "CT20005 END\n"); + exit(0); +} diff --git a/test/uti/CT21.c b/test/uti/CT21.c new file mode 100644 index 00000000..8c9552d4 --- /dev/null +++ b/test/uti/CT21.c @@ -0,0 +1,210 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG + +#ifdef DEBUG +#define dprintf(...) \ + do { \ + char msg[1024]; \ + sprintf(msg, __VA_ARGS__); \ + fprintf(stdout, "%s,%s", __FUNCTION__, msg); \ + } while (0); +#define eprintf(...) 
\ + do { \ + char msg[1024]; \ + sprintf(msg, __VA_ARGS__); \ + fprintf(stdout, "%s,%s", __FUNCTION__, msg); \ + } while (0); +#else +#define dprintf(...) do { } while (0) +#define eprintf(...) do { } while (0) +#endif + +#define NTHR 1 +#define TS2NS(sec, nsec) ((unsigned long)(sec) * 1000000000ULL + (unsigned long)(nsec)) +#define CALC_DELAY (93000) /* 93 usec */ +#define INIT_DELAY (2000) /* 2 usec, CPU sends CTS packet */ +#define NIC_DELAY (3000) /* 3 usec, NIC reads by RDMA-read */ +#define POLL_DELAY (200) /* .2 usec, CPU fetces event queue entry from DRAM */ +#define RESP_DELAY (2000) /* 2 usec, CPU sends DONE packet and updates MPI_Request */ +#define NSPIN 1 +static inline void FIXED_SIZE_WORK(unsigned long *ptr) { + asm volatile("movq %0, %%rax\n\t" + "addq $1, %%rax\n\t" \ + "movq %%rax, %0\n\t" \ + : "+rm" (*ptr) \ + : \ + : "rax", "cc", "memory"); \ +} + +static inline void BULK_FSW(unsigned long n, unsigned long *ptr) { + int j; + for (j = 0; j < (n); j++) { + FIXED_SIZE_WORK(ptr); + } +} + + +pthread_mutex_t ep_lock; /* Ownership of channel instance */ + +struct thr_arg { + int bar_count; /* Barrier before entering loop */ + pthread_mutex_t bar_lock; + pthread_cond_t bar_cond; + pthread_t pthread; + unsigned long mem; /* Per-thread storage */ +}; + +struct thr_arg thr_args[NTHR]; + +unsigned long mem; /* Per-thread storage */ +volatile int nevents; +volatile int terminate; +int wps = 1; /* work per sec */ +double nspw; /* nsec per work */ + +#define N_INIT 10000000 + +void fwq_init(unsigned long *mem) { + struct timespec start, end; + unsigned long nsec; + int i; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + BULK_FSW(N_INIT, mem); + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + nsec = (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec)); + nspw = nsec / (double)N_INIT; + printf("nsec=%ld, nspw=%f\n", nsec, nspw); +} + +void fwq(unsigned long delay_nsec, unsigned long* mem) { + //printf("delay_nsec=%ld,count=%f\n", 
delay_nsec, delay_nsec / nspw); + BULK_FSW(delay_nsec / nspw, mem); +} + +void mydelay(long delay_nsec, long *mem) { + struct timespec start, end; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + + while (1) { + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + if (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec) > delay_nsec) { + break; + } + FIXED_SIZE_WORK(mem); + } +} + +void *progress_fn(void *_arg) { + struct thr_arg *arg = (struct thr_arg *)_arg; + int rc; + int i; + + rc = syscall(732); + if (rc == -1) + fprintf(stdout, "CT09100 progress_fn running on Linux OK\n"); + else { + fprintf(stdout, "CT09100 progress_fn running on McKernel NG\n", rc); + return NULL; + } + + pthread_mutex_lock(&arg->bar_lock); + while(arg->bar_count == 0) { + pthread_cond_wait(&arg->bar_cond, &arg->bar_lock); + } + pthread_mutex_unlock(&arg->bar_lock); + + /* Start progress */ + pthread_mutex_lock(&ep_lock); + while(1) { + if (terminate) { + break; + } + + /* Event found */ + if (nevents > 0) { + nevents = 0; + } + + pthread_mutex_unlock(&ep_lock); + fwq(random() % 1000000000, &mem); /* 0 - 1 sec */ + pthread_mutex_lock(&ep_lock); + } + return NULL; +} + +int main(int argc, char **argv) { + int rc; + int i; + struct timespec start, end; + + fprintf(stdout, "CT09001 MPI progress thread skelton START\n"); + + rc = syscall(732); + if (rc == -1) + fprintf(stdout, "CT09002 main running on Linux INFO\n"); + else { + fprintf(stdout, "CT09002 main running on McKernel INFO\n"); + } + + fwq_init(&mem); + pthread_mutex_init(&ep_lock, NULL); + + for(i = 0; i < NTHR; i++) { + thr_args[i].bar_count = 0; + pthread_cond_init(&thr_args[i].bar_cond, NULL); + pthread_mutex_init(&thr_args[i].bar_lock, NULL); + } + + rc = syscall(731, 1, NULL); + if (rc) { + fprintf(stdout, "util_indicate_clone rc=%d, errno=%d\n", rc, errno); + fflush(stdout); + } + for (i = 0; i < NTHR; i++) { + rc = pthread_create(&thr_args[i].pthread, NULL, progress_fn, &thr_args[i]); + if (rc){ + fprintf(stdout, 
"pthread_create: %d\n", rc); + exit(1); + } + } + for (i = 0; i < NTHR; i++) { + pthread_mutex_lock(&thr_args[i].bar_lock); + thr_args[i].bar_count++; + pthread_cond_signal(&thr_args[i].bar_cond); + pthread_mutex_unlock(&thr_args[i].bar_lock); + } + + fprintf(stdout, "CT09004 pthread_create OK\n"); + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + for (i = 0; i < 10; i++) { + pthread_mutex_lock(&ep_lock); + nevents++; + fwq(random() % 1000000000, &mem); /* 0 - 1 sec */ + pthread_mutex_unlock(&ep_lock); + while (nevents > 0) { + FIXED_SIZE_WORK(&mem); + } + } + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + + terminate = 1; + + for (i = 0; i < NTHR; i++) { + pthread_join(thr_args[i].pthread, NULL); + } + fprintf(stdout, "CT09005 takes %ld nsec INFO\n", TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec)); + fprintf(stdout, "CT09006 END\n"); + + + exit(0); +} diff --git a/test/uti/CT22.c b/test/uti/CT22.c new file mode 100644 index 00000000..627b1beb --- /dev/null +++ b/test/uti/CT22.c @@ -0,0 +1,210 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG + +#ifdef DEBUG +#define dprintf(...) \ + do { \ + char msg[1024]; \ + sprintf(msg, __VA_ARGS__); \ + fprintf(stdout, "%s,%s", __FUNCTION__, msg); \ + } while (0); +#define eprintf(...) \ + do { \ + char msg[1024]; \ + sprintf(msg, __VA_ARGS__); \ + fprintf(stdout, "%s,%s", __FUNCTION__, msg); \ + } while (0); +#else +#define dprintf(...) do { } while (0) +#define eprintf(...) 
do { } while (0) +#endif + +#define NTHR 1 +#define TS2NS(sec, nsec) ((unsigned long)(sec) * 1000000000ULL + (unsigned long)(nsec)) +#define CALC_DELAY (93000) /* 93 usec */ +#define INIT_DELAY (2000) /* 2 usec, CPU sends CTS packet */ +#define NIC_DELAY (3000) /* 3 usec, NIC reads by RDMA-read */ +#define POLL_DELAY (200) /* .2 usec, CPU fetces event queue entry from DRAM */ +#define RESP_DELAY (2000) /* 2 usec, CPU sends DONE packet and updates MPI_Request */ +#define NSPIN 1 +static inline void FIXED_SIZE_WORK(unsigned long *ptr) { + asm volatile("movq %0, %%rax\n\t" + "addq $1, %%rax\n\t" \ + "movq %%rax, %0\n\t" \ + : "+rm" (*ptr) \ + : \ + : "rax", "cc", "memory"); \ +} + +static inline void BULK_FSW(unsigned long n, unsigned long *ptr) { + int j; + for (j = 0; j < (n); j++) { + FIXED_SIZE_WORK(ptr); + } +} + + +pthread_mutex_t ep_lock; /* Ownership of channel instance */ + +struct thr_arg { + int bar_count; /* Barrier before entering loop */ + pthread_mutex_t bar_lock; + pthread_cond_t bar_cond; + pthread_t pthread; + unsigned long mem; /* Per-thread storage */ +}; + +struct thr_arg thr_args[NTHR]; + +unsigned long mem; /* Per-thread storage */ +volatile int nevents; +volatile int terminate; +int wps = 1; /* work per sec */ +double nspw; /* nsec per work */ + +#define N_INIT 10000000 + +void fwq_init(unsigned long *mem) { + struct timespec start, end; + unsigned long nsec; + int i; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + BULK_FSW(N_INIT, mem); + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + nsec = (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec)); + nspw = nsec / (double)N_INIT; + printf("nsec=%ld, nspw=%f\n", nsec, nspw); +} + +void fwq(unsigned long delay_nsec, unsigned long* mem) { + //printf("delay_nsec=%ld,count=%f\n", delay_nsec, delay_nsec / nspw); + BULK_FSW(delay_nsec / nspw, mem); +} + +void mydelay(long delay_nsec, long *mem) { + struct timespec start, end; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + + 
while (1) { + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + if (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec) > delay_nsec) { + break; + } + FIXED_SIZE_WORK(mem); + } +} + +void *progress_fn(void *_arg) { + struct thr_arg *arg = (struct thr_arg *)_arg; + int rc; + int i; + + rc = syscall(732); + if (rc == -1) + fprintf(stdout, "CT09100 progress_fn running on Linux OK\n"); + else { + fprintf(stdout, "CT09100 progress_fn running on McKernel NG\n", rc); + return NULL; + } + + pthread_mutex_lock(&arg->bar_lock); + while(arg->bar_count == 0) { + pthread_cond_wait(&arg->bar_cond, &arg->bar_lock); + } + pthread_mutex_unlock(&arg->bar_lock); + + for (i = 0; i < 100; i++) { + pthread_mutex_lock(&ep_lock); + nevents++; + fwq(random() % 100000000, &mem); /* 0 - 0.1 sec */ + pthread_mutex_unlock(&ep_lock); + while (nevents > 0) { + FIXED_SIZE_WORK(&mem); + } + } + terminate = 1; + return NULL; +} + +int main(int argc, char **argv) { + int rc; + int i; + struct timespec start, end; + + fprintf(stdout, "CT09001 MPI progress thread skelton START\n"); + + rc = syscall(732); + if (rc == -1) + fprintf(stdout, "CT09002 main running on Linux INFO\n"); + else { + fprintf(stdout, "CT09002 main running on McKernel INFO\n"); + } + + fwq_init(&mem); + pthread_mutex_init(&ep_lock, NULL); + + for(i = 0; i < NTHR; i++) { + thr_args[i].bar_count = 0; + pthread_cond_init(&thr_args[i].bar_cond, NULL); + pthread_mutex_init(&thr_args[i].bar_lock, NULL); + } + + rc = syscall(731, 1, NULL); + if (rc) { + fprintf(stdout, "util_indicate_clone rc=%d, errno=%d\n", rc, errno); + fflush(stdout); + } + for (i = 0; i < NTHR; i++) { + rc = pthread_create(&thr_args[i].pthread, NULL, progress_fn, &thr_args[i]); + if (rc){ + fprintf(stdout, "pthread_create: %d\n", rc); + exit(1); + } + } + for (i = 0; i < NTHR; i++) { + pthread_mutex_lock(&thr_args[i].bar_lock); + thr_args[i].bar_count++; + pthread_cond_signal(&thr_args[i].bar_cond); + pthread_mutex_unlock(&thr_args[i].bar_lock); + } + + 
fprintf(stdout, "CT09004 pthread_create OK\n"); + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + /* Start progress */ + pthread_mutex_lock(&ep_lock); + while(1) { + if (terminate) { + break; + } + + /* Event found */ + if (nevents > 0) { + nevents = 0; + } + + pthread_mutex_unlock(&ep_lock); + fwq(random() % 100000000, &mem); /* 0 - 0.1 sec */ + pthread_mutex_lock(&ep_lock); + } + pthread_mutex_unlock(&ep_lock); + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + + for (i = 0; i < NTHR; i++) { + pthread_join(thr_args[i].pthread, NULL); + } + fprintf(stdout, "CT09005 takes %ld nsec INFO\n", TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec)); + fprintf(stdout, "CT09006 END\n"); + + + exit(0); +} diff --git a/test/uti/CT23.c b/test/uti/CT23.c new file mode 100644 index 00000000..69a19991 --- /dev/null +++ b/test/uti/CT23.c @@ -0,0 +1,212 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG + +#ifdef DEBUG +#define dprintf(...) \ + do { \ + char msg[1024]; \ + sprintf(msg, __VA_ARGS__); \ + fprintf(stdout, "%s,%s", __FUNCTION__, msg); \ + } while (0); +#define eprintf(...) \ + do { \ + char msg[1024]; \ + sprintf(msg, __VA_ARGS__); \ + fprintf(stdout, "%s,%s", __FUNCTION__, msg); \ + } while (0); +#else +#define dprintf(...) do { } while (0) +#define eprintf(...) 
do { } while (0) +#endif + +#define NTHR 1 +#define TS2NS(sec, nsec) ((unsigned long)(sec) * 1000000000ULL + (unsigned long)(nsec)) +#define CALC_DELAY (93000) /* 93 usec */ +#define INIT_DELAY (2000) /* 2 usec, CPU sends CTS packet */ +#define NIC_DELAY (3000) /* 3 usec, NIC reads by RDMA-read */ +#define POLL_DELAY (200) /* .2 usec, CPU fetces event queue entry from DRAM */ +#define RESP_DELAY (2000) /* 2 usec, CPU sends DONE packet and updates MPI_Request */ +#define NSPIN 1 +static inline void FIXED_SIZE_WORK(unsigned long *ptr) { + asm volatile("movq %0, %%rax\n\t" + "addq $1, %%rax\n\t" \ + "movq %%rax, %0\n\t" \ + : "+rm" (*ptr) \ + : \ + : "rax", "cc", "memory"); \ +} + +static inline void BULK_FSW(unsigned long n, unsigned long *ptr) { + int j; + for (j = 0; j < (n); j++) { + FIXED_SIZE_WORK(ptr); + } +} + + +pthread_cond_t ep_cond; +pthread_mutex_t ep_lock; /* Ownership of channel instance */ + +struct thr_arg { + int bar_count; /* Barrier before entering loop */ + pthread_mutex_t bar_lock; + pthread_cond_t bar_cond; + pthread_t pthread; + unsigned long mem; /* Per-thread storage */ +}; + +struct thr_arg thr_args[NTHR]; + +unsigned long mem; /* Per-thread storage */ +volatile int nevents; +volatile int terminate; +int wps = 1; /* work per sec */ +double nspw; /* nsec per work */ + +#define N_INIT 10000000 + +void fwq_init(unsigned long *mem) { + struct timespec start, end; + unsigned long nsec; + int i; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + BULK_FSW(N_INIT, mem); + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + nsec = (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec)); + nspw = nsec / (double)N_INIT; + printf("nsec=%ld, nspw=%f\n", nsec, nspw); +} + +void fwq(unsigned long delay_nsec, unsigned long* mem) { + //printf("delay_nsec=%ld,count=%f\n", delay_nsec, delay_nsec / nspw); + BULK_FSW(delay_nsec / nspw, mem); +} + +void mydelay(long delay_nsec, long *mem) { + struct timespec start, end; + 
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + + while (1) { + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + if (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec) > delay_nsec) { + break; + } + FIXED_SIZE_WORK(mem); + } +} + +void *progress_fn(void *_arg) { + struct thr_arg *arg = (struct thr_arg *)_arg; + int rc; + int i; + + rc = syscall(732); + if (rc == -1) + fprintf(stdout, "CT09100 progress_fn running on Linux OK\n"); + else { + fprintf(stdout, "CT09100 progress_fn running on McKernel NG\n", rc); + return NULL; + } + + pthread_mutex_lock(&arg->bar_lock); + while(arg->bar_count == 0) { + pthread_cond_wait(&arg->bar_cond, &arg->bar_lock); + } + pthread_mutex_unlock(&arg->bar_lock); + + /* Start progress */ + pthread_mutex_lock(&ep_lock); + while(1) { + if (terminate) { + break; + } + while(nevents == 0) { + pthread_cond_wait(&ep_cond, &ep_lock); + } + nevents = 0; + pthread_mutex_unlock(&ep_lock); + fwq(random() % 100000000, &mem); /* 0 - 0.1 sec */ + pthread_mutex_lock(&ep_lock); + } + pthread_mutex_unlock(&ep_lock); + return NULL; +} + +int main(int argc, char **argv) { + int rc; + int i; + struct timespec start, end; + + fprintf(stdout, "CT09001 MPI progress thread skelton START\n"); + + rc = syscall(732); + if (rc == -1) + fprintf(stdout, "CT09002 main running on Linux INFO\n"); + else { + fprintf(stdout, "CT09002 main running on McKernel INFO\n"); + } + + fwq_init(&mem); + pthread_cond_init(&ep_cond, NULL); + pthread_mutex_init(&ep_lock, NULL); + + for(i = 0; i < NTHR; i++) { + thr_args[i].bar_count = 0; + pthread_cond_init(&thr_args[i].bar_cond, NULL); + pthread_mutex_init(&thr_args[i].bar_lock, NULL); + } + + rc = syscall(731, 1, NULL); + if (rc) { + fprintf(stdout, "util_indicate_clone rc=%d, errno=%d\n", rc, errno); + fflush(stdout); + } + for (i = 0; i < NTHR; i++) { + rc = pthread_create(&thr_args[i].pthread, NULL, progress_fn, &thr_args[i]); + if (rc){ + fprintf(stdout, "pthread_create: %d\n", rc); + exit(1); + } + } + for (i 
= 0; i < NTHR; i++) { + pthread_mutex_lock(&thr_args[i].bar_lock); + thr_args[i].bar_count++; + pthread_cond_signal(&thr_args[i].bar_cond); + pthread_mutex_unlock(&thr_args[i].bar_lock); + } + + fprintf(stdout, "CT09004 pthread_create OK\n"); + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + for (i = 0; i < 100; i++) { + fwq(random() % 100000000, &mem); /* 0 - 0.1 sec */ + pthread_mutex_lock(&ep_lock); + nevents++; + pthread_cond_signal(&ep_cond); + pthread_mutex_unlock(&ep_lock); + while (nevents > 0) { + FIXED_SIZE_WORK(&mem); + } + } + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + + terminate = 1; + + for (i = 0; i < NTHR; i++) { + pthread_join(thr_args[i].pthread, NULL); + } + fprintf(stdout, "CT09005 takes %ld nsec INFO\n", TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec)); + fprintf(stdout, "CT09006 END\n"); + + + exit(0); +} diff --git a/test/uti/CT24.c b/test/uti/CT24.c new file mode 100644 index 00000000..fcde9496 --- /dev/null +++ b/test/uti/CT24.c @@ -0,0 +1,210 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG + +#ifdef DEBUG +#define dprintf(...) \ + do { \ + char msg[1024]; \ + sprintf(msg, __VA_ARGS__); \ + fprintf(stdout, "%s,%s", __FUNCTION__, msg); \ + } while (0); +#define eprintf(...) \ + do { \ + char msg[1024]; \ + sprintf(msg, __VA_ARGS__); \ + fprintf(stdout, "%s,%s", __FUNCTION__, msg); \ + } while (0); +#else +#define dprintf(...) do { } while (0) +#define eprintf(...) 
do { } while (0) +#endif + +#define NTHR 1 +#define TS2NS(sec, nsec) ((unsigned long)(sec) * 1000000000ULL + (unsigned long)(nsec)) +#define CALC_DELAY (93000) /* 93 usec */ +#define INIT_DELAY (2000) /* 2 usec, CPU sends CTS packet */ +#define NIC_DELAY (3000) /* 3 usec, NIC reads by RDMA-read */ +#define POLL_DELAY (200) /* .2 usec, CPU fetces event queue entry from DRAM */ +#define RESP_DELAY (2000) /* 2 usec, CPU sends DONE packet and updates MPI_Request */ +#define NSPIN 1 +static inline void FIXED_SIZE_WORK(unsigned long *ptr) { + asm volatile("movq %0, %%rax\n\t" + "addq $1, %%rax\n\t" \ + "movq %%rax, %0\n\t" \ + : "+rm" (*ptr) \ + : \ + : "rax", "cc", "memory"); \ +} + +static inline void BULK_FSW(unsigned long n, unsigned long *ptr) { + int j; + for (j = 0; j < (n); j++) { + FIXED_SIZE_WORK(ptr); + } +} + + +pthread_cond_t ep_cond; +pthread_mutex_t ep_lock; /* Ownership of channel instance */ + +struct thr_arg { + int bar_count; /* Barrier before entering loop */ + pthread_mutex_t bar_lock; + pthread_cond_t bar_cond; + pthread_t pthread; + unsigned long mem; /* Per-thread storage */ +}; + +struct thr_arg thr_args[NTHR]; + +unsigned long mem; /* Per-thread storage */ +volatile int nevents; +volatile int terminate; +int wps = 1; /* work per sec */ +double nspw; /* nsec per work */ + +#define N_INIT 10000000 + +void fwq_init(unsigned long *mem) { + struct timespec start, end; + unsigned long nsec; + int i; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + BULK_FSW(N_INIT, mem); + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + nsec = (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec)); + nspw = nsec / (double)N_INIT; + printf("nsec=%ld, nspw=%f\n", nsec, nspw); +} + +void fwq(unsigned long delay_nsec, unsigned long* mem) { + //printf("delay_nsec=%ld,count=%f\n", delay_nsec, delay_nsec / nspw); + BULK_FSW(delay_nsec / nspw, mem); +} + +void mydelay(long delay_nsec, long *mem) { + struct timespec start, end; + 
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + + while (1) { + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + if (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec) > delay_nsec) { + break; + } + FIXED_SIZE_WORK(mem); + } +} + +void *progress_fn(void *_arg) { + struct thr_arg *arg = (struct thr_arg *)_arg; + int rc; + int i; + + rc = syscall(732); + if (rc == -1) + fprintf(stdout, "CT09100 progress_fn running on Linux OK\n"); + else { + fprintf(stdout, "CT09100 progress_fn running on McKernel NG\n", rc); + return NULL; + } + + pthread_mutex_lock(&arg->bar_lock); + while(arg->bar_count == 0) { + pthread_cond_wait(&arg->bar_cond, &arg->bar_lock); + } + pthread_mutex_unlock(&arg->bar_lock); + + for (i = 0; i < 100; i++) { + fwq(random() % 100000000, &mem); /* 0 - 0.1 sec */ + pthread_mutex_lock(&ep_lock); + nevents++; + pthread_cond_signal(&ep_cond); + pthread_mutex_unlock(&ep_lock); + while (nevents > 0) { + FIXED_SIZE_WORK(&mem); + } + } + terminate = 1; + return NULL; +} + +int main(int argc, char **argv) { + int rc; + int i; + struct timespec start, end; + + fprintf(stdout, "CT09001 MPI progress thread skelton START\n"); + + rc = syscall(732); + if (rc == -1) + fprintf(stdout, "CT09002 main running on Linux INFO\n"); + else { + fprintf(stdout, "CT09002 main running on McKernel INFO\n"); + } + + fwq_init(&mem); + pthread_mutex_init(&ep_lock, NULL); + + for(i = 0; i < NTHR; i++) { + thr_args[i].bar_count = 0; + pthread_cond_init(&thr_args[i].bar_cond, NULL); + pthread_mutex_init(&thr_args[i].bar_lock, NULL); + } + + rc = syscall(731, 1, NULL); + if (rc) { + fprintf(stdout, "util_indicate_clone rc=%d, errno=%d\n", rc, errno); + fflush(stdout); + } + for (i = 0; i < NTHR; i++) { + rc = pthread_create(&thr_args[i].pthread, NULL, progress_fn, &thr_args[i]); + if (rc){ + fprintf(stdout, "pthread_create: %d\n", rc); + exit(1); + } + } + for (i = 0; i < NTHR; i++) { + pthread_mutex_lock(&thr_args[i].bar_lock); + thr_args[i].bar_count++; + 
pthread_cond_signal(&thr_args[i].bar_cond); + pthread_mutex_unlock(&thr_args[i].bar_lock); + } + + fprintf(stdout, "CT09004 pthread_create OK\n"); + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + /* Start progress */ + pthread_mutex_lock(&ep_lock); + while(1) { + if (terminate) { + break; + } + while(nevents == 0) { + pthread_cond_wait(&ep_cond, &ep_lock); + } + nevents = 0; + pthread_mutex_unlock(&ep_lock); + fwq(random() % 100000000, &mem); /* 0 - 0.1 sec */ + pthread_mutex_lock(&ep_lock); + } + pthread_mutex_unlock(&ep_lock); + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + + for (i = 0; i < NTHR; i++) { + pthread_join(thr_args[i].pthread, NULL); + } + fprintf(stdout, "CT09005 takes %ld nsec INFO\n", TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec)); + fprintf(stdout, "CT09006 END\n"); + + + exit(0); +} diff --git a/test/uti/CT25.c b/test/uti/CT25.c new file mode 100644 index 00000000..1aa5dd4d --- /dev/null +++ b/test/uti/CT25.c @@ -0,0 +1,163 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For SYS_xxx definitions */ +#include +#include +#include +#include + +pthread_t thr; + +unsigned long mem; /* delay functions issue ld/st instructions on this address */ +double nspw; /* nsec per work */ + +sem_t sem_kick, sem_report; +int nentry, szentry; +char **sendv, **recvv; + + + +/* Timer related macros */ +#define TS2NS(sec, nsec) ((unsigned long)(sec) * 1000000000ULL + (unsigned long)(nsec)) +#define N_INIT 10000000 + +static inline void fixed_size_work(unsigned long *ptr) { + asm volatile("movq %0, %%rax\n\t" + "addq $1, %%rax\n\t" \ + "movq %%rax, %0\n\t" \ + : "+rm" (*ptr) \ + : \ + : "rax", "cc", "memory"); \ +} + +static inline void delay_loop(unsigned long n, unsigned long *ptr) { + int j; + for (j = 0; j < (n); j++) { + fixed_size_work(ptr); + } +} + +void delay_init(unsigned long *mem) { + struct timespec start, end; + unsigned long nsec; + int i; + 
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + delay_loop(N_INIT, mem); + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + nsec = (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec)); + nspw = nsec / (double)N_INIT; + printf("nsec=%ld, nspw=%f\n", nsec, nspw); +} + +void delay_nsec(unsigned long delay_nsec, unsigned long* mem) { + //printf("delay_nsec=%ld,count=%f\n", delay_nsec, delay_nsec / nspw); + delay_loop(delay_nsec / nspw, mem); +} + +void *util_thread(void *arg) { + int rc; + int i; + + rc = syscall(732); + if (rc == -1) + fprintf(stderr, "CT25101 running on Linux CPU OK\n"); + else { + fprintf(stderr, "CT25101 running on Linux CPU NG (%d)\n", rc); + } + + sem_wait(&sem_kick); + + /* Cause remote page fault */ + for (i = 0; i < nentry; i++) { + memset(recvv[i], 0, szentry); + } + + sem_post(&sem_report); + + return NULL; +} + +pid_t gettid(void) +{ + return syscall(SYS_gettid); +} + +int +main(int argc, char **argv) +{ + int ret = 0; + int rc; + int i; + pthread_attr_t attr; + + if(argc == 3) { + szentry = (1ULL << atoi(argv[1])); + nentry = atoi(argv[2]); + } + + if (argc != 3 || szentry == 0) { + fprintf(stderr, "usage: CT25 <# of entries>\n"); + ret = 1; + goto fn_fail; + } + + sem_init(&sem_kick, 0, 0); + sem_init(&sem_report, 0, 0); + + fprintf(stderr, "CT25001 START\n"); + fprintf(stderr, "CT25001 INFO (pid=%d,tid=%d)\n", getpid(), gettid()); + + sendv = malloc(sizeof(char *) * nentry); + if(!sendv) { printf("malloc failed"); goto fn_fail; } + for (i = 0; i < nentry; i++) { + sendv[i] = (char*)mmap(0, szentry, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if(sendv[i] == MAP_FAILED) { printf("mmap failed"); goto fn_fail; } + memset(sendv[i], 0xaa, szentry); + } + + recvv = malloc(sizeof(char *) * nentry); + if(!recvv) { printf("malloc failed"); goto fn_fail; } + for (i = 0; i < nentry; i++) { + recvv[i] = (char*)mmap(0, szentry, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + if(recvv[i] == MAP_FAILED) { 
printf("mmap failed"); goto fn_fail; } + memset(recvv[i], 0, szentry); + } + + rc = syscall(731, 1, NULL); + if (rc) { + fprintf(stderr, "CT25002 util_indicate_clone INFO (rc=%d, errno=%d)\n", rc, errno); + } else { + fprintf(stderr, "CT25002 util_indicate_clone OK\n", rc, errno); + } + + pthread_attr_init(&attr); + //pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + rc = pthread_create(&thr, &attr, util_thread, NULL); + if (rc){ + fprintf(stderr, "pthread_create: %d\n", rc); + exit(1); + } + fprintf(stderr, "CT25002 pthread_create OK\n"); + + sem_post(&sem_kick); + sem_wait(&sem_report); + + pthread_join(thr, NULL); + + fprintf(stderr, "CT25003 END\n"); + ret = 0; + + fn_exit: + exit(ret); + + fn_fail: + goto fn_exit; +} diff --git a/test/uti/CT26.c b/test/uti/CT26.c new file mode 100644 index 00000000..4ca3a8b8 --- /dev/null +++ b/test/uti/CT26.c @@ -0,0 +1,139 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For SYS_xxx definitions */ +#include +#include +#include +#include + +int passed = 0; +pthread_t thr; + +unsigned long mem; /* delay functions issue ld/st instructions on this address */ +double nspw; /* nsec per work */ + +/* Timer related macros */ +#define TS2NS(sec, nsec) ((unsigned long)(sec) * 1000000000ULL + (unsigned long)(nsec)) +#define N_INIT 10000000 + +static inline void fixed_size_work(unsigned long *ptr) { + asm volatile("movq %0, %%rax\n\t" + "addq $1, %%rax\n\t" \ + "movq %%rax, %0\n\t" \ + : "+rm" (*ptr) \ + : \ + : "rax", "cc", "memory"); \ +} + +static inline void delay_loop(unsigned long n, unsigned long *ptr) { + int j; + for (j = 0; j < (n); j++) { + fixed_size_work(ptr); + } +} + +void delay_init(unsigned long *mem) { + struct timespec start, end; + unsigned long nsec; + int i; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + delay_loop(N_INIT, mem); + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + nsec = (TS2NS(end.tv_sec, end.tv_nsec) - 
TS2NS(start.tv_sec, start.tv_nsec)); + nspw = nsec / (double)N_INIT; + printf("nsec=%ld, nspw=%f\n", nsec, nspw); +} + +void delay_nsec(unsigned long delay_nsec, unsigned long* mem) { + //printf("delay_nsec=%ld,count=%f\n", delay_nsec, delay_nsec / nspw); + delay_loop(delay_nsec / nspw, mem); +} + +void halt(int sig) { + while(1) { } +} + +void *util_thread(void *arg) { + int rc; + fprintf(stderr, "CT12101 enter OK\n"); + + rc = syscall(732); + if (rc == -1) + fprintf(stderr, "CT12102 running on Linux CPU OK (tid=%d)\n", syscall(__NR_gettid)); + else { + fprintf(stderr, "CT12102 running on Linux CPU NG (tid=%d,rc=%d)\n", syscall(__NR_gettid), rc); + } + + passed = 1; + + rc = syscall(888); + if (rc != -1) { + fprintf(stderr, "CT12103 syscall(888) OK (%x)\n", rc); + } else { + fprintf(stderr, "CT12103 syscall(888) NG (%x)\n", rc); + } + + return NULL; +} + +int +main(int argc, char **argv) +{ + int rc; + pthread_attr_t attr; + struct sigaction act; + + fprintf(stderr, "CT12001 futex START (tid=%d)\n", syscall(__NR_gettid)); +#if 0 + rc = syscall(731, 1, NULL); + if (rc) { + fprintf(stderr, "util_indicate_clone rc=%d, errno=%d\n", rc, errno); + fflush(stderr); + } +#endif + sigaction(SIGINT, NULL, &act); + act.sa_handler = halt; + act.sa_flags &= ~(SA_RESTART); + sigaction(SIGINT, &act, NULL); + + rc = pthread_attr_init(&attr); + if (rc){ + fprintf(stderr, "pthread_attr_init: %d\n", rc); + exit(1); + } +#if 0 + rc = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + if (rc){ + fprintf(stderr, "pthread_attr_setdetachstate: %d\n", rc); + exit(1); + } +#endif + rc = pthread_create(&thr, &attr, util_thread, NULL); + if (rc){ + fprintf(stderr, "pthread_create: %d\n", rc); + exit(1); + } + fprintf(stderr, "CT12002 pthread_create OK\n"); + +#if 1 + while (!passed) { + asm volatile("pause" ::: "memory"); + } +#endif + usleep(200000); + +#if 1 + pthread_join(thr, NULL); + fprintf(stderr, "CT12004 pthread_join OK\n"); +#endif + //fprintf(stderr, "CT12005 END\n"); + 
exit(0); +} diff --git a/test/uti/CT27.c b/test/uti/CT27.c new file mode 100644 index 00000000..65df96df --- /dev/null +++ b/test/uti/CT27.c @@ -0,0 +1,497 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG + +#ifdef DEBUG +#define dprintf(...) \ + do { \ + char msg[1024]; \ + sprintf(msg, __VA_ARGS__); \ + fprintf(stdout, "%s,%s", __FUNCTION__, msg); \ + } while (0); +#define eprintf(...) \ + do { \ + char msg[1024]; \ + sprintf(msg, __VA_ARGS__); \ + fprintf(stdout, "%s,%s", __FUNCTION__, msg); \ + } while (0); +#else +#define dprintf(...) do { } while (0) +#define eprintf(...) do { } while (0) +#endif + +#define NPROC 1 +#define MAX_NOPS 10 +int NOPS=1;/* RDMA:1, accumulate:10 */ +#define TS2NS(sec, nsec) ((unsigned long)(sec) * 1000000000ULL + (unsigned long)(nsec)) +#define CALC_CPU (100000) /* 100,000 nsec, CPU time for calculation */ +#define I2R_OCC ( 200) /* 200 nsec, occupation time for for sending AM packet */ +#define I2R_NET (1000) /* 1,000 nsec, Network time for packet to arrive at responder */ +int R2I_OCC= (10200/*400*/); /* RDMA:10,200 nsec, accumulate:400ns, occupation time for perforing accumulate or RDMA-RD and sending ACK packet . 
Note that 10GB/s means 100KB/10,000 ns */ +#define R2I_NET (1000) /* 1000 nsec, Network time for packet to arrive at initiator */ +#define POLL_CPU ( 200) /* 200 nsec, CPU time for checking DRAM event queue */ +#define REQ_UPDATE_CPU ( 200) /* 200 nsec, CPU time for updates MPI_Request */ +#define NSPIN 1 +static inline void fixed_size_work() { + asm volatile( + "movq $0, %%rcx\n\t" + "1:\t" + "addq $1, %%rcx\n\t" + "cmpq $99, %%rcx\n\t" + "jle 1b\n\t" + : + : + : "rcx", "cc"); +} + +static inline void bulk_fsw(unsigned long n) { + int j; + for (j = 0; j < (n); j++) { + fixed_size_work(); + } +} + +struct thr_arg { + int rank; + volatile int bar_count; /* Barrier before entering loop */ + pthread_mutex_t bar_lock; + pthread_cond_t bar_cond; + pthread_t pthread; + + pthread_mutex_t ep_lock; /* mutex for endpoint manipulation */ + volatile long ini_ev[MAX_NOPS]; /* events on the responder */ + volatile long res_ev[MAX_NOPS]; /* events on the initiator */ + volatile int terminate; + long ini_busy; /* Initiator is busy sending AM packet or RTS packet etc. */ + long res_busy; /* Responder is busy doing accumulate or RDMA-RD etc. 
*/ +}; + +struct per_proc { + int rank; + struct thr_arg thr_arg; + long nsec; + +}; + +struct proc_glb { + struct per_proc per_procs[NPROC]; + volatile int bar_count; + pthread_mutex_t bar_lock; + pthread_cond_t bar_cond; +}; + +struct proc_glb *proc_glb; + +unsigned long mem; /* Per-thread storage */ +int wps = 1; /* work per sec */ +double nspw; /* nsec per work */ + +#define N_INIT 10000000 + +void fwq_init() { + struct timespec start, end; + unsigned long nsec; + int i; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + bulk_fsw(N_INIT); + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + nsec = (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec)); + nspw = nsec / (double)N_INIT; + printf("nsec=%ld, nspw=%f\n", nsec, nspw); +} + +void fwq(long delay_nsec) { + if (delay_nsec < 0) { + printf("%s: delay_nsec<0\n", __FUNCTION__); + } + //printf("delay_nsec=%ld,count=%f\n", delay_nsec, delay_nsec / nspw); + bulk_fsw(delay_nsec / nspw); +} +int progress_responder(struct thr_arg *thr_arg) { + int ret = 0; + int j; + struct timespec now_ts; + long now_long; + clock_gettime(CLOCK_REALTIME, &now_ts); + now_long = TS2NS(now_ts.tv_sec, now_ts.tv_nsec); + + pthread_mutex_lock(&thr_arg->ep_lock); /* This lock is for consistency */ + for (j = 0; j < NOPS; j++) { + if (thr_arg->res_busy <= now_long && thr_arg->res_ev[j] && thr_arg->res_ev[j] <= now_long) { + //if(thr_arg->rank == 0) { printf("res_ev=%ld,busy=%ld,now=%ld\n", thr_arg->res_ev[j] % 1000000000UL, thr_arg->res_busy % 1000000000UL, now_long % 1000000000UL); } + thr_arg->ini_ev[j] = now_long + R2I_OCC + R2I_NET; + thr_arg->res_ev[j] = 0; + thr_arg->res_busy = now_long + R2I_OCC; /* responder is busy for AM or RDMA-RD etc. 
*/ + ret = 1; + } + } + pthread_mutex_unlock(&thr_arg->ep_lock); + return ret; +} + +int progress_initiator(struct thr_arg* thr_arg) { + int ret = 0; + int j; + struct timespec now_ts; + long now_long; + clock_gettime(CLOCK_REALTIME, &now_ts); + now_long = TS2NS(now_ts.tv_sec, now_ts.tv_nsec); + + pthread_mutex_lock(&thr_arg->ep_lock); + for (j = 0; j < NOPS; j++) { + //if(thr_arg->rank == 0) { printf("ini_ev=%ld,now=%ld\n", thr_arg->ini_ev[j], now_long); } + if (thr_arg->ini_busy <= now_long && thr_arg->ini_ev[j] && thr_arg->ini_ev[j] <= now_long) { + fwq(POLL_CPU); /* Account for cache miss */ + fwq(REQ_UPDATE_CPU); + now_long += POLL_CPU + REQ_UPDATE_CPU; + thr_arg->ini_ev[j] = 0; /* Event is consumed */ + thr_arg->ini_busy = now_long; + ret = 1; + } + } + pthread_mutex_unlock(&thr_arg->ep_lock); + return ret; +} + +void *progress_fn(void *arg) { + struct thr_arg *thr_arg = (struct thr_arg *)arg; + int rc; + int spin_count = 0; + int i, j; + struct timespec now_ts; + long now_long; + + rc = syscall(732); + if (rc == -1) + fprintf(stdout, "CT09100 progress_fn running on Linux OK\n"); + else { + fprintf(stdout, "CT09100 progress_fn running on McKernel NG\n", rc); + } + + printf("progress,enter,rank=%d\n", thr_arg->rank); + + pthread_mutex_lock(&thr_arg->bar_lock); + thr_arg->bar_count++; + if (thr_arg->bar_count == 2) { + if ((rc = pthread_cond_broadcast(&thr_arg->bar_cond))) { + printf("[%d] pthread_cond_broadcast failed,rc=%d\n", thr_arg->rank, rc); + } + } + while (thr_arg->bar_count != 2) { + if ((rc = pthread_cond_wait(&thr_arg->bar_cond, &thr_arg->bar_lock))) { + printf("[%d] pthread_cond_wait failed,rc=%d\n", thr_arg->rank, rc); + } + } + pthread_mutex_unlock(&thr_arg->bar_lock); + + printf("[%d] progress,after barrier\n", thr_arg->rank); + //#define NO_ASYNC +#ifdef NO_ASYNC + return NULL; +#endif + /* Start progress */ + while(1) { + if (thr_arg->terminate) { + break; + } + + if (progress_responder(thr_arg)) { + //if (thr_arg->rank == 0) { 
printf("progress_fn, responder progressed\n"); } + } + + if (progress_initiator(thr_arg)) { + //if (thr_arg->rank == 0) { printf("progress_fn, initiator progressed\n"); } + } + + spin_count++; + if (spin_count >= NSPIN) { + spin_count = 0; + sched_yield(); + } + } + printf("progress,exit,rank=%d\n", thr_arg->rank); + return NULL; +} + +void parent_fn(struct per_proc *per_proc) { + int i, j; + int rc; + char* uti_str; + int uti_val; + struct timespec start, end; + pthread_condattr_t condattr; + pthread_mutexattr_t mutexattr; + struct timespec now_ts; + long now_long; + + printf("[%d] parent_fn,enter,proc_glb=%p,bar_count=%d\n", per_proc->rank, proc_glb, proc_glb->bar_count); + + pthread_mutex_lock(&proc_glb->bar_lock); + proc_glb->bar_count++; + if (proc_glb->bar_count == NPROC) { + if ((rc = pthread_cond_broadcast(&proc_glb->bar_cond))) { + printf("[%d] pthread_cond_broadcast failed,rc=%d\n", per_proc->rank, rc); + } + } + while (proc_glb->bar_count != NPROC) { + if ((rc = pthread_cond_wait(&proc_glb->bar_cond, &proc_glb->bar_lock))) { + printf("[%d] pthread_cond_wait failed,rc=%d\n", per_proc->rank, rc); + } + } + pthread_mutex_unlock(&proc_glb->bar_lock); + + //printf("[%d] parent,after barrier\n", per_proc->rank); + + pthread_mutexattr_init(&mutexattr); + //pthread_mutexattr_setpshared(&mutexattr, PTHREAD_PROCESS_SHARED); + pthread_mutex_init(&per_proc->thr_arg.ep_lock, &mutexattr); + + per_proc->thr_arg.bar_count = 0; + + pthread_condattr_init(&condattr); + //pthread_condattr_setpshared(&condattr, PTHREAD_PROCESS_SHARED); + pthread_cond_init(&per_proc->thr_arg.bar_cond, &condattr); + + pthread_mutexattr_init(&mutexattr); + //pthread_mutexattr_setpshared(&mutexattr, PTHREAD_PROCESS_SHARED); + pthread_mutex_init(&per_proc->thr_arg.bar_lock, &mutexattr); + + uti_str = getenv("DISABLE_UTI"); + uti_val = uti_str ? 
atoi(uti_str) : 0; + if (!uti_val) { + rc = syscall(731, 1, NULL); + if (rc) { + fprintf(stdout, "CT09003 INFO: uti not available (rc=%d)\n", rc); + } else { + fprintf(stdout, "CT09003 INFO: uti available\n"); + } + } else { + fprintf(stdout, "CT09003 INFO: uti disabled\n", rc); + } + + per_proc->thr_arg.rank = per_proc->rank; + rc = pthread_create(&per_proc->thr_arg.pthread, NULL, progress_fn, &per_proc->thr_arg); + if (rc){ + fprintf(stdout, "pthread_create: %d\n", rc); + exit(1); + } + + pthread_mutex_lock(&per_proc->thr_arg.bar_lock); + per_proc->thr_arg.bar_count++; + if (per_proc->thr_arg.bar_count == 2) { + if ((rc = pthread_cond_broadcast(&per_proc->thr_arg.bar_cond))) { + printf("[%d] pthread_cond_broadcast failed,rc=%d\n", per_proc->rank, rc); + } + } + while (per_proc->thr_arg.bar_count != 2) { + if ((rc = pthread_cond_wait(&per_proc->thr_arg.bar_cond, &per_proc->thr_arg.bar_lock))) { + printf("[%d] pthread_cond_wait failed,rc=%d\n", per_proc->rank, rc); + } + } + pthread_mutex_unlock(&per_proc->thr_arg.bar_lock); + + printf("[%d] parent,after barrier\n", per_proc->rank); + //fprintf(stdout, "CT09004 pthread_create OK\n"); + + //#define TIMER_KIND CLOCK_THREAD_CPUTIME_ID +#define TIMER_KIND CLOCK_REALTIME + clock_gettime(TIMER_KIND, &start); + for (i = 0; i < 10000; i++) { /* It takes 1 sec */ + + /* Send request-to-send packet */ + clock_gettime(CLOCK_REALTIME, &now_ts); + now_long = TS2NS(now_ts.tv_sec, now_ts.tv_nsec); + + for (j = 0; j < NOPS; j++) { + pthread_mutex_lock(&per_proc->thr_arg.ep_lock); /* Lock is taken per MPI_Accumulate() */ + fwq(I2R_OCC); + now_long += I2R_OCC; + per_proc->thr_arg.res_ev[j] = now_long + I2R_NET; + per_proc->thr_arg.ini_busy = now_long; + //printf("res_ev=%ld,ini_busy=%ld,now=%ld\n", per_proc->thr_arg.res_ev[j] % 1000000000UL, per_proc->thr_arg.ini_busy % 1000000000UL, now_long % 1000000000UL); + pthread_mutex_unlock(&per_proc->thr_arg.ep_lock); + } + + /* Start calculation */ + fwq(CALC_CPU); + + /* Progress 
responder and initiator */ + int more_reap_needed; + while (1) { + if (progress_responder(&per_proc->thr_arg)) { + //printf("parent_fn, responder progressed\n"); + } + + if (progress_initiator(&per_proc->thr_arg)) { + //printf("parent_fn, initiator progressed\n"); + } + + more_reap_needed = 0; + for (j = 0; j < NOPS; j++) { + if (per_proc->thr_arg.res_ev[j] || per_proc->thr_arg.ini_ev[j]) { + more_reap_needed = 1; + break; + } + } + if (!more_reap_needed) { + break; + } + } + } + clock_gettime(TIMER_KIND, &end); + + per_proc->thr_arg.terminate = 1; + pthread_join(per_proc->thr_arg.pthread, NULL); + + per_proc->nsec = TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec); +} + +static struct option options[] = { + { + .name = "ppn", + .has_arg = required_argument, + .flag = NULL, + .val = 'P', + }, + /* end */ + { NULL, 0, NULL, 0, }, +}; + +int main(int argc, char **argv) { + int rc; + int i; + char *uti_str; + int uti_val; + int st; + pid_t pid; + long max; + pthread_condattr_t condattr; + pthread_mutexattr_t mutexattr; + int fd; + key_t key = ftok(argv[0], 0); + int shmid; + int opt; + + while ((opt = getopt_long(argc, argv, "+ar", options, NULL)) != -1) { + switch (opt) { + case 'a': /* accumulate */ + NOPS = 10; /* ten accumulates */ + R2I_OCC = 400; /* 200 ns to accumulate, 200 ns to send ACK */ + break; + case 'r': + NOPS = 6; /* 3D stencil, RDMA */ + R2I_OCC = 10200; /* 10000 ns to RDMA-RD, 200 ns to send DONE */ + break; + default: /* '?' 
*/ + printf("usage: [-a] [-r]"); + exit(1); + } + } + + fprintf(stdout, "CT09001 MPI progress thread skelton START\n"); + + rc = syscall(732); + if (rc == -1) + fprintf(stdout, "CT09002 main running on Linux INFO\n"); + else { + fprintf(stdout, "CT09002 main running on McKernel INFO\n"); + } + + fwq_init(); + +#define SHMPOSIX 1 +#define SHMSYSV 2 +#define SHMANON 3 +#define SHM_METHOD SHMPOSIX +#if SHM_METHOD==SHMPOSIX + printf("posix1\n"); + if((fd = shm_open("/CT27", O_RDWR | O_CREAT, 0644)) == -1) { + fprintf(stdout, "shm_open failed\n"); + } + if(ftruncate(fd, sizeof(struct proc_glb))) { + fprintf(stdout, "ftruncate failed\n"); + } + proc_glb = mmap(0, sizeof(struct proc_glb), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (proc_glb == (void*)-1) { + fprintf(stdout, "mmap failed\n"); + exit(1); + } +#elif SHM_METHOD==SHMSYSV + printf("sysv1\n"); + if ((shmid = shmget(key, sizeof(struct proc_glb), IPC_CREAT | 0660)) == -1) { + fprintf(stdout, "shmget failed: %s\n", strerror(errno)); + } + proc_glb = shmat(shmid, NULL, 0); + if (proc_glb == (void*)-1) { + fprintf(stdout, "shmat failed\n"); + exit(1); + } +#elif SHM_METHOD==SHMANON + printf("anon1\n"); + proc_glb = mmap(0, sizeof(struct proc_glb), PROT_READ | PROT_WRITE, MAP_SHARED/* | MAP_ANONYMOUS*/, /*-1*/fd, 0); + if (proc_glb == (void*)-1) { + fprintf(stdout, "mmap failed\n"); + exit(1); + } +#endif + + memset(proc_glb, 0, sizeof(struct proc_glb)); + + proc_glb->bar_count = 0; + + pthread_condattr_init(&condattr); + pthread_condattr_setpshared(&condattr, PTHREAD_PROCESS_SHARED); + pthread_cond_init(&proc_glb->bar_cond, &condattr); + + pthread_mutexattr_init(&mutexattr); + pthread_mutexattr_setpshared(&mutexattr, PTHREAD_PROCESS_SHARED); + pthread_mutex_init(&proc_glb->bar_lock, &mutexattr); + + for (i = 0; i < NPROC; i++) { + proc_glb->per_procs[i].rank = i; + printf("[0] i=%d,rank=%d\n", i, proc_glb->per_procs[i].rank); + } + for (i = 1; i < NPROC; i++) { + pid = fork(); + if(pid < 0) { + fprintf(stdout, 
"fork failed: %s\n", strerror(errno)); + exit(1); + } else if (pid == 0) { +#if SHM_METHOD==SHMSYSV + printf("sysv2\n"); + proc_glb = shmat(shmid, NULL, 0); +#endif + printf("[%d] rank=%d\n", i, proc_glb->per_procs[i].rank); + parent_fn(&proc_glb->per_procs[i]); + exit(0); + } + } + parent_fn(&proc_glb->per_procs[0]); + + while ((pid = waitpid(-1, &st, __WALL)) > 0); + + max = -1; + for (i = 0; i < NPROC; i++) { + if (max < proc_glb->per_procs[i].nsec) { + max = proc_glb->per_procs[i].nsec; + } + } + + fprintf(stderr, "max %ld nsec\n", max); + fprintf(stdout, "CT09006 END\n"); +} + diff --git a/test/uti/CT27.sh b/test/uti/CT27.sh new file mode 100755 index 00000000..7561b523 --- /dev/null +++ b/test/uti/CT27.sh @@ -0,0 +1,64 @@ +#!/usr/bin/bash +MYHOME="/work/gg10/e29005" +MCK="${MYHOME}/project/os/install" +MCEXEC= +MCEXECOPT="--enable-uti" +export DISABLE_UTI=0 + +stop=0 +reset=0 +go=0 +acc=0 +nodes="c[8195]" + +while getopts srgamd OPT +do + case ${OPT} in + s) stop=1 + ;; + r) reset=1 + ;; + g) go=1 + ;; + a) acc=1 # accumulate, otherwise RDMA + ;; + m) + MCEXEC="${MCK}/bin/mcexec" + ;; + d) export DISABLE_UTI=1 + ;; + *) echo "invalid option -${OPT}" >&2 + exit 1 + esac +done + +if [ ${acc} -eq 1 ]; then + exeopt="-a" +else + exeopt="-r" +fi + +if [ ${stop} -eq 1 ]; then + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w ${nodes} \ + sudo mount /work + + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w ${nodes} \ + sudo ${MCK}/sbin/mcstop+release.sh +fi + +if [ ${reset} -eq 1 ]; then + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w ${nodes} \ + sudo mount /work + + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w ${nodes} \ + sudo ${MCK}/sbin/mcreboot.sh -c 2-17,20-35,36-51,52-67 -r 2-5:0+6-9:1+10-13:68+14-17:69+20-23:136+24-27:137+28-31:204+32-35:205+36-39:18+40-43:19+44-47:86+48-51:87+52-55:154+56-59:155+60-63:222+64-67:223 -m 32G@0,12G@1 +fi + +if [ ${go} -eq 1 ]; then + make + + > ./log + for i in {1..10}; do (${MCEXEC} ${MCEXECOPT} taskset -c 0-7 ./CT27 $exeopt 1>/dev/null 2>> 
./log); done + perl CT11.pl < ./log + #${MCEXEC} ${MCEXECOPT} taskset -c 0-7 ./CT27 $exeopt +fi diff --git a/test/uti/CT28.c b/test/uti/CT28.c new file mode 100644 index 00000000..d579fb52 --- /dev/null +++ b/test/uti/CT28.c @@ -0,0 +1,441 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG + +#ifdef DEBUG +#define dprintf(...) \ + do { \ + char msg[1024]; \ + sprintf(msg, __VA_ARGS__); \ + fprintf(stdout, "%s,%s", __FUNCTION__, msg); \ + } while (0); +#define eprintf(...) \ + do { \ + char msg[1024]; \ + sprintf(msg, __VA_ARGS__); \ + fprintf(stdout, "%s,%s", __FUNCTION__, msg); \ + } while (0); +#else +#define dprintf(...) do { } while (0) +#define eprintf(...) do { } while (0) +#endif + +#define NPROC 8 +#define NINC 10000 +#define TS2NS(sec, nsec) ((unsigned long)(sec) * 1000000000ULL + (unsigned long)(nsec)) +#define NSPIN 1 + +static inline void fixed_size_work() { + asm volatile( + "movq $0, %%rcx\n\t" + "1:\t" + "addq $1, %%rcx\n\t" + "cmpq $99, %%rcx\n\t" + "jle 1b\n\t" + : + : + : "rcx", "cc"); +} + +static inline void bulk_fsw(unsigned long n) { + int j; + for (j = 0; j < (n); j++) { + fixed_size_work(); + } +} + +struct thr_arg { + int rank; + volatile int bar_count; /* Barrier before entering loop */ + pthread_mutex_t bar_lock; + pthread_cond_t bar_cond; + pthread_t pthread; + + pthread_mutex_t ep_lock; /* mutex for endpoint manipulation */ + volatile long count; /* events on the responder */ + volatile int terminate; +}; + +struct per_proc { + int rank; + struct thr_arg thr_arg; + long nsec; +}; + +struct proc_glb { + struct per_proc per_procs[NPROC]; + volatile int bar_count; + pthread_mutex_t bar_lock; + pthread_cond_t bar_cond; +}; + +struct proc_glb *proc_glb; + +unsigned long mem; /* Per-thread storage */ +int wps = 1; /* work per sec */ +double nspw; /* nsec per work */ + +#define N_INIT 10000000 + +static int 
print_cpu_last_executed_on() { + char fn[256]; + char* result; + pid_t tid = syscall(SYS_gettid); + int fd; + int offset; + int mpi_errno = 0; + + sprintf(fn, "/proc/%d/task/%d/stat", getpid(), (int)tid); + //printf("fn=%s\n", fn); + fd = open(fn, O_RDONLY); + if(fd == -1) { + printf("open() failed\n"); + goto fn_fail; + } + + result = malloc(65536); + if(result == NULL) { + printf("malloc() failed"); + goto fn_fail; + } + + int amount = 0; + offset = 0; + while(1) { + amount = read(fd, result + offset, 65536); + // printf("amount=%d\n", amount); + if(amount == -1) { + printf("read() failed"); + goto fn_fail; + } + if(amount == 0) { + goto eof; + } + offset += amount; + } + eof:; + //printf("result:%s\n", result); + + char* next_delim = result; + char* field; + int i; + for(i = 0; i < 39; i++) { + field = strsep(&next_delim, " "); + } + + int cpu = sched_getcpu(); + if(cpu == -1) { + printf("getpu() failed\n"); + goto fn_fail; + } + + printf("stat-cpu=%02d,sched_getcpu=%02d,tid=%d\n", atoi(field), cpu, tid); fflush(stdout); + fn_exit: + free(result); + return mpi_errno; + fn_fail: + mpi_errno = -1; + goto fn_exit; +} + +void fwq_init() { + struct timespec start, end; + unsigned long nsec; + int i; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + bulk_fsw(N_INIT); + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + nsec = (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec)); + nspw = nsec / (double)N_INIT; + printf("nsec=%ld, nspw=%f\n", nsec, nspw); +} + +void fwq(long delay_nsec) { + if (delay_nsec < 0) { + printf("%s: delay_nsec<0\n", __FUNCTION__); + } + bulk_fsw(delay_nsec / nspw); +} + +void init_bar(struct thr_arg* thr_arg) { + pthread_mutex_lock(&thr_arg->bar_lock); + thr_arg->bar_count= 0; + pthread_mutex_unlock(&thr_arg->bar_lock); +} + +void bar(struct thr_arg* thr_arg) { + int rc; + pthread_mutex_lock(&thr_arg->bar_lock); + thr_arg->bar_count++; + if (thr_arg->bar_count == 2) { + if ((rc = pthread_cond_broadcast(&thr_arg->bar_cond))) { 
+ printf("[%d] pthread_cond_broadcast failed,rc=%d\n", thr_arg->rank, rc); + } + } + while (thr_arg->bar_count != 2) { + if ((rc = pthread_cond_wait(&thr_arg->bar_cond, &thr_arg->bar_lock))) { + printf("[%d] pthread_cond_wait failed,rc=%d\n", thr_arg->rank, rc); + } + } + pthread_mutex_unlock(&thr_arg->bar_lock); +} + +void *progress_fn(void *arg) { + struct thr_arg *thr_arg = (struct thr_arg *)arg; + int rc; + int spin_count = 0; + int i, j; + struct timespec now_ts; + long now_long; + + rc = syscall(732); + if (rc == -1) + fprintf(stdout, "CT09100 progress_fn running on Linux OK\n"); + else { + fprintf(stdout, "CT09100 progress_fn running on McKernel NG\n", rc); + } + + printf("[%d] progress,enter,", thr_arg->rank); + print_cpu_last_executed_on(); + + bar(thr_arg); + + printf("[%d] progress,after barrier\n", thr_arg->rank); + + for (i = 0; i < NINC; i++) { + pthread_mutex_lock(&thr_arg->ep_lock); + thr_arg->count++; + pthread_mutex_unlock(&thr_arg->ep_lock); + sched_yield(); + } + + bar(thr_arg); + printf("progress,exit,rank=%d\n", thr_arg->rank); + + return NULL; +} + +#define TIMER_KIND CLOCK_THREAD_CPUTIME_ID +//#define TIMER_KIND CLOCK_REALTIME + +void parent_fn(struct per_proc *per_proc) { + int i, j; + int rc; + char* uti_str; + int uti_val; + struct timespec start, end; + pthread_condattr_t condattr; + pthread_mutexattr_t mutexattr; + struct timespec now_ts; + long now_long; + + printf("[%d] parent_fn,enter,", per_proc->rank); + print_cpu_last_executed_on(); + + pthread_mutex_lock(&proc_glb->bar_lock); + proc_glb->bar_count++; + if (proc_glb->bar_count == NPROC) { + if ((rc = pthread_cond_broadcast(&proc_glb->bar_cond))) { + printf("[%d] pthread_cond_broadcast failed,rc=%d\n", per_proc->rank, rc); + } + } + while (proc_glb->bar_count != NPROC) { + if ((rc = pthread_cond_wait(&proc_glb->bar_cond, &proc_glb->bar_lock))) { + printf("[%d] pthread_cond_wait failed,rc=%d\n", per_proc->rank, rc); + } + } + pthread_mutex_unlock(&proc_glb->bar_lock); + + + 
pthread_mutexattr_init(&mutexattr); + pthread_mutex_init(&per_proc->thr_arg.ep_lock, &mutexattr); + + per_proc->thr_arg.bar_count = 0; + + pthread_condattr_init(&condattr); + pthread_cond_init(&per_proc->thr_arg.bar_cond, &condattr); + + pthread_mutexattr_init(&mutexattr); + pthread_mutex_init(&per_proc->thr_arg.bar_lock, &mutexattr); + + uti_str = getenv("DISABLE_UTI"); + uti_val = uti_str ? atoi(uti_str) : 0; + if (!uti_val) { + rc = syscall(731, 1, NULL); + if (rc) { + fprintf(stdout, "CT09003 INFO: uti not available (rc=%d)\n", rc); + } else { + fprintf(stdout, "CT09003 INFO: uti available\n"); + } + } else { + fprintf(stdout, "CT09003 INFO: uti disabled\n", rc); + } + + per_proc->thr_arg.rank = per_proc->rank; + rc = pthread_create(&per_proc->thr_arg.pthread, NULL, progress_fn, &per_proc->thr_arg); + if (rc){ + fprintf(stdout, "pthread_create: %d\n", rc); + exit(1); + } + + init_bar(&per_proc->thr_arg); + bar(&per_proc->thr_arg); + + printf("[%d] parent,after barrier\n", per_proc->rank); + + clock_gettime(TIMER_KIND, &start); + for (i = 0; i < NINC; i++) { + pthread_mutex_lock(&per_proc->thr_arg.ep_lock); /* Lock is taken per MPI_Accumulate() */ + per_proc->thr_arg.count++; + pthread_mutex_unlock(&per_proc->thr_arg.ep_lock); + } + init_bar(&per_proc->thr_arg); + bar(&per_proc->thr_arg); + clock_gettime(TIMER_KIND, &end); + + pthread_join(per_proc->thr_arg.pthread, NULL); + + per_proc->nsec = TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec); +} + +static struct option options[] = { + { + .name = "ppn", + .has_arg = required_argument, + .flag = NULL, + .val = 'P', + }, + /* end */ + { NULL, 0, NULL, 0, }, +}; + +int main(int argc, char **argv) { + int rc; + int i; + char *uti_str; + int uti_val; + int st; + pid_t pid; + long max; + pthread_condattr_t condattr; + pthread_mutexattr_t mutexattr; + int fd; + key_t key = ftok(argv[0], 0); + int shmid; + int opt; + + while ((opt = getopt_long(argc, argv, "+", options, NULL)) != -1) { + switch (opt) 
{ + default: /* '?' */ + printf("unknown option: %c\n", optopt); + exit(1); + } + } + + fprintf(stdout, "CT09001 MPI progress thread skelton START\n"); + + rc = syscall(732); + if (rc == -1) + fprintf(stdout, "CT09002 main running on Linux INFO\n"); + else { + fprintf(stdout, "CT09002 main running on McKernel INFO\n"); + } + + fwq_init(); + +#define SHMPOSIX 1 +#define SHMSYSV 2 +#define SHMANON 3 +#define SHM_METHOD SHMPOSIX +#if SHM_METHOD==SHMPOSIX + printf("posix1\n"); + if((fd = shm_open("/CT27", O_RDWR | O_CREAT, 0644)) == -1) { + fprintf(stdout, "shm_open failed\n"); + } + if(ftruncate(fd, sizeof(struct proc_glb))) { + fprintf(stdout, "ftruncate failed\n"); + } + proc_glb = mmap(0, sizeof(struct proc_glb), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (proc_glb == (void*)-1) { + fprintf(stdout, "mmap failed\n"); + exit(1); + } +#elif SHM_METHOD==SHMSYSV + printf("sysv1\n"); + if ((shmid = shmget(key, sizeof(struct proc_glb), IPC_CREAT | 0660)) == -1) { + fprintf(stdout, "shmget failed: %s\n", strerror(errno)); + } + proc_glb = shmat(shmid, NULL, 0); + if (proc_glb == (void*)-1) { + fprintf(stdout, "shmat failed\n"); + exit(1); + } +#elif SHM_METHOD==SHMANON + printf("anon1\n"); + proc_glb = mmap(0, sizeof(struct proc_glb), PROT_READ | PROT_WRITE, MAP_SHARED/* | MAP_ANONYMOUS*/, /*-1*/fd, 0); + if (proc_glb == (void*)-1) { + fprintf(stdout, "mmap failed\n"); + exit(1); + } +#endif + + memset(proc_glb, 0, sizeof(struct proc_glb)); + + proc_glb->bar_count = 0; + + pthread_condattr_init(&condattr); + pthread_condattr_setpshared(&condattr, PTHREAD_PROCESS_SHARED); + pthread_cond_init(&proc_glb->bar_cond, &condattr); + + pthread_mutexattr_init(&mutexattr); + pthread_mutexattr_setpshared(&mutexattr, PTHREAD_PROCESS_SHARED); + pthread_mutex_init(&proc_glb->bar_lock, &mutexattr); + + for (i = 0; i < NPROC; i++) { + proc_glb->per_procs[i].rank = i; + printf("[0] i=%d,rank=%d\n", i, proc_glb->per_procs[i].rank); + } + for (i = 1; i < NPROC; i++) { + pid = fork(); + 
if(pid < 0) { + fprintf(stdout, "fork failed: %s\n", strerror(errno)); + exit(1); + } else if (pid == 0) { +#if SHM_METHOD==SHMSYSV + printf("sysv2\n"); + proc_glb = shmat(shmid, NULL, 0); +#endif + printf("[%d] rank=%d\n", i, proc_glb->per_procs[i].rank); + parent_fn(&proc_glb->per_procs[i]); + exit(0); + } + } + parent_fn(&proc_glb->per_procs[0]); + + while ((pid = waitpid(-1, &st, __WALL)) > 0); + + max = -1; + for (i = 0; i < NPROC; i++) { + if (max < proc_glb->per_procs[i].nsec) { + max = proc_glb->per_procs[i].nsec; + } + } + + fprintf(stderr, "max %ld nsec\n", max); + fprintf(stdout, "CT09006 END\n"); +} + diff --git a/test/uti/CT28.sh b/test/uti/CT28.sh new file mode 100755 index 00000000..7ccef343 --- /dev/null +++ b/test/uti/CT28.sh @@ -0,0 +1,76 @@ +#!/usr/bin/bash -x +MYHOME="/work/gg10/e29005" +MCK="${MYHOME}/project/os/install" +MCEXECOPT="--enable-uti" +export DISABLE_UTI=0 + +stop=0 +reset=0 +go=0 +mck=0; +loop=0 +nodes="c[8195]" +NPROC=8 + +while getopts srglamd OPT +do + case ${OPT} in + s) stop=1 + ;; + r) reset=1 + ;; + g) go=1 + ;; + l) loop=1 + ;; + m) + mck=1 + ;; + d) export DISABLE_UTI=1 + ;; + *) echo "invalid option -${OPT}" >&2 + exit 1 + esac +done + + +if [ ${mck} -eq 1 ]; then + MCEXEC="${MCK}/bin/mcexec" + cpus="0-7" + NUMACTL= +else + MCEXEC= + cpus="2-9" + NUMACTL="numactl -C $cpus" +fi + +if [ ${stop} -eq 1 ]; then + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w ${nodes} \ + sudo mount /work + + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w ${nodes} \ + sudo ${MCK}/sbin/mcstop+release.sh +fi + +if [ ${reset} -eq 1 ]; then + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w ${nodes} \ + sudo mount /work + + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w ${nodes} \ + sudo ${MCK}/sbin/mcreboot.sh `${HOME}/project/src/tools/cpus.pl $NPROC` -m 32G@0,12G@1 + #sudo ${MCK}/sbin/mcreboot.sh -c 2-17,20-35,36-51,52-67 -r 
2-5:0+6-9:1+10-13:68+14-17:69+20-23:136+24-27:137+28-31:204+32-35:205+36-39:18+40-43:19+44-47:86+48-51:87+52-55:154+56-59:155+60-63:222+64-67:223 -m 32G@0,12G@1 +fi + +if [ ${go} -eq 1 ]; then + cd $MYHOME/project/os/mckernel/test/uti + rm -f ./CT28 + make -DNPROC=$NPROC + + if [ ${loop} -eq 1 ]; then + > ./log + for i in {1..10}; do (${MCEXEC} ${MCEXECOPT} $NUMACTL ./CT28 1> ./log1 2>> ./log); done + perl CT11.pl < ./log + else + ${MCEXEC} ${MCEXECOPT} $NUMACTL ./CT28 + fi +fi diff --git a/test/uti/CT29.c b/test/uti/CT29.c new file mode 100644 index 00000000..d5f15ca7 --- /dev/null +++ b/test/uti/CT29.c @@ -0,0 +1,117 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For SYS_xxx definitions */ +#include +#include +#include +#include + +int passed = 0; +pthread_t thr; + +unsigned long mem; /* delay functions issue ld/st instructions on this address */ +double nspw; /* nsec per work */ + +/* Timer related macros */ +#define TS2NS(sec, nsec) ((unsigned long)(sec) * 1000000000ULL + (unsigned long)(nsec)) + +static inline void fixed_size_work() { + asm volatile( + "movq $0, %%rcx\n\t" + "1:\t" + "addq $1, %%rcx\n\t" + "cmpq $99, %%rcx\n\t" + "jle 1b\n\t" + : + : + : "rcx", "cc"); +} + +static inline void bulk_fsw(unsigned long n) { + int j; + for (j = 0; j < (n); j++) { + fixed_size_work(); + } +} + +#define N_INIT 1000000 + +void fwq_init() { + struct timespec start, end; + unsigned long nsec; + int i; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + bulk_fsw(N_INIT); + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + nsec = (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec)); + nspw = nsec / (double)N_INIT; + printf("nsec=%ld, nspw=%f\n", nsec, nspw); +} + +void fwq(long delay_nsec) { + if (delay_nsec < 0) { + printf("%s: delay_nsec<0\n", __FUNCTION__); + } + //printf("delay_nsec=%ld,count=%f\n", delay_nsec, delay_nsec / nspw); + bulk_fsw(delay_nsec / nspw); +} + +void 
*util_thread(void *arg) { + int rc; + + fwq(1000*1000); + + return NULL; +} + +int +main(int argc, char **argv) +{ + int rc; + pthread_attr_t attr; + struct sigaction act; + + fwq_init(); + + fprintf(stderr, "CT29001 INFO start (tid=%d)\n", syscall(__NR_gettid)); + rc = syscall(731, 1, NULL); + if (rc) { + fprintf(stderr, "CT29002 INFO uti not supported (rc=%d, errno=%d)\n", rc, errno); + fflush(stderr); + } + + rc = pthread_attr_init(&attr); + if (rc){ + fprintf(stderr, "pthread_attr_init: %d\n", rc); + exit(1); + } +#if 1 + rc = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + if (rc){ + fprintf(stderr, "pthread_attr_setdetachstate: %d\n", rc); + exit(1); + } +#endif + rc = pthread_create(&thr, &attr, util_thread, NULL); + if (rc){ + fprintf(stderr, "pthread_create: %d\n", rc); + exit(1); + } + fprintf(stderr, "CT29003 pthread_create OK\n"); + + fwq(100*1000*1000); + +#if 0 + pthread_join(thr, NULL); + fprintf(stderr, "CT29004 pthread_join OK\n"); +#endif + exit(0); +} diff --git a/test/uti/CT30.c b/test/uti/CT30.c new file mode 100644 index 00000000..34a97ef0 --- /dev/null +++ b/test/uti/CT30.c @@ -0,0 +1,177 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "util.h" + +#define NTHR 1 +#define TS2NS(sec, nsec) ((unsigned long)(sec) * 1000000000ULL + (unsigned long)(nsec)) +#define CALC_DELAY (93000) /* 93 usec */ +#define INIT_DELAY (2000) /* 2 usec, CPU sends CTS packet */ +#define NIC_DELAY (3000) /* 3 usec, NIC reads by RDMA-read */ +#define POLL_DELAY (200) /* .2 usec, CPU fetces event queue entry from DRAM */ +#define RESP_DELAY (2000) /* 2 usec, CPU sends DONE packet and updates MPI_Request */ +#define NSPIN 1 +static inline void FIXED_SIZE_WORK(unsigned long *ptr) { + asm volatile("movq %0, %%rax\n\t" + "addq $1, %%rax\n\t" \ + "movq %%rax, %0\n\t" \ + : "+rm" (*ptr) \ + : \ + : "rax", "cc", "memory"); \ +} + +static inline void BULK_FSW(unsigned long n, unsigned long 
*ptr) { + int j; + for (j = 0; j < (n); j++) { + FIXED_SIZE_WORK(ptr); + } +} + + +pthread_mutex_t ep_lock; /* Ownership of channel instance */ +pthread_barrier_t bar; + +struct thr_arg { + pthread_t pthread; + unsigned long mem; /* Per-thread storage */ +}; + +struct thr_arg thr_args[NTHR]; + +unsigned long mem; /* Per-thread storage */ +volatile int nevents; +volatile int terminate; +int wps = 1; /* work per sec */ +double nspw; /* nsec per work */ + +#define N_INIT 10000000 + +void fwq_init(unsigned long *mem) { + struct timespec start, end; + unsigned long nsec; + int i; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + BULK_FSW(N_INIT, mem); + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + nsec = (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec)); + nspw = nsec / (double)N_INIT; + printf("[INFO] nsec=%ld, nspw=%f\n", nsec, nspw); +} + +void fwq(unsigned long delay_nsec, unsigned long* mem) { + //printf("delay_nsec=%ld,count=%f\n", delay_nsec, delay_nsec / nspw); + BULK_FSW(delay_nsec / nspw, mem); +} + +void fwq_omp(unsigned long delay_nsec, unsigned long* mem) { +#pragma omp parallel + { + BULK_FSW(delay_nsec / nspw, mem); + } +} + +void mydelay(long delay_nsec, long *mem) { + struct timespec start, end; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + + while (1) { + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + if (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec) > delay_nsec) { + break; + } + FIXED_SIZE_WORK(mem); + } +} + +void *util_fn(void *_arg) { + struct thr_arg *arg = (struct thr_arg *)_arg; + int ret; + int i; + + ret = syscall(732); + OKNGNOJUMP(ret == -1, "util_fn running on Linux, tid=%d\n", syscall(SYS_gettid)); + + pthread_barrier_wait(&bar); + + /* Start progress */ + while (1) { + pthread_mutex_lock(&ep_lock); + if (terminate) { + pthread_mutex_unlock(&ep_lock); + break; + } + + if (nevents > 0) { + nevents--; + fwq(random() % 100000000, &mem); /* 0 - 0.1 sec */ + } + 
pthread_mutex_unlock(&ep_lock); + } + + fn_fail: + return NULL; +} + +int main(int argc, char **argv) { + int ret; + int i; + struct timespec start, end; + + ret = syscall(732); + OKNGNOJUMP(ret != -1, "Master is running on McKernel\n"); + + fwq_init(&mem); + pthread_mutex_init(&ep_lock, NULL); + + pthread_barrier_init(&bar, NULL, NTHR + 1); + + if ((ret = syscall(731, 1, NULL))) { + fprintf(stdout, "Error: util_indicate_clone: %s\n", strerror(errno)); + } + + for (i = 0; i < NTHR; i++) { + if ((ret = pthread_create(&thr_args[i].pthread, NULL, util_fn, &thr_args[i]))) { + fprintf(stdout, "Error: pthread_create: %s\n", strerror(errno)); + exit(1); + } + } + + pthread_barrier_wait(&bar); + +#pragma omp parallel for + for (i = 0; i < omp_get_num_threads(); i++) { + printf("[INFO] thread_num=%d,tid=%d\n", i, syscall(SYS_gettid)); + } + + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + for (i = 0; i < 10; i++) { + pthread_mutex_lock(&ep_lock); + nevents++; + fwq_omp(random() % 100000000, &mem); /* 0 - 0.1 sec */ + pthread_mutex_unlock(&ep_lock); + + while (nevents > 0) { + FIXED_SIZE_WORK(&mem); + } + } + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + + terminate = 1; + + for (i = 0; i < NTHR; i++) { + pthread_join(thr_args[i].pthread, NULL); + } + + printf("[INFO] Time: %ld usec\n", (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec)) / 1000); + + ret = 0; + fn_fail: + return ret; +} diff --git a/test/uti/CT30.sh b/test/uti/CT30.sh new file mode 100755 index 00000000..fc0198fa --- /dev/null +++ b/test/uti/CT30.sh @@ -0,0 +1,92 @@ +#!/usr/bin/bash + +bn=`basename $0` +fn=`echo $bn | sed 's/.sh//'` + +stop=0 +reboot=0 +go=0 +mck=0 +NNODES=1 +NPROC=$((16 * NNODES)) +LASTNODE=8200 + +while getopts srgmN:P:L: OPT +do + case ${OPT} in + s) stop=1 + ;; + r) reboot=1 + ;; + g) go=1 + ;; + m) mck=1 + ;; + N) NNODES=$OPTARG + ;; + P) NPROC=$OPTARG + ;; + L) LASTNODE=$OPTARG + ;; + *) echo "invalid option -${OPT}" >&2 + exit 1 + esac +done + 
+MYHOME=/work/gg10/e29005 +ABS_SRCDIR=${MYHOME}/project/os/mckernel/test/uti +MCK=${MYHOME}/project/os/install + +NODES=`echo $(seq -s ",c" $(($LASTNODE + 1 - $NNODES)) $LASTNODE) | sed 's/^/c/'` +PPN=$((NPROC / NNODES)) +echo NPROC=$NPROC NNODES=$NNODES PPN=$PPN NODES=$NODES + +if [ ${mck} -eq 1 ]; then + MCEXEC="${MCK}/bin/mcexec" + mcexecopt="--enable-uti" +else + MCEXEC= + mcexecopt= +fi + +if [ ${stop} -eq 1 ]; then + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \ + sudo mount /work + + if [ ${mck} -eq 1 ]; then + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \ + /sbin/pidof mcexec \| xargs -r kill -9 + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \ + sudo ${MCK}/sbin/mcstop+release.sh + else + : + fi +fi + +if [ ${reboot} -eq 1 ]; then + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \ + sudo mount /work + + if [ ${mck} -eq 1 ]; then + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \ + sudo ${MCK}/sbin/mcreboot.sh -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1 + else + : + fi +fi + +if [ ${go} -eq 1 ]; then + cd $ABS_SRCDIR + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \ + ulimit -u 16384; + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \ + ulimit -s unlimited + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \ + ulimit -c unlimited + + export KMP_STACKSIZE=64M + export OMP_NUM_THREADS=4 + + $MCEXEC $mcexecopt ./$fn +fi + diff --git 
a/test/uti/CT31.c b/test/uti/CT31.c new file mode 100644 index 00000000..e5f839de --- /dev/null +++ b/test/uti/CT31.c @@ -0,0 +1,158 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "util.h" + +#define WAITER_CPU 0 +#define WAKER_CPU 1 + +pthread_mutex_t mutex; +pthread_cond_t cond; +pthread_barrier_t bar; +int flag; +pthread_t thr; +long t_cond_wait, t_fwq; +long nloop; +long blocktime = 10L * 1000 * 1000; + +void *util_fn(void *arg) +{ + int i; + int ret; + long start, end; + + print_cpu_last_executed_on("Utility thread"); + + ret = syscall(732); + OKNGNOJUMP(ret == -1, "Utility thread is running on Linux\n"); + + pthread_barrier_wait(&bar); + for (i = 0; i < nloop; i++) { + start = rdtsc_light(); + + fwq(blocktime); + + end = rdtsc_light(); + t_fwq += end - start; + + pthread_mutex_lock(&mutex); + flag = 1; + pthread_cond_signal(&cond); + pthread_mutex_unlock(&mutex); + + } + + fn_fail: + return NULL; +} + +static struct option options[] = { + /* end */ + { NULL, 0, NULL, 0, } +}; + +int main(int argc, char **argv) +{ + int i; + int ret; + long start, end; + cpu_set_t cpuset; + pthread_attr_t attr; + pthread_barrierattr_t bar_attr; + struct sched_param param = { .sched_priority = 99 }; + int opt; + + while ((opt = getopt_long(argc, argv, "+b:", options, NULL)) != -1) { + switch (opt) { + case 'b': + blocktime = atoi(optarg); + break; + default: /* '?' 
*/ + printf("unknown option %c\n", optopt); + exit(1); + } + } + nloop = (10 * 1000000000UL) / blocktime; + printf("[INFO] nloop=%ld,blocktime=%ld\n", nloop, blocktime); + + + CPU_ZERO(&cpuset); + CPU_SET(WAITER_CPU, &cpuset); + if ((ret = sched_setaffinity(0, sizeof(cpu_set_t), &cpuset))) { + printf("Error: sched_setaffinity: %s\n", strerror(errno)); + goto fn_fail; + } + print_cpu_last_executed_on("Master thread"); + + fwq_init(); + + pthread_mutex_init(&mutex, NULL); + pthread_cond_init(&cond, NULL); + + pthread_barrierattr_init(&bar_attr); + pthread_barrier_init(&bar, &bar_attr, 2); + + ret = syscall(732); + OKNGNOJUMP(ret != -1, "Master thread is running on McKernel\n"); + + ret = syscall(731, 1, NULL); + OKNGNOJUMP(ret != -1, "util_indicate_clone\n"); + + if ((ret = pthread_attr_init(&attr))) { + printf("%s: Error: pthread_attr_init failed (%d)\n", __FUNCTION__, ret); + goto fn_fail; + } + + CPU_ZERO(&cpuset); + CPU_SET(WAKER_CPU, &cpuset); + + if ((ret = pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpuset))) { + printf("%s: Error: pthread_attr_setaffinity_np failed (%d)\n", __FUNCTION__, ret); + goto fn_fail; + } + + if ((ret = pthread_create(&thr, &attr, util_fn, NULL))) { + fprintf(stderr, "Error: pthread_create failed (%d)\n", ret); + goto fn_fail; + } + + if ((ret = sched_setscheduler(0, SCHED_FIFO, ¶m))) { + fprintf(stderr, "Error: sched_setscheduler failed (%d)\n", ret); + goto fn_fail; + } + + syscall(701, 1 | 2); + pthread_barrier_wait(&bar); + for (i = 0; i < nloop; i++) { + start = rdtsc_light(); + + pthread_mutex_lock(&mutex); /* no futex */ + while(!flag) { + pthread_cond_wait(&cond, &mutex); /* 1st futex */ + } + flag = 0; + pthread_mutex_unlock(&mutex); /* 2nd futex */ + + end = rdtsc_light(); + t_cond_wait += end - start; + } + syscall(701, 4 | 8); + + pthread_join(thr, NULL); + printf("[INFO] waker: %ld cycles, waiter: %ld cycles, (waiter - waker) / nloop: %ld cycles\n", t_fwq, t_cond_wait, (t_cond_wait - t_fwq) / nloop); + + ret = 
0; + fn_fail: + return ret; +} diff --git a/test/uti/CT31.sh b/test/uti/CT31.sh new file mode 100755 index 00000000..8a6a6e28 --- /dev/null +++ b/test/uti/CT31.sh @@ -0,0 +1,102 @@ +#!/usr/bin/bash + +bn=`basename $0` +fn=`echo $bn | sed 's/.sh//'` + +nloop=800 +stop=0 +reboot=0 +go=0 +mck=0 +NNODES=1 +NPROC=$((1 * NNODES)) +LASTNODE=8200 +use_hfi=0 + +while getopts srgmh:N:P:L: OPT +do + case ${OPT} in + s) stop=1 + ;; + r) reboot=1 + ;; + g) go=1 + ;; + m) mck=1 + ;; + h) use_hfi=1 + ;; + N) NNODES=$OPTARG + ;; + P) NPROC=$OPTARG + ;; + L) LASTNODE=$OPTARG + ;; + *) echo "invalid option -${OPT}" >&2 + exit 1 + esac +done + +MYHOME=/work/gg10/e29005 +ABS_SRCDIR=${MYHOME}/project/os/mckernel/test/uti +MCK=${MYHOME}/project/os/install + +NODES=`echo $(seq -s ",c" $(($LASTNODE + 1 - $NNODES)) $LASTNODE) | sed 's/^/c/'` +PPN=$((NPROC / NNODES)) +echo NPROC=$NPROC NNODES=$NNODES PPN=$PPN NODES=$NODES + +if [ ${mck} -eq 1 ]; then + MCEXEC="${MCK}/bin/mcexec" + mcexecopt="--enable-uti" + if [ ${use_hfi} -eq 1 ]; then + mcexecopt="--enable-hfi1 $mcexecopt" + fi +else + MCEXEC= + mcexecopt= +fi + +if [ ${stop} -eq 1 ]; then + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \ + sudo mount /work + + if [ ${mck} -eq 1 ]; then + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \ + /sbin/pidof mcexec \| xargs -r kill -9 + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \ + sudo ${MCK}/sbin/mcstop+release.sh + else + : + fi +fi + +if [ ${reboot} -eq 1 ]; then + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \ + sudo mount /work + + if [ ${mck} -eq 1 ]; then + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \ + sudo ${MCK}/sbin/mcreboot.sh -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 
2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1 + else + : + fi +fi + +if [ ${go} -eq 1 ]; then + cd $ABS_SRCDIR + make $fn + + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \ + ulimit -u 16384; + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \ + ulimit -s unlimited + + for((count=0;count +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "util.h" + +#define WAITER_CPU 0 +#define WAKER_CPU 1 + +int sem; +pthread_barrier_t bar; +int flag; +pthread_t thr; +long t_futex_wait, t_fwq; +long nloop; +long blocktime = 10L * 1000 * 1000; + +void *util_fn(void *arg) +{ + int i; + int ret; + long start, end; + int testid = 32101; + + print_cpu_last_executed_on("Utility thread"); + + ret = syscall(732); + OKNGNOJUMP(ret == -1, "Utility thread is running on Linux\n"); + + pthread_barrier_wait(&bar); + + for (i = 0; i < nloop; i++) { + start = rdtsc_light(); + + fwq(blocktime); + + end = rdtsc_light(); + t_fwq += end - start; + + if ((ret = syscall(__NR_futex, &sem, FUTEX_WAKE, 1, NULL, NULL, 0)) == -1) { + printf("Error: futex wake: %s\n", strerror(errno)); + } + + //pthread_barrier_wait(&bar); + + } + + ret = 0; + fn_fail: + return NULL; +} + +static struct option options[] = { + /* end */ + { NULL, 0, NULL, 0, } +}; + +int main(int argc, char **argv) +{ + int i; + int ret; + long start, end; + cpu_set_t cpuset; + pthread_attr_t attr; + pthread_barrierattr_t bar_attr; + struct sched_param param = { 
.sched_priority = 99 }; + int opt; + + while ((opt = getopt_long(argc, argv, "+b:", options, NULL)) != -1) { + switch (opt) { + case 'b': + blocktime = atoi(optarg); + break; + default: /* '?' */ + printf("unknown option %c\n", optopt); + exit(1); + } + } + nloop = (10 * 1000000000UL) / blocktime; + printf("[INFO] nloop=%ld,blocktime=%ld\n", nloop, blocktime); + + + CPU_ZERO(&cpuset); + CPU_SET(WAITER_CPU, &cpuset); + if ((ret = sched_setaffinity(0, sizeof(cpu_set_t), &cpuset))) { + printf("Error: sched_setaffinity: %s\n", strerror(errno)); + goto fn_fail; + } + print_cpu_last_executed_on("Master thread"); + + fwq_init(); + + pthread_barrierattr_init(&bar_attr); + pthread_barrier_init(&bar, &bar_attr, 2); + + if ((ret = pthread_attr_init(&attr))) { + printf("Error: pthread_attr_init: %s\n", strerror(errno)); + goto fn_fail; + } + +#if 0 + uti_attr_t uti_attr; + ret = uti_attr_init(&uti_attr); + if (ret) { + printf("%s: Error: uti_attr_init failed (%d)\n", __FUNCTION__, ret); + exit(1); + } + + /* Give a hint that it's beneficial to prioritize it in scheduling. 
*/ + ret = UTI_ATTR_HIGH_PRIORITY(&uti_attr); + if (ret) { + printf("%s: Error: UTI_ATTR_HIGH_PRIORITY failed (%d)\n", __FUNCTION__, ret); + exit(1); + } + + if ((ret = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED))) { + printf("%s: Error: pthread_attr_setdetachstate failed (%d)\n", __FUNCTION__, ret); + exit(1); + } + + if ((ret = uti_pthread_create(&thr, &attr, progress_function, NULL, &uti_attr))) { + printf("%s: Error: uti_pthread_create: %s\n", __FUNCTION__, strerror(errno)); + exit(1); + } + + if ((ret = uti_attr_destroy(&uti_attr))) { + printf("%s: Error: uti_attr_destroy failed (%d)\n", __FUNCTION__, ret); + exit(1); + } +#else + CPU_ZERO(&cpuset); + CPU_SET(WAKER_CPU, &cpuset); + + if ((ret = pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpuset))) { + printf("Error: pthread_attr_setaffinity_np: %s\n", strerror(errno)); + goto fn_fail; + } + + ret = syscall(732); + OKNGNOJUMP(ret != -1, "Master thread is running on McKernel\n"); + + ret = syscall(731, 1, NULL); + OKNGNOJUMP(ret != -1, "util_indicate_clone\n"); + + if ((ret = pthread_create(&thr, &attr, util_fn, NULL))) { + printf("Error: pthread_create: %s\n", strerror(errno)); + goto fn_fail; + } + +#endif + + if ((ret = sched_setscheduler(0, SCHED_FIFO, ¶m))) { + printf("Error: sched_setscheduler: %s\n", strerror(errno)); + ret = -errno; + goto fn_fail; + } + + syscall(701, 1 | 2); + pthread_barrier_wait(&bar); + start = rdtsc_light(); + for (i = 0; i < nloop; i++) { + + if ((ret = syscall(__NR_futex, &sem, FUTEX_WAIT, 0, NULL, NULL, 0))) { + printf("Error: futex wait failed (%s)\n", strerror(errno)); + } + + //pthread_barrier_wait(&bar); /* 2nd futex */ + } + end = rdtsc_light(); + t_futex_wait += end - start; + syscall(701, 4 | 8); + + pthread_join(thr, NULL); + printf("[INFO] waiter: %ld cycles, waker: %ld cycles, (waiter - waker) / nloop: %ld cycles\n", t_fwq, t_futex_wait, (t_futex_wait - t_fwq) / nloop); + + ret = 0; + fn_fail: + return ret; +} diff --git a/test/uti/CT32.sh 
b/test/uti/CT32.sh new file mode 100755 index 00000000..854cc27f --- /dev/null +++ b/test/uti/CT32.sh @@ -0,0 +1,104 @@ +#!/usr/bin/bash + +bn=`basename $0` +fn=`echo $bn | sed 's/.sh//'` + +stop=0 +reboot=0 +go=0 +mck=0 +disable_uti=1 +NNODES=1 +NPROC=$((1 * NNODES)) +LASTNODE=8200 +use_hfi=0 + +while getopts srgmh:N:P:L:d: OPT +do + case ${OPT} in + s) stop=1 + ;; + r) reboot=1 + ;; + g) go=1 + ;; + m) mck=1 + ;; + h) use_hfi=1 + ;; + d) disable_uti=$OPTARG + ;; + N) NNODES=$OPTARG + ;; + P) NPROC=$OPTARG + ;; + L) LASTNODE=$OPTARG + ;; + *) echo "invalid option -${OPT}" >&2 + exit 1 + esac +done + +MYHOME=/work/gg10/e29005 +ABS_SRCDIR=${MYHOME}/project/os/mckernel/test/uti +MCK=${MYHOME}/project/os/install + +NODES=`echo $(seq -s ",c" $(($LASTNODE + 1 - $NNODES)) $LASTNODE) | sed 's/^/c/'` +PPN=$((NPROC / NNODES)) +echo NPROC=$NPROC NNODES=$NNODES PPN=$PPN NODES=$NODES + +if [ $disable_uti -eq 1 ]; then + export DISABLE_UTI=1 +else + unset DISABLE_UTI +fi + +if [ ${mck} -eq 1 ]; then + MCEXEC="${MCK}/bin/mcexec" + mcexecopt="--enable-uti" + if [ ${use_hfi} -eq 1 ]; then + mcexecopt="--enable-hfi1 $mcexecopt" + fi +else + MCEXEC= + mcexecopt= +fi + +if [ ${stop} -eq 1 ]; then + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \ + sudo mount /work + + if [ ${mck} -eq 1 ]; then + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \ + /sbin/pidof mcexec \| xargs -r kill -9 + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \ + sudo ${MCK}/sbin/mcstop+release.sh + else + : + fi +fi + +if [ ${reboot} -eq 1 ]; then + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \ + sudo mount /work + + if [ ${mck} -eq 1 ]; then + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \ + sudo ${MCK}/sbin/mcreboot.sh -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 
2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1 + else + : + fi +fi + +if [ ${go} -eq 1 ]; then + cd $ABS_SRCDIR + make $fn + + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \ + ulimit -u 16384; + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \ + ulimit -s unlimited + + sudo $MCEXEC $mcexecopt ./$fn +fi + diff --git a/test/uti/CT33.c b/test/uti/CT33.c new file mode 100644 index 00000000..7a2a9f96 --- /dev/null +++ b/test/uti/CT33.c @@ -0,0 +1,167 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "util.h" + +#define WAITER_CPU 0 +#define WAKER_CPU 1 + +int sem; +pthread_barrier_t bar; +int flag; +pthread_t thr; +long t_fwq, t_futex_wake, t_futex_wait; +long t_fwq2; +long nloop; +long blocktime = 10 * 1000 * 1000L; + +void *util_fn(void *arg) +{ + int i; + int ret; + long start, end; + long start2, end2; + + print_cpu_last_executed_on("Utility thread"); + + ret = syscall(732); + OKNGNOJUMP(ret == -1, "Utility thread is running on Linux\n"); + + /* Measure fwq time */ + start = rdtsc_light(); + for (i = 0; i < nloop; i++) { + fwq(blocktime); + } + end = rdtsc_light(); + t_fwq2 += end - start; + + /* Measure fwq + futex time */ + syscall(701, 1 | 2 | 0x80000000); + pthread_barrier_wait(&bar); + start = rdtsc_light(); + for (i = 0; i < nloop; i++) { + start2 = rdtsc_light(); + + fwq(blocktime); + + end2 = rdtsc_light(); + t_fwq += end2 - start2; + + if ((ret = 
syscall(__NR_futex, &sem, FUTEX_WAKE, 1, NULL, NULL, 0)) != 1) { + printf("Error: futex wake failed (%d,%s)\n", ret, strerror(errno)); + } + + //pthread_barrier_wait(&bar); + } + end = rdtsc_light(); + t_futex_wake += end - start; + + syscall(701, 4 | 8 | 0x80000000); + + fn_fail: + return NULL; +} + +static struct option options[] = { + /* end */ + { NULL, 0, NULL, 0, } +}; + +int main(int argc, char **argv) +{ + int i, j; + int ret; + long start, end; + cpu_set_t cpuset; + pthread_attr_t attr; + pthread_barrierattr_t bar_attr; + struct sched_param param = { .sched_priority = 99 }; + int opt; + + while ((opt = getopt_long(argc, argv, "+b:", options, NULL)) != -1) { + switch (opt) { + case 'b': + blocktime = atoi(optarg); + break; + default: /* '?' */ + printf("unknown option %c\n", optopt); + exit(1); + } + } + nloop = 10 * 1000000000UL / blocktime; + printf("[INFO] nloop=%ld,blocktime=%ld\n", nloop, blocktime); + + + CPU_ZERO(&cpuset); + CPU_SET(WAITER_CPU, &cpuset); + if ((ret = sched_setaffinity(0, sizeof(cpu_set_t), &cpuset))) { + printf("Error: sched_setaffinity: %s\n", strerror(errno)); + goto fn_fail; + } + print_cpu_last_executed_on("Master thread"); + + fwq_init(); + + pthread_barrierattr_init(&bar_attr); + pthread_barrier_init(&bar, &bar_attr, 2); + + ret = syscall(732); + OKNGNOJUMP(ret != -1, "Master thread is running on McKernel\n"); + + ret = syscall(731, 1, NULL); + OKNGNOJUMP(ret != -1, "util_indicate_clone\n"); + + if ((ret = pthread_attr_init(&attr))) { + printf("Error: pthread_attr_init failed: %s\n", strerror(errno)); + goto fn_fail; + } + + CPU_ZERO(&cpuset); + CPU_SET(WAKER_CPU, &cpuset); + + if ((ret = pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpuset))) { + printf("Error: pthread_attr_setaffinity_np: %s\n", strerror(errno)); + goto fn_fail; + } + + if ((ret = pthread_create(&thr, &attr, util_fn, NULL))) { + printf("Error: pthread_create: %s\n", strerror(errno)); + goto fn_fail; + } + + if ((ret = sched_setscheduler(0, 
SCHED_FIFO, ¶m))) { + printf("Error: sched_setscheduler: %s\n", strerror(errno)); + goto fn_fail; + } + + pthread_barrier_wait(&bar); + start = rdtsc_light(); + for (i = 0; i < nloop; i++) { + + if ((ret = syscall(__NR_futex, &sem, FUTEX_WAIT, 0, NULL, NULL, 0))) { + printf("Error: futex wait: %s\n", strerror(errno)); + } + + //pthread_barrier_wait(&bar); + } + end = rdtsc_light(); + t_futex_wait += end - start; + + pthread_join(thr, NULL); + printf("[INFO] compute: %ld, wake: %ld, wait: %ld, wake - compute: %ld, wait - compute: %ld (cycles)\n", t_fwq, t_futex_wake, t_futex_wait, (t_futex_wake - t_fwq) / nloop, (t_futex_wait - t_fwq) / nloop); + + fn_fail: + return ret; +} diff --git a/test/uti/CT33.sh b/test/uti/CT33.sh new file mode 100755 index 00000000..5c83ba1c --- /dev/null +++ b/test/uti/CT33.sh @@ -0,0 +1,93 @@ +#!/usr/bin/bash + +bn=`basename $0` +fn=`echo $bn | sed 's/.sh//'` + +stop=0 +reboot=0 +go=0 +mck=0 +NNODES=1 +NPROC=$((1 * NNODES)) +LASTNODE=8200 +use_hfi=0 + +while getopts srgmh:N:P:L: OPT +do + case ${OPT} in + s) stop=1 + ;; + r) reboot=1 + ;; + g) go=1 + ;; + m) mck=1 + ;; + h) use_hfi=1 + ;; + N) NNODES=$OPTARG + ;; + P) NPROC=$OPTARG + ;; + L) LASTNODE=$OPTARG + ;; + *) echo "invalid option -${OPT}" >&2 + exit 1 + esac +done + +MYHOME=/work/gg10/e29005 +ABS_SRCDIR=${MYHOME}/project/os/mckernel/test/uti +MCK=${MYHOME}/project/os/install + +nodes=`echo $(seq -s ",c" $(($LASTNODE + 1 - $NNODES)) $LASTNODE) | sed 's/^/c/'` +PPN=$((NPROC / NNODES)) +echo NPROC=$NPROC NNODES=$NNODES PPN=$PPN nodes=$nodes + +if [ "`cat /etc/mtab | while read line; do cut -d" " -f 2; done | grep /work`" == "" ]; then + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes sudo mount /work +fi + +if [ ${mck} -eq 1 ]; then + MCEXEC="${MCK}/bin/mcexec" + mcexecopt="--enable-uti" + if [ ${use_hfi} -eq 1 ]; then + mcexecopt="--enable-hfi1 $mcexecopt" + fi +else + MCEXEC= + mcexecopt= +fi + +if [ ${stop} -eq 1 ]; then + if [ ${mck} -eq 1 ]; then + PDSH_SSH_ARGS_APPEND="-tt 
-q" pdsh -t 2 -w $nodes \ + /sbin/pidof mcexec \| xargs -r kill -9 + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \ + sudo ${MCK}/sbin/mcstop+release.sh + else + : + fi +fi + +if [ ${reboot} -eq 1 ]; then + if [ ${mck} -eq 1 ]; then + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \ + sudo ${MCK}/sbin/mcreboot.sh -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1 + else + : + fi +fi + +if [ ${go} -eq 1 ]; then + cd $ABS_SRCDIR + make $fn + + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \ + ulimit -u 16384; + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \ + ulimit -s unlimited + + sudo $MCEXEC $mcexecopt ./$fn +fi + diff --git a/test/uti/CT34.c b/test/uti/CT34.c new file mode 100644 index 00000000..f4c8a98b --- /dev/null +++ b/test/uti/CT34.c @@ -0,0 +1,62 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "util.h" + +void *util_fn(void *arg) +{ + int ret; + ret = syscall(732); + OKNGNOJUMP(ret == -1, "Utility thread is running on Linux\n"); + fn_fail: + return NULL; +} + +int my_thread_create() +{ + pthread_t thr; + int ret = 0; + + ret = syscall(731, 1, NULL); + OKNGNOJUMP(ret == 0, "util_indicate_clone,ret=%d,errno=%d\n", ret, errno); + + if ((ret = pthread_create(&thr, NULL, util_fn, NULL))) { + printf("Error: pthread_create: %s\n", strerror(errno)); + } + + if ((ret = 
pthread_join(thr, NULL))) { + printf("Error: pthread_join: %s\n", strerror(errno)); + } + + fn_exit: + return ret; + + fn_fail: + ret = -1; + goto fn_exit; +} + +int +main(int argc, char **argv) +{ + int ret = 0; + + if ((ret = my_thread_create())) { + printf("Error: my_thread_create,ret=%d\n", ret); + } + + fn_exit: + return ret; + + fn_fail: + ret = -1; + goto fn_exit; +} diff --git a/test/uti/CT35.sh b/test/uti/CT35.sh new file mode 100755 index 00000000..48bac59f --- /dev/null +++ b/test/uti/CT35.sh @@ -0,0 +1,36 @@ +#!/usr/bin/bash + +mck_dir=/work/gg10/e29005/project/os/install +nloop=800 +exe=CT26 +mck=1 + +mcexec="${mck_dir}/bin/mcexec" + +sudo ${mck_dir}/sbin/mcstop+release.sh +sudo ${mck_dir}/sbin/mcreboot.sh -c 1,2,3 -m 512M + +ulimit -c unlimited + +for((count=0;count SIGSEGV発生 +□ CT01007 munmap OK (SIGSEGV) + Linux スレッド終了 + メインスレッドにて pthread_join。成功 +□ CT01008 exit (pthread_join) OK +□ CT01009 futex (pthread_mutex/pthread_cond) OK +□ CT01010 END + +CT02 システムコールテスト mremap +□ CT02001 mremap START + Linuxにスレッドを生成 (pthread_create)。成功 +□ CT02002 pthread_create OK + get_system() の戻り値が -1 (Linux で動作) +□ CT02003 get_system OK + mmap 発行。戻り値が (void *)-1 以外 +□ CT02004 mmap OK + mmap 領域の縮小予定の領域に "mmap OK" を書き込む + Linuxスレッドからメインスレッドに対して cond_signal + メインスレッドにて、mmap 領域の書き込んだ領域を参照 (mmap 領域の内容を表示) +□ CT02005 mmap OK + メインスレッドからLinuxスレッドに cond_signal + Linux スレッドにて mmap 領域を mremap して縮小。戻り値が 0 +□ CT02006 mremap OK + Linuxスレッドからメインスレッドに対して cond_signal + メインスレッドが mmap 領域の縮小した領域を参照 -> SIGSEGV発生 +□ CT02007 mremap OK (SIGSEGV) + メインスレッドからLinuxスレッドに cond_signal + Linux スレッドにて mmap 領域を munmap。戻り値が 0 +□ CT02008 munmap OK + Linux スレッド終了 + メインスレッドにて pthread_join。成功 +□ CT02009 pthread_join OK +□ CT02010 END + +CT03 システムコールテスト mprotect +□ CT03001 mprotect START + Linuxにスレッドを生成 (pthread_create)。成功 +□ CT03002 pthread_create OK + get_system() の戻り値が -1 (Linux で動作) +□ CT03003 get_system OK + mmap 発行。戻り値が (void *)-1 以外 +□ CT03004 mmap OK + mmap 領域に "mmap OK" を書き込む + 
Linuxスレッドからメインスレッドに対して cond_signal + メインスレッドにて、mmap 領域を参照 (mmap 領域の内容を表示) +□ CT03005 mmap OK + メインスレッドからLinuxスレッドに cond_signal + Linux スレッドにて mmap 領域を mprotect して参照権のみ設定。戻り値が 0 +□ CT03006 mprotect OK + Linuxスレッドからメインスレッドに対して cond_signal + メインスレッドが mmap 領域に書き込み -> SIGSEGV発生 +□ CT03007 mremap OK (SIGSEGV) + メインスレッドからLinuxスレッドに cond_signal + Linux スレッドにて mmap 領域を munmap。戻り値が 0 +□ CT03008 munmap OK + Linux スレッド終了 + メインスレッドにて pthread_join。成功 +□ CT03009 pthread_join OK +□ CT03010 END + +CT04 システムコールテスト brk +□ CT04001 brk START + Linuxにスレッドを生成 (pthread_create)。成功 +□ CT04002 pthread_create OK + get_system() の戻り値が -1 (Linux で動作) +□ CT04003 get_system OK + sbrk(0)発行。戻り値を保存…(A) +□ CT04004 sbrk OK + sbrk(4096)発行。戻り値を保存…(B) + (A)の場所に "sbrk OK" を書き込む + Linuxスレッドからメインスレッドに対して cond_signal + メインスレッドにて、(A) 領域を参照 (領域の内容を表示) +□ CT04005 sbrk OK + メインスレッドにてsbrk(0)発行。戻り値を保存…(C) + メインスレッドからLinuxスレッドに cond_signal + Linuxスレッドでsbrk(0)発行。戻り値が(C)と一致している +□ CT04006 sbrk OK + Linux スレッド終了 + メインスレッドにて pthread_join。成功 +□ CT04007 pthread_join OK +□ CT04008 END + +CT05 システムコールテスト gettid +□ CT05001 gettid START + McKernelにスレッドを生成 (pthread_create)。成功 +□ CT05002 pthread_create OK + get_system() の戻り値が 0 (McKernel で動作) +□ CT05003 get_system OK + gettid() の戻り値を保存…(A) +□ CT05004 gettid OK %d + util_migrate_inter_kernel 発行。戻り値が 0 +□ CT05005 util_migrate_inter_kernel OK + get_system() の戻り値が -1 (Linux で動作) +□ CT05006 get_system OK + gettid() の戻り値が(A)と一致 +□ CT05007 gettid OK %d + Linux スレッド終了 + メインスレッドにて pthread_join。成功 +□ CT05008 pthread_join OK +□ CT05009 END + +CT06 システムコールテスト exit_group +□ CT06001 exit_group START + fork して子プロセス生成。以下、子プロセスの処理 + Linuxにスレッドを生成 (pthread_create)。成功 +□ CT06002 pthread_create OK + get_system() の戻り値が -1 (Linux で動作) +□ CT06003 get_system OK + Linuxスレッドが exit_group(99) +□ CT06004 pthread_join NG が表示されない + 親プロセスが wait。子プロセスの終了ステータスが 99 +□ CT06004 exit_group OK +□ CT06005 END + +CT07 システムコールテスト エラー系 +□ CT07001 error START + Linuxにスレッドを生成 (pthread_create)。成功 +□ CT07002 pthread_create 
OK + get_system() の戻り値が -1 (Linux で動作) +□ CT07003 get_system OK + clone() の戻り値が -1 で errno が ENOSYS +□ CT07004 clone OK %d + fork() の戻り値が -1 で errno が ENOSYS +□ CT07005 fork OK %d + vfork() の戻り値が -1 で errno が ENOSYS + +□ CT07006 vfork OK %d + execve() の戻り値が -1 で errno が ENOSYS +※ syscall_interceptの不具合によりvforkはSegmentation faultを起こすため、除外している + +□ CT07007 execve OK %d + Linux スレッド終了 + メインスレッドにて pthread_join。成功 +□ CT07008 pthread_join OK +□ CT07009 END + +CT08 uti_attr_t関連 +uti_attr_t の動作は実行環境によって変化するため、機械的にOK/NGの判断ができない。 +このため、affinityとschedulerを目視確認して、OK/NGを判断して下さい。 +UTI_FLAG_SAME_NUMA_DOMAIN のテストを容易にするため、mcreboot では特定のNUMA +ドメインにCPUを寄せて下さい。 +また、UTI_FLAG_SAME_L1のテストを容易にするために、論理コアを1つ以上空けるように +CPUを割り当てて下さい。 + +sched cpu には Linux に生成したスレッドの sched_getaffinity の結果を表示する。 +sched には同じく sched_getscheduler の結果を表示する。 + +□ CT08001 UTI_FLAG_NUMA_SET + sched cpu に NUMA domain 2 に属すLinux CPU集合が表示されること。 + sched=0 であること。 +□ CT08002 UTI_FLAG_NUMA_SET|UTI_FLAG_EXCLUSIVE_CPU + sched cpu に NUMA domain 2 に属すLinux CPUの内、1つが表示されること。 + (CT08001 のCPU集合のメンバであること) + sched=1 であること。 +□ CT08003 UTI_FLAG_NUMA_SET|UTI_FLAG_EXCLUSIVE_CPU(2) + sched cpu に NUMA domain 2 に属すLinux CPUの内、1つが表示されること。 + sched cpu は CT08002 とは異なるCPUが表示されていること(ラウンドロビン)。 + sched=1 であること。 +□ CT08004 UTI_FLAG_SAME_NUMA_DOMAIN + sched cpu にMcKernelに割り当てたCPUと同じNUMAドメインに属すLinux CPU集合が + 表示されること。 + sched=0 であること。 +□ CT08005 UTI_FLAG_SAME_NUMA_DOMAIN|UTI_FLAG_CPU_INTENSIVE + sched cpu に NUMA domain 2 に属すLinux CPUの内、1つが表示されること。 + (CT08004 のCPU集合のメンバであること) + sched=0 であること。 +□ CT08006 UTI_FLAG_DIFFERENT_NUMA_DOMAIN + sched cpu にMcKernelに割り当てたCPUと異なるNUMAドメインに属すLinux CPU集合が + 表示されること。 + sched=0 であること。 +□ CT08007 UTI_FLAG_DIFFERENT_NUMA_DOMAIN|UTI_FLAG_HIGH_PRIORITY + sched cpu にMcKernelに割り当てたCPUと異なるNUMAドメインに属すLinux CPU集合の + 内、1つが表示されること。(CT08006 のCPU集合のメンバであること) + sched=1 であること。 +□ CT08008 UTI_FLAG_SAME_L1 + sched cpu にMcKernelの親プロセスが実行するCPUとL1キャッシュを共有するLinuxの + CPU集合が表示されること。(McKernelへのCPU割り当て状態に依存するが、2論理コア + 
/物理コアの場合、高々1CPUのみが該当する。該当コアが存在しない場合は、全ての + コアが対象となる)。 + sched=0 であること。 +□ CT08009 UTI_FLAG_SAME_L1|UTI_FLAG_NON_COOPERATIVE + sched cpu にMcKernelの親プロセスが実行するCPUとL1キャッシュを共有するLinuxの + CPUの内1つが表示されること。(CT08008のCPU集合のメンバ。但し、CT08008で該当 + CPUが存在しない場合は、全てのコアが対象になる)。 + sched=0 であること。 +□ CT08010 UTI_FLAG_SAME_L2 + sched cpu にMcKernelの親プロセスが実行するCPUとL2キャッシュを共有するLinuxの + CPU集合が表示されること。(McKernelへのCPU割り当て状態に依存するが、2論理コア + /物理コアの場合、高々1CPUのみが該当する。該当コアが存在しない場合は、全ての + コアが対象となる)。 + sched=0 であること。 +□ CT08011 UTI_FLAG_SAME_L2|UTI_FLAG_CPU_INTENSIVE + sched cpu にMcKernelの親プロセスが実行するCPUとL2キャッシュを共有するLinuxの + CPUの内1つが表示されること。(CT08010のCPU集合のメンバ。但し、CT08010で該当 + CPUが存在しない場合は、全てのコアが対象になる)。 + sched=0 であること。 +□ CT08012 UTI_FLAG_SAME_L3 + sched cpu にMcKernelの親プロセスが実行するCPUとL3キャッシュを共有するLinuxの + CPU集合が表示されること。 + sched=0 であること。 +□ CT08013 UTI_FLAG_SAME_L3|UTI_FLAG_CPU_INTENSIVE + sched cpu にMcKernelの親プロセスが実行するCPUとL3キャッシュを共有するLinuxの + CPUの内1つが表示されること。(CT08012のCPU集合のメンバ)。 + sched=0 であること。 +□ CT08014 UTI_FLAG_DIFFERENT_L1 + sched cpu にMcKernelの親プロセスが実行するCPUとL1キャッシュを共有しない + LinuxのCPU集合が表示されること。 + sched=0 であること。 +□ CT08015 UTI_FLAG_DIFFERENT_L1|UTI_FLAG_CPU_INTENSIVE + sched cpu にMcKernelの親プロセスが実行するCPUとL1キャッシュを共有しない + LinuxのCPUの内、1つが表示されること(CT08014のCPU集合のメンバ)。 + sched=0 であること。 +□ CT08016 UTI_FLAG_DIFFERENT_L2 + sched cpu にMcKernelの親プロセスが実行するCPUとL2キャッシュを共有しない + LinuxのCPU集合が表示されること。 + sched=0 であること。 +□ CT08017 UTI_FLAG_DIFFERENT_L2|UTI_FLAG_CPU_INTENSIVE + sched cpu にMcKernelの親プロセスが実行するCPUとL2キャッシュを共有しない + LinuxのCPUの内、1つが表示されること(CT08016のCPU集合のメンバ)。 + sched=0 であること。 +□ CT08018 UTI_FLAG_DIFFERENT_L3 + sched cpu にMcKernelの親プロセスが実行するCPUとL3キャッシュを共有しない + LinuxのCPU集合が表示されること。 + sched=0 であること。 +□ CT08019 UTI_FLAG_DIFFERENT_L3|UTI_FLAG_CPU_INTENSIVE + sched cpu にMcKernelの親プロセスが実行するCPUとL3キャッシュを共有しない + LinuxのCPUの内、1つが表示されること(CT08018のCPU集合のメンバ)。 + sched=0 であること。 + +CT09 プログレス処理オーバーヘッド測定 + +MPI通信処理とMPIプログレス処理とのロック競合を模すことで、MPIプログレス処理の +オーバーヘッドを測定する。 + +MPI通信処理のステップは以下の通り。 +(1) 1usの間オブジェクトをロック +(2) 30usの間計算を行う 
+MPIプログレス処理のステップは以下の通り。 +(1) 10msに一回オブジェクトをロック +(2) 通信が終了したタイミングに重なった場合は2usの処理を行う。そうでない + 場合は直ちにアンロックする + +CT10 pthread_cond_{wait,signal}() [OK] + +CT11 measure time of system calls [OK] + +CT12 child (helper thread) futex() wait [OK] + +CT13 parent futex() wait [OK] + +CT14 child pthread_lock wait [OK] + +CT15 parent pthread_lock wait [OK] + +CT16 child pthread_cond_wait [OK] +Linuxはcondで起きる。その後mutexで起きたりしない。 + +CT17 parent pthread_cond_wait [OK] +McKernelはcondで起きる。その後mutexで起きる。 + +CT18 child (helper thread) futex() wait with FUTEX_WAIT_BITSET | FUTEX_CLOCK_REALTIME and non-zero timeout [OK] + +CT19 child (helper thread) futex() wait with FUTEX_WAIT_BITSET and non-zero timeout [OK] + +CT20 child (helper thread) futex() wait with FUTEX_WAIT and non-zero timeout [OK] + +CT21 progress-threadのlockタイミングを変化させたテスト + +CT22 compute-threadのlockタイミングを変化させたテスト + +CT23 progress-threadのcond_waitタイミングを変化させたテスト + +CT24 compute-threadのcond_waitタイミングを変化させたテスト + +CT25 MPI_Isend()でのプロセス終了時メモリ破壊不具合のスケルトン。パラメタは以下の通り。 +* 1MB x 250 (./CT25 20 250) +* 128K x 1024 (./CT25 17 1024) + +CT26 終了時レースコンディションのテスト +* thread->statusがPS_EXITEDの場合もhold_thread()を呼んでデッドロックする不具合のテスト + +CT27 プログレス処理オーバーヘッド測定 +* CT09の複数プロセス版。async progressによってオーバーサブスクライブになった場合のオーバーヘッドを測定する。 + +CT28 taskset -c 0-7 lock-inc-lock x 10000 + +CT29 no reverse offload + +CT30 CT21にopenmpスレッドを追加したテスト + +CT31 pthread_cond_waitオーバーヘッド測定 +* waiterとwakerのCPUは、それぞれ、WAITER_CPU、WAKER_CPUで設定 + +CT32 futex waitオーバーヘッド測定 +* waiterとwakerのCPUは、それぞれ、WAITER_CPU、WAKER_CPUで設定 + +CT33 futex wakeオーバーヘッド測定 +* waiterとwakerのCPUは、それぞれ、WAITER_CPU、WAKER_CPUで設定 + +CT34 繰り返しpthread_create + +CT35 LD_PRELOADでsyscall_interceptを用いたsoをつけた場合のテスト \ No newline at end of file diff --git a/test/uti/driver/Makefile b/test/uti/driver/Makefile new file mode 100644 index 00000000..5d0a015b --- /dev/null +++ b/test/uti/driver/Makefile @@ -0,0 +1,12 @@ +obj-m += hello.o + +hello-y = driver.o + +.PHONY: clean install modules + +modules: + $(MAKE) -C /lib/modules/$(shell 
uname -r)/build M=$(PWD) modules
+
+clean:
+	$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*
+
diff --git a/test/uti/driver/driver.c b/test/uti/driver/driver.c
new file mode 100644
index 00000000..262aaea9
--- /dev/null
+++ b/test/uti/driver/driver.c
@@ -0,0 +1,93 @@
+/*
+ * This file is created by mixing the following two codes.
+ *
+ * URL: https://www.apriorit.com/dev-blog/195-simple-driver-for-linux-os
+ * Author: Danil Ishkov, Apriorit
+ *
+ * URL: http://www.linuxdevcenter.com/pub/a/linux/2007/07/05/devhelloworld-a-simple-introduction-to-device-drivers-under-linux.html
+ * Author: Valerie Henson
+ *
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+
+/* Accept every open(); the device keeps no per-open state. */
+static int hello_open(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+
+/* Nothing to release because hello_open() acquires nothing. */
+static int hello_release(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+
+/* Accept every ioctl request and report success without acting on it. */
+static long hello_ioctl(struct file *file, unsigned int request, unsigned long arg)
+{
+	return 0;
+}
+
+static struct file_operations fops = {
+	.open = hello_open,
+	.release = hello_release,
+	.unlocked_ioctl = hello_ioctl,
+};
+
+/* Major number stored by register_device(); 0 means "not registered". */
+static int device_file_major_number = 0;
+static const char device_name[] = "hello";
+/*
+ * Register the "hello" character device, asking register_chrdev() to pick
+ * the major number (first argument 0). Returns 0 on success or the negative
+ * value returned by register_chrdev().
+ */
+static int register_device(void)
+{
+	int result = 0;
+	result = register_chrdev( 0, device_name, &fops );
+	if( result < 0 ) {
+		printk( KERN_WARNING "hello: register_chrdev failed,result=%i\n", result );
+		return result;
+	}
+	device_file_major_number = result;
+	printk( KERN_NOTICE "hello: major number=%i,try \"grep hello /proc/devices\"\n", device_file_major_number );
+	return 0;
+}
+
+/* Unregister the character device if register_device() stored a major number. */
+void unregister_device(void)
+{
+	printk( KERN_NOTICE "hello: unregister_device() is called\n" );
+	if(device_file_major_number != 0) {
+		unregister_chrdev(device_file_major_number, device_name);
+	}
+}
+
+/*
+ * Module entry point. Propagates the registration result so that a failed
+ * register_chrdev() makes module loading fail instead of being ignored.
+ */
+static int __init hello_init(void)
+{
+	return register_device();
+}
+
+module_init(hello_init);
+
+/* Module exit point: drop the character device registration. */
+static void __exit hello_exit(void)
+{
+	unregister_device();
+}
+
+module_exit(hello_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR(" Danil Ishkov, Apriorit and Valerie Henson");
+MODULE_DESCRIPTION("Module that does nothing");
+MODULE_VERSION("1.0");
diff --git a/test/uti/mpi/001.c b/test/uti/mpi/001.c
new file mode 100755
index 00000000..2584f3a0
--- /dev/null
+++ b/test/uti/mpi/001.c
@@ -0,0 +1,231 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include /* For SYS_xxx definitions */
+#include
+
+//#define DEBUG
+#ifdef DEBUG
+#define dprintf printf
+#else
+#define dprintf {}
+#endif
+
+#define SZENTRY_DEFAULT (65536) /* Size of one slot */
+#define NENTRY_DEFAULT 10000 /* Number of slots */
+
+#define DIFFNSEC(end, start) ((end.tv_sec - start.tv_sec) * 1000000000UL + (end.tv_nsec - start.tv_nsec))
+
+/*
+ * Print the CPU this thread last ran on, as reported both by field 39 of
+ * /proc/<pid>/task/<tid>/stat and by sched_getcpu().
+ * Returns 0 on success and -1 when any step fails.
+ */
+static int print_cpu_last_executed_on() {
+	enum { STAT_BUFSZ = 65536, STAT_CPU_FIELD = 39 };
+	char fn[256];
+	char *result = NULL; /* stays NULL on early failure so free() is a no-op */
+	char *next_delim;
+	char *field = NULL;
+	const char *pmi_rank;
+	pid_t tid = syscall(SYS_gettid);
+	int fd;
+	int offset = 0;
+	int cpu;
+	int i;
+	int mpi_errno = 0;
+
+	snprintf(fn, sizeof(fn), "/proc/%d/task/%d/stat", getpid(), (int)tid);
+	fd = open(fn, O_RDONLY);
+	if (fd == -1) {
+		printf("open() failed\n");
+		goto fn_fail;
+	}
+
+	result = malloc(STAT_BUFSZ);
+	if (result == NULL) {
+		printf("malloc() failed");
+		goto fn_fail;
+	}
+
+	/* Read until EOF, never past the buffer; one byte is kept for the NUL. */
+	while (offset < STAT_BUFSZ - 1) {
+		ssize_t amount = read(fd, result + offset, STAT_BUFSZ - 1 - offset);
+		if (amount == -1) {
+			printf("read() failed");
+			goto fn_fail;
+		}
+		if (amount == 0) {
+			break;
+		}
+		offset += amount;
+	}
+	result[offset] = '\0'; /* strsep() needs a terminated string */
+
+	/* Walk to the STAT_CPU_FIELD-th space-separated field. */
+	next_delim = result;
+	for (i = 0; i < STAT_CPU_FIELD; i++) {
+		field = strsep(&next_delim, " ");
+	}
+	if (field == NULL) {
+		printf("stat has fewer than %d fields\n", STAT_CPU_FIELD);
+		goto fn_fail;
+	}
+
+	cpu = sched_getcpu();
+	if (cpu == -1) {
+		printf("sched_getcpu() failed\n");
+		goto fn_fail;
+	}
+
+	/* getenv() returns NULL when PMI_RANK is unset; print -1 in that case. */
+	pmi_rank = getenv("PMI_RANK");
+	printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,tid=%d\n", pmi_rank ? atoi(pmi_rank) : -1, atoi(field), cpu, tid); fflush(stdout);
+ fn_exit:
+	if (fd != -1) {
+		close(fd);
+	}
+	free(result);
+	return mpi_errno;
+ fn_fail:
+	mpi_errno = -1;
+	goto fn_exit;
+}
+
+void
sendrecv(int rank, int nentry, char **sendv, char **recvv, int szentry, int src, int dest, MPI_Request* reqs, MPI_Status* status, double usec) { + int i; + if(rank == 1) { + for(i = 0; i < nentry; i++) { + MPI_Isend(sendv[i], szentry, MPI_CHAR, dest, 0, MPI_COMM_WORLD, &reqs[i]); + if (nentry > 10 && i % (nentry / 10) == 0) { + printf("s"); fflush(stdout); + } + } + MPI_Waitall(nentry, reqs, status); + printf("w\n"); fflush(stdout); + } else { + for(i = 0; i < nentry; i++) { + MPI_Irecv(recvv[i], szentry, MPI_CHAR, src, 0, MPI_COMM_WORLD, &reqs[i]); + if (nentry > 10 && i % (nentry / 10) == 0) { + printf("r"); fflush(stdout); + } + } + usleep(usec); + MPI_Waitall(nentry, reqs, status); + printf("W\n"); fflush(stdout); + } +} + +int main(int argc, char **argv) { + int my_rank = -1, size = -1; + int i, j; + char **sendv, **recvv; + MPI_Status* status; + MPI_Request* reqs; + long szentry; + long nentry; + int src, dest; + struct timespec start, end; + double diffusec; + + if(argc == 3) { + szentry = atoi(argv[1]); + nentry = atoi(argv[2]); + } else { + szentry = SZENTRY_DEFAULT; + nentry = NENTRY_DEFAULT; + } + printf("szentry=%ld,nentry=%ld\n", szentry, nentry); + + status = (MPI_Status*)malloc(sizeof(MPI_Status) * nentry); + reqs = (MPI_Request*)malloc(sizeof(MPI_Request) * nentry); + + int actual; + + MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual); + printf("Thread support level is %d\n", actual); + + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + src = (size + my_rank - 1) % size; + dest = (my_rank + 1) % size; + + printf("rank=%d, size=%d, src=%d, dest=%d\n", my_rank, size, src, dest); + + sendv = malloc(sizeof(char *) * nentry); + if(!sendv) { printf("malloc failed"); goto fn_fail; } + for (i = 0; i < nentry; i++) { +#if 0 + int fd; + fd = open("./file", O_RDWR); + if(fd == -1) { printf("open failed\n"); goto fn_fail; } + sendv[i] = (char*)mmap(0, szentry, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); +#else + 
sendv[i] = (char*)mmap(0, szentry, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); +#endif + if(sendv[i] == MAP_FAILED) { printf("mmap failed"); goto fn_fail; } + dprintf("[%d] sendv[%d]=%p\n", my_rank, i, sendv[i]); + memset(sendv[i], 0xaa, szentry); + } + + recvv = malloc(sizeof(char *) * nentry); + if(!recvv) { printf("malloc failed"); goto fn_fail; } + for (i = 0; i < nentry; i++) { +#if 0 + int fd; + fd = open("./file", O_RDWR); + if(fd == -1) { printf("open failed\n"); goto fn_fail; } + recvv[i] = (char*)mmap(0, szentry, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0); +#else + recvv[i] = (char*)mmap(0, szentry, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); +#endif + if(recvv[i] == MAP_FAILED) { printf("mmap failed"); goto fn_fail; } + dprintf("[%d] recvv[%d]=%p\n", my_rank, i, recvv[i]); + memset(recvv[i], 0, szentry); + } + + printf("after memset\n"); + + print_cpu_last_executed_on(); + + for (i = 0; i < 1; i++) { + MPI_Barrier(MPI_COMM_WORLD); + if(my_rank == 0) { + clock_gettime(CLOCK_REALTIME, &start); + } + sendrecv(my_rank, nentry, sendv, recvv, szentry, src, dest, reqs, status, 0); + MPI_Barrier(MPI_COMM_WORLD); + if(my_rank == 0) { + clock_gettime(CLOCK_REALTIME, &end); + diffusec = DIFFNSEC(end, start) / (double)1000; + printf("%4.4f sec\n", DIFFNSEC(end, start) / (double)1000000000); fflush(stdout); + } + + MPI_Barrier(MPI_COMM_WORLD); + if(my_rank == 0) { + clock_gettime(CLOCK_REALTIME, &start); + } + sendrecv(my_rank, nentry, sendv, recvv, szentry, src, dest, reqs, status, diffusec); + MPI_Barrier(MPI_COMM_WORLD); + if(my_rank == 0) { + clock_gettime(CLOCK_REALTIME, &end); + printf("%4.4f sec\n", DIFFNSEC(end, start) / (double)1000000000); fflush(stdout); + } + } + + fn_exit: + MPI_Finalize(); + return 0; + fn_fail: + goto fn_exit; +} + diff --git a/test/uti/mpi/002.c b/test/uti/mpi/002.c new file mode 100755 index 00000000..5a85014c --- /dev/null +++ b/test/uti/mpi/002.c @@ -0,0 +1,127 @@ +#include +#include +#include +#include 
+#include +#include +#include +#include +#include +#include +#include +#include /* For SYS_xxx definitions */ +#include + +//#define DEBUG +#ifdef DEBUG +#define dprintf printf +#else +#define dprintf {} +#endif + +#define SZENTRY_DEFAULT (65536) /* Size of one slot */ +#define NENTRY_DEFAULT 10000 /* Number of slots */ + +#define DIFFNSEC(end, start) ((end.tv_sec - start.tv_sec) * 1000000000UL + (end.tv_nsec - start.tv_nsec)) + +static int print_cpu_last_executed_on() { + char fn[256]; + char* result; + pid_t tid = syscall(SYS_gettid); + int fd; + int offset; + int mpi_errno = 0; + + sprintf(fn, "/proc/%d/task/%d/stat", getpid(), (int)tid); + //printf("fn=%s\n", fn); + fd = open(fn, O_RDONLY); + if(fd == -1) { + printf("open() failed\n"); + goto fn_fail; + } + + result = malloc(65536); + if(result == NULL) { + printf("malloc() failed"); + goto fn_fail; + } + + int amount = 0; + offset = 0; + while(1) { + amount = read(fd, result + offset, 65536); + // printf("amount=%d\n", amount); + if(amount == -1) { + printf("read() failed"); + goto fn_fail; + } + if(amount == 0) { + goto eof; + } + offset += amount; + } + eof:; + //printf("result:%s\n", result); + + char* next_delim = result; + char* field; + int i; + for(i = 0; i < 39; i++) { + field = strsep(&next_delim, " "); + } + + int cpu = sched_getcpu(); + if(cpu == -1) { + printf("getpu() failed\n"); + goto fn_fail; + } + + printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,tid=%d\n", atoi(getenv("PMI_RANK")), atoi(field), cpu, tid); fflush(stdout); + fn_exit: + free(result); + return mpi_errno; + fn_fail: + mpi_errno = -1; + goto fn_exit; +} + +int main(int argc, char **argv) { + int my_rank = -1, size = -1; + int i, j; + struct timespec start, end; + + int actual; + + printf("nloop=%d\n", atoi(argv[1])); + + MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual); + printf("Thread support level is %d\n", actual); + + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, 
&size);
+
+	print_cpu_last_executed_on();
+
+	printf("Before 1st barrier\n"); fflush(stdout);
+	MPI_Barrier(MPI_COMM_WORLD);
+
+	printf("Before 2nd barrier\n"); fflush(stdout);
+	if(my_rank == 0) {
+		clock_gettime(CLOCK_REALTIME, &start);
+	}
+	for (i = 0; i < atoi(argv[1]); i++) {
+		MPI_Barrier(MPI_COMM_WORLD);
+	}
+	if(my_rank == 0) {
+		clock_gettime(CLOCK_REALTIME, &end);
+		printf("%4.4f sec\n", DIFFNSEC(end, start) / (double)1000000000); fflush(stdout);
+	}
+
+
+ fn_exit:
+	//MPI_Finalize();
+	usleep(100000);
+	return 0;
+ fn_fail:
+	goto fn_exit;
+}
diff --git a/test/uti/mpi/003.c b/test/uti/mpi/003.c
new file mode 100755
index 00000000..fa696ee2
--- /dev/null
+++ b/test/uti/mpi/003.c
@@ -0,0 +1,211 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include /* For SYS_xxx definitions */
+#include
+
+//#define DEBUG
+#ifdef DEBUG
+#define dprintf printf
+#else
+#define dprintf {}
+#endif
+
+#define SZENTRY_DEFAULT (65536) /* Size of one slot */
+#define NENTRY_DEFAULT 10000 /* Number of slots */
+
+#define DIFFNSEC(end, start) ((end.tv_sec - start.tv_sec) * 1000000000UL + (end.tv_nsec - start.tv_nsec))
+
+/*
+ * Print the CPU this thread last ran on, as reported both by field 39 of
+ * /proc/<pid>/task/<tid>/stat and by sched_getcpu().
+ * Returns 0 on success and -1 when any step fails.
+ */
+static int print_cpu_last_executed_on() {
+	enum { STAT_BUFSZ = 65536, STAT_CPU_FIELD = 39 };
+	char fn[256];
+	char *result = NULL; /* stays NULL on early failure so free() is a no-op */
+	char *next_delim;
+	char *field = NULL;
+	const char *pmi_rank;
+	pid_t tid = syscall(SYS_gettid);
+	int fd;
+	int offset = 0;
+	int cpu;
+	int i;
+	int mpi_errno = 0;
+
+	snprintf(fn, sizeof(fn), "/proc/%d/task/%d/stat", getpid(), (int)tid);
+	fd = open(fn, O_RDONLY);
+	if (fd == -1) {
+		printf("open() failed\n");
+		goto fn_fail;
+	}
+
+	result = malloc(STAT_BUFSZ);
+	if (result == NULL) {
+		printf("malloc() failed");
+		goto fn_fail;
+	}
+
+	/* Read until EOF, never past the buffer; one byte is kept for the NUL. */
+	while (offset < STAT_BUFSZ - 1) {
+		ssize_t amount = read(fd, result + offset, STAT_BUFSZ - 1 - offset);
+		if (amount == -1) {
+			printf("read() failed");
+			goto fn_fail;
+		}
+		if (amount == 0) {
+			break;
+		}
+		offset += amount;
+	}
+	result[offset] = '\0'; /* strsep() needs a terminated string */
+
+	/* Walk to the STAT_CPU_FIELD-th space-separated field. */
+	next_delim = result;
+	for (i = 0; i < STAT_CPU_FIELD; i++) {
+		field = strsep(&next_delim, " ");
+	}
+	if (field == NULL) {
+		printf("stat has fewer than %d fields\n", STAT_CPU_FIELD);
+		goto fn_fail;
+	}
+
+	cpu = sched_getcpu();
+	if (cpu == -1) {
+		printf("sched_getcpu() failed\n");
+		goto fn_fail;
+	}
+
+	/* getenv() returns NULL when PMI_RANK is unset; print -1 in that case. */
+	pmi_rank = getenv("PMI_RANK");
+	printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,tid=%d\n", pmi_rank ? atoi(pmi_rank) : -1, atoi(field), cpu, tid); fflush(stdout);
+ fn_exit:
+	if (fd != -1) {
+		close(fd);
+	}
+	free(result);
+	return mpi_errno;
+ fn_fail:
+	mpi_errno = -1;
+	goto fn_exit;
+}
+
+/*
+ * Post nentry non-blocking transfers of szentry bytes each and wait for all
+ * of them. Rank 1 sends sendv[0] to dest; every other rank receives into
+ * recvv[0] from src and sleeps for usec microseconds before waiting.
+ */
+void sendrecv(int rank, int nentry, char **sendv, char **recvv, int szentry, int src, int dest, MPI_Request* reqs, MPI_Status* status, double usec) {
+	int i;
+	/* Progress-mark interval; 0 (nentry < 10) disables the marks instead of dividing by zero. */
+	int step = nentry / 10;
+
+	if (rank == 1) {
+		for (i = 0; i < nentry; i++) {
+			if (step > 0 && i % step == 0) {
+				printf("s"); fflush(stdout);
+			}
+			MPI_Isend(sendv[0], szentry, MPI_CHAR, dest, 0, MPI_COMM_WORLD, &reqs[i]);
+		}
+		printf("\n"); fflush(stdout);
+		MPI_Waitall(nentry, reqs, status);
+	} else {
+		for (i = 0; i < nentry; i++) {
+			if (step > 0 && i % step == 0) {
+				printf("r"); fflush(stdout);
+			}
+			MPI_Irecv(recvv[0], szentry, MPI_CHAR, src, 0, MPI_COMM_WORLD, &reqs[i]);
+		}
+		usleep(usec);
+		MPI_Waitall(nentry, reqs, status);
+	}
+}
+
+int main(int argc, char **argv) {
+	int my_rank = -1, size = -1;
+	int i, j;
+	char **sendv, **recvv;
+	MPI_Status* status;
+	MPI_Request* reqs;
+	long szentry;
+	long nentry;
+	int src, dest;
+	struct timespec start, end;
+	double diffusec;
+
+	if(argc == 3) {
+		szentry = atoi(argv[1]);
+		nentry = atoi(argv[2]);
+	} else {
+		szentry = SZENTRY_DEFAULT;
+		nentry = NENTRY_DEFAULT;
+	}
+
+	status = (MPI_Status*)malloc(sizeof(MPI_Status) * nentry);
+	reqs = (MPI_Request*)malloc(sizeof(MPI_Request) * nentry);
+
+	int actual;
+
+	MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual);
+	printf("Thread support level is %d\n", actual);
+
+	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	src = (size + my_rank - 1) % size;
+	dest = (my_rank + 1) % size;
+
+	printf("rank=%d, size=%d, src=%d, dest=%d\n", my_rank, size, src, dest);
+
+	sendv = malloc(sizeof(char *) * nentry);
+
if(!sendv) { printf("malloc failed"); goto fn_fail; } + for (i = 0; i < 1; i++) { + sendv[i] = (char*)mmap(0, szentry, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if(sendv[i] == MAP_FAILED) { printf("mmap failed"); goto fn_fail; } + dprintf("[%d] sendv[%d]=%p\n", my_rank, i, sendv[i]); + memset(sendv[i], 0xaa, szentry); + } + + recvv = malloc(sizeof(char *) * nentry); + if(!recvv) { printf("malloc failed"); goto fn_fail; } + for (i = 0; i < 1; i++) { + recvv[i] = (char*)mmap(0, szentry, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + if(recvv[i] == MAP_FAILED) { printf("mmap failed"); goto fn_fail; } + dprintf("[%d] recvv[%d]=%p\n", my_rank, i, recvv[i]); + memset(recvv[i], 0, szentry); + } + + printf("after memset\n"); + + print_cpu_last_executed_on(); + + printf("Before 1st barrier\n"); fflush(stdout); + MPI_Barrier(MPI_COMM_WORLD); + if(my_rank == 0) { + clock_gettime(CLOCK_REALTIME, &start); + } + sendrecv(my_rank, nentry, sendv, recvv, szentry, src, dest, reqs, status, 0); + printf("Before 2nd barrier\n"); fflush(stdout); + MPI_Barrier(MPI_COMM_WORLD); + if(my_rank == 0) { + clock_gettime(CLOCK_REALTIME, &end); + diffusec = DIFFNSEC(end, start) / (double)1000; + printf("%4.4f sec\n", DIFFNSEC(end, start) / (double)1000000000); fflush(stdout); + } + + fn_exit: + MPI_Finalize(); + return 0; + fn_fail: + goto fn_exit; +} diff --git a/test/uti/mpi/004.c b/test/uti/mpi/004.c new file mode 100755 index 00000000..bf92ca19 --- /dev/null +++ b/test/uti/mpi/004.c @@ -0,0 +1,281 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For SYS_xxx definitions */ +#include + +//#define DEBUG +#ifdef DEBUG +#define dprintf printf +#else +#define dprintf {} +#endif + +#define DIFFNSEC(end, start) ((end.tv_sec - start.tv_sec) * 1000000000UL + (end.tv_nsec - start.tv_nsec)) + +static inline void fixed_size_work() { + asm volatile( + "movq $0, %%rcx\n\t" + "1:\t" + "addq 
$1, %%rcx\n\t" + "cmpq $99, %%rcx\n\t" + "jle 1b\n\t" + : + : + : "rcx", "cc"); +} + +static inline void bulk_fsw(unsigned long n) { + int j; + for (j = 0; j < (n); j++) { + fixed_size_work(); + } +} + +double nspw; /* nsec per work */ +unsigned long nsec; + +void fwq_init() { + struct timespec start, end; + int i; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); +#define N_INIT 10000000 + bulk_fsw(N_INIT); + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + nsec = DIFFNSEC(end, start); + nspw = nsec / (double)N_INIT; +} + +#if 1 +void fwq(long delay_nsec) { + if (delay_nsec < 0) { + return; + //printf("%s: delay_nsec < 0\n", __FUNCTION__); + } + bulk_fsw(delay_nsec / nspw); +} +#else /* For machines with large core-to-core performance variation (e.g. OFP) */ +void fwq(long delay_nsec) { + struct timespec start, end; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + + while (1) { + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + if (DIFFNSEC(end, start) >= delay_nsec) { + break; + } + bulk_fsw(2); /* ~150 ns per iteration on FOP */ + } +} +#endif + + +static int print_cpu_last_executed_on() { + char fn[256]; + char* result; + pid_t tid = syscall(SYS_gettid); + int fd; + int offset; + int mpi_errno = 0; + + sprintf(fn, "/proc/%d/task/%d/stat", getpid(), (int)tid); + //printf("fn=%s\n", fn); + fd = open(fn, O_RDONLY); + if(fd == -1) { + printf("open() failed\n"); + goto fn_fail; + } + + result = malloc(65536); + if(result == NULL) { + printf("malloc() failed"); + goto fn_fail; + } + + int amount = 0; + offset = 0; + while(1) { + amount = read(fd, result + offset, 65536); + // printf("amount=%d\n", amount); + if(amount == -1) { + printf("read() failed"); + goto fn_fail; + } + if(amount == 0) { + goto eof; + } + offset += amount; + } + eof:; + //printf("result:%s\n", result); + + char* next_delim = result; + char* field; + int i; + for(i = 0; i < 39; i++) { + field = strsep(&next_delim, " "); + } + + int cpu = sched_getcpu(); + if(cpu == -1) { + printf("getpu() 
failed\n"); + goto fn_fail; + } + + printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,pid=%d,tid=%d\n", atoi(getenv("PMI_RANK")), atoi(field), cpu, getpid(), tid); fflush(stdout); + fn_exit: + free(result); + return mpi_errno; + fn_fail: + mpi_errno = -1; + goto fn_exit; +} + +static inline int on_same_node(int ppn, int me, int you) { + return (me / ppn == you / ppn); +} + +/* isend-calc-wait */ +void my_send(int nproc, int ppn, int rank, double *sbuf, double *rbuf, int ndoubles, MPI_Request* reqs, long calc_nsec) { + int i; + int r = 0, s = 0; + int req = 0; + for (i = 0; i < nproc; i++) { + if (!on_same_node(ppn, rank, i)) { + MPI_Irecv(rbuf + r * ndoubles, ndoubles, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, &reqs[req]); + r++; + req++; + MPI_Isend(sbuf + s * ndoubles, ndoubles, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, &reqs[req]); + s++; + req++; + } + } + fwq(calc_nsec); + MPI_Waitall(req, reqs, MPI_STATUSES_IGNORE); +} + +static struct option options[] = { + { + .name = "ppn", + .has_arg = required_argument, + .flag = NULL, + .val = 'P', + }, + /* end */ + { NULL, 0, NULL, 0, }, +}; + +int main(int argc, char **argv) { + int actual; + int ppn = -1; + int nproc; + int ndoubles = -1; + int my_rank = -1, size = -1; + int i, j; + double *sbuf, *rbuf; + MPI_Request* reqs; + struct timespec start, end; + long t_pure_l, t_overall_l; + long t_pure, t_overall; + int opt; + + fwq_init(); + + while ((opt = getopt_long(argc, argv, "+d:P:", options, NULL)) != -1) { + switch (opt) { + case 'd': + ndoubles = (1ULL << atoi(optarg)); + break; + case 'P': + ppn = atoi(optarg); + break; + default: /* '?' 
*/ + printf("unknown option %c\n", optopt); + exit(1); + } + } + + if (ndoubles == -1 || ppn == -1) { + printf("specify ndoubles with -d and ppn with --ppn"); + exit(1); + } + + MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual); + if (actual != 3) { + printf("ERROR: Thread support level is %d (it should be 3)\n", actual); + exit(1); + } + + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &nproc); + + if (my_rank == 0) { + printf("tid=%d,pid=%d,ndoubles=%d,nproc=%d\n", syscall(__NR_gettid), getpid(), ndoubles, nproc); + printf("nsec=%ld, nspw=%f\n", nsec, nspw); + } + + reqs = (MPI_Request*)malloc(sizeof(MPI_Request) * nproc * 2); + if(!reqs) { printf("malloc failed"); goto fn_fail; } + + sbuf = malloc(sizeof(double) * ndoubles * nproc); + if(!sbuf) { printf("malloc failed"); goto fn_fail; } + memset(sbuf, 0, sizeof(double) * ndoubles); + printf("tid=%d,pid=%d,sbuf=%p\n", syscall(__NR_gettid), getpid(), sbuf); + + rbuf = malloc(sizeof(double) * ndoubles * nproc); + if(!rbuf) { printf("malloc failed"); goto fn_fail; } + memset(rbuf, 0, sizeof(double) * ndoubles); + printf("tid=%d,pid=%d,rbuf=%p\n", syscall(__NR_gettid), getpid(), rbuf); + + print_cpu_last_executed_on(); + + /* Measure isend-wait time */ + MPI_Barrier(MPI_COMM_WORLD); +#define NSKIP 5 +#define NPURE 30 + for (i = 0; i < NPURE + NSKIP; i++) { + if (i == NSKIP) { + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + } + my_send(nproc, ppn, my_rank, sbuf, rbuf, ndoubles, reqs, 0); + } + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + t_pure_l = DIFFNSEC(end, start) / NPURE; + //printf("t_pure (local): %ld usec\n", t_pure_l / 1000UL); + MPI_Allreduce(&t_pure_l, &t_pure, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD); + if (my_rank == 0) printf("t_pure (max): %ld usec\n", t_pure / 1000UL); + + /* Measure isend-calc-wait time */ + MPI_Barrier(MPI_COMM_WORLD); +#define NOVERALL 30 + for (i = 0; i < NOVERALL + NSKIP; i++) { + if (i == NSKIP) { + clock_gettime(CLOCK_THREAD_CPUTIME_ID, 
&start); + } + my_send(nproc, ppn, my_rank, sbuf, rbuf, ndoubles, reqs, t_pure); + } + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + t_overall_l = DIFFNSEC(end, start) / NOVERALL; + //printf("t_overall (local): %ld usec\n", t_overall_l / 1000UL); + MPI_Allreduce(&t_overall_l, &t_overall, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD); + if (my_rank == 0) printf("t_overall (max): %ld usec\n", t_overall / 1000UL); + if (my_rank == 0) { + long t_abs = (t_pure * 2) - t_overall; + printf("overlap: %.2f %%\n", (t_abs * 100) / (double)t_pure); + } + + fn_exit: + MPI_Finalize(); + return 0; + fn_fail: + goto fn_exit; +} diff --git a/test/uti/mpi/005.c b/test/uti/mpi/005.c new file mode 100755 index 00000000..0803ebb8 --- /dev/null +++ b/test/uti/mpi/005.c @@ -0,0 +1,338 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For SYS_xxx definitions */ +#include + +//#define DEBUG +#ifdef DEBUG +#define dprintf printf +#else +#define dprintf {} +#endif + +#define DIFFNSEC(end, start) ((end.tv_sec - start.tv_sec) * 1000000000UL + (end.tv_nsec - start.tv_nsec)) + +#if 1 +#define BEGIN_EPOCH(win) do { MPI_Win_fence(0, win); } while(0) +#define END_EPOCH(win) do { MPI_Win_fence(0, win); } while(0) +#define BAR_EPOCH do { } while(0) +#else +#define BEGIN_EPOCH(win) do { MPI_Win_lock_all(0, win); } while(0) +#define END_EPOCH(win) do { MPI_Win_unlock_all(win); } while(0) +#define BAR_EPOCH do { MPI_Barrier(MPI_COMM_WORLD); } while(0) +#endif + + +static inline void fixed_size_work() { + asm volatile( + "movq $0, %%rcx\n\t" + "1:\t" + "addq $1, %%rcx\n\t" + "cmpq $99, %%rcx\n\t" + "jle 1b\n\t" + : + : + : "rcx", "cc"); +} + +static inline void bulk_fsw(unsigned long n) { + int j; + for (j = 0; j < (n); j++) { + fixed_size_work(); + } +} + +double nspw; /* nsec per work */ +unsigned long nsec; + +void fwq_init() { + struct timespec start, end; + int i; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); +#define 
N_INIT 10000000 + bulk_fsw(N_INIT); + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + nsec = DIFFNSEC(end, start); + nspw = nsec / (double)N_INIT; +} + +#if 1 +void fwq(long delay_nsec) { + if (delay_nsec < 0) { + return; + //printf("%s: delay_nsec < 0\n", __FUNCTION__); + } + bulk_fsw(delay_nsec / nspw); +} +#else /* For machines with large core-to-core performance variation (e.g. OFP) */ +void fwq(long delay_nsec) { + struct timespec start, end; + + if (delay_nsec < 0) { return; } + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + + while (1) { + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + if (DIFFNSEC(end, start) >= delay_nsec) { + break; + } + bulk_fsw(2); /* ~150 ns per iteration on FOP */ + } +} +#endif + + +static int print_cpu_last_executed_on() { + char fn[256]; + char* result; + pid_t tid = syscall(SYS_gettid); + int fd; + int offset; + int mpi_errno = 0; + + sprintf(fn, "/proc/%d/task/%d/stat", getpid(), (int)tid); + //printf("fn=%s\n", fn); + fd = open(fn, O_RDONLY); + if(fd == -1) { + printf("open() failed\n"); + goto fn_fail; + } + + result = malloc(65536); + if(result == NULL) { + printf("malloc() failed"); + goto fn_fail; + } + + int amount = 0; + offset = 0; + while(1) { + amount = read(fd, result + offset, 65536); + // printf("amount=%d\n", amount); + if(amount == -1) { + printf("read() failed"); + goto fn_fail; + } + if(amount == 0) { + goto eof; + } + offset += amount; + } + eof:; + //printf("result:%s\n", result); + + char* next_delim = result; + char* field; + int i; + for(i = 0; i < 39; i++) { + field = strsep(&next_delim, " "); + } + + int cpu = sched_getcpu(); + if(cpu == -1) { + printf("getpu() failed\n"); + goto fn_fail; + } + + printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,tid=%d\n", atoi(getenv("PMI_RANK")), atoi(field), cpu, tid); fflush(stdout); + fn_exit: + free(result); + return mpi_errno; + fn_fail: + mpi_errno = -1; + goto fn_exit; +} + +static inline int on_same_node(int ppn, int me, int you) { + 
return (me / ppn == you / ppn); +} + +/* fence-accumulate-calc-fence */ +void accumulate(int nproc, int ppn, int rank, double *wbuf, double *rbuf, int ndoubles, MPI_Win win, long calc_nsec) { + int i, j; + int r = 0, s = 0; + int req = 0; + BEGIN_EPOCH(win); + for (i = 0; i < nproc; i++) { + if (!on_same_node(ppn, rank, i)) { + for (j = 0; j < ndoubles; j++) { + //printf("i=%d,j=%d,rbuf=%f,wbuf=%f\n", i, j, rbuf[i * ndoubles + j], wbuf[i * ndoubles + j]); + MPI_Accumulate(rbuf + i * ndoubles + j, 1, MPI_DOUBLE, i, i * ndoubles + j, 1, MPI_DOUBLE, MPI_SUM, win); + } + } + } + fwq(calc_nsec); + END_EPOCH(win); +} + +static struct option options[] = { + { + .name = "ppn", + .has_arg = required_argument, + .flag = NULL, + .val = 'P', + }, + /* end */ + { NULL, 0, NULL, 0, }, +}; + +int main(int argc, char **argv) { + int rc; + int actual; + int ppn = -1; + int nproc; + int ndoubles = -1; + int my_rank = -1, size = -1; + int i, j; + double *wbuf, *rbuf; + MPI_Win win; + struct timespec start, end; + long t_fence_l, t_pure_l, t_overall_l; + long t_fence, t_pure, t_overall; + int opt; + + fwq_init(); + + while ((opt = getopt_long(argc, argv, "+d:P:", options, NULL)) != -1) { + switch (opt) { + case 'd': + ndoubles = (1ULL << atoi(optarg)); + break; + case 'P': + ppn = atoi(optarg); + break; + default: /* '?' 
*/ + printf("unknown option %c\n", optopt); + exit(1); + } + } + + if (ndoubles == -1 || ppn == -1) { + printf("specify ndoubles with -d and ppn with --ppn"); + exit(1); + } + + MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual); + if (actual != 3) { + printf("ERROR: MPI_THREAD_MULTIPLE not available (level was set to %d)\n", actual); + exit(1); + } + + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &nproc); + + if (my_rank == 0) { + printf("ndoubles=%d,nproc=%d\n", ndoubles, nproc); + printf("nsec=%ld, nspw=%f\n", nsec, nspw); + } + + /* write-to buffer */ + wbuf = malloc(sizeof(double) * ndoubles * nproc); + if(!wbuf) { printf("malloc failed"); goto fn_fail; } + memset(wbuf, 0, sizeof(double) * ndoubles * nproc); + + /* read-from buffer */ + rbuf = malloc(sizeof(double) * ndoubles * nproc); + if(!rbuf) { printf("malloc failed"); goto fn_fail; } + memset(rbuf, 0, sizeof(double) * ndoubles * nproc); + + if (rc = MPI_Win_create(wbuf, sizeof(double) * ndoubles * nproc, sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &win)) { + printf("MPI_Win_create failed,rc=%d\n", rc); + } + + print_cpu_last_executed_on(); + + for (i = 0; i < nproc; i++) { + for (j = 0; j < ndoubles; j++) { + wbuf[i * ndoubles + j] = i + 1 + j; + rbuf[i * ndoubles + j] = (i + 1) * 2 + j; + } + } + +#if 0 + for (i = 0; i < nproc; i++) { + for (j = 0; j < ndoubles; j++) { + printf("wbuf,proc=%d,j=%d,val=%f\n", i, j, wbuf[i * ndoubles + j]); + printf("rbuf,proc=%d,j=%d,val=%f\n", i, j, rbuf[i * ndoubles + j]); + } + } +#endif + /* Measure fence-fence time */ + MPI_Barrier(MPI_COMM_WORLD); +#define NSKIP 5 +#define NFENCE 30 + for (i = 0; i < NFENCE + NSKIP; i++) { + if (i == NSKIP) { + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + } + BEGIN_EPOCH(win); + END_EPOCH(win); + } + BAR_EPOCH; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + t_fence_l = DIFFNSEC(end, start) / NFENCE; + //printf("t_fence (local): %ld usec\n", t_fence_l / 1000UL); + 
MPI_Allreduce(&t_fence_l, &t_fence, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD); + if (my_rank == 0) printf("t_fence (max): %ld usec\n", t_fence / 1000UL); + + /* Measure fence-acc-fence time */ + MPI_Barrier(MPI_COMM_WORLD); +#define NPURE 30 + for (i = 0; i < NPURE + NSKIP; i++) { + if (i == NSKIP) { + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); +} + accumulate(nproc, ppn, my_rank, wbuf, rbuf, ndoubles, win, 0); + } + BAR_EPOCH; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + t_pure_l = DIFFNSEC(end, start) / NPURE; + //printf("t_pure (local): %ld usec\n", t_pure_l / 1000UL); + MPI_Allreduce(&t_pure_l, &t_pure, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD); + if (my_rank == 0) printf("t_pure (max): %ld usec\n", t_pure / 1000UL); + +#if 0 + for (i = 0; i < nproc; i++) { + for (j = 0; j < ndoubles; j++) { + printf("wbuf,proc=%d,j=%d,val=%f\n", i, j, wbuf[i * ndoubles + j]); + printf("rbuf,proc=%d,j=%d,val=%f\n", i, j, rbuf[i * ndoubles + j]); + } + } +#endif + + /* Measure fenc-acc-calc-fence time */ + MPI_Barrier(MPI_COMM_WORLD); +#define NOVERALL 30 + for (i = 0; i < NOVERALL + NSKIP; i++) { + if (i == NSKIP) { + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); +} + accumulate(nproc, ppn, my_rank, wbuf, rbuf, ndoubles, win, t_pure - t_fence); + } + BAR_EPOCH; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + t_overall_l = DIFFNSEC(end, start) / NOVERALL; + //printf("t_overall (local): %ld usec\n", t_overall_l / 1000UL); + MPI_Allreduce(&t_overall_l, &t_overall, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD); + if (my_rank == 0) printf("t_overall (max): %ld usec\n", t_overall / 1000UL); + if (my_rank == 0) { + long t_abs = (t_pure * 2) - t_overall; + printf("overlap: %.2f %%\n", (t_abs * 100) / (double)t_pure); +} + + fn_exit: + MPI_Finalize(); + return 0; + fn_fail: + goto fn_exit; +} diff --git a/test/uti/mpi/005.sh b/test/uti/mpi/005.sh new file mode 100755 index 00000000..416e9ecc --- /dev/null +++ b/test/uti/mpi/005.sh @@ -0,0 +1,127 @@ +#!/usr/bin/bash + +#!/usr/bin/bash -x + 
+MYHOME=$HOME +UTI_MPI_TOP=${MYHOME}/project/os/mckernel/test/uti/mpi + +MCK=${MYHOME}/project/os/install +unset DISABLE_UTI + +cmdline="./005" + +stop=0 +reboot=0 +go=0 + +mck=0 +nloops=1 +ppn=1 + +while getopts srgac:n:mdl:P:o: OPT +do + case ${OPT} in + s) stop=1 + ;; + r) reboot=1 + ;; + g) go=1 + ;; + a) async=1 + ;; + c) cmdline=$OPTARG + ;; + n) ndoubles=$OPTARG + ;; + m) mck=1 + ;; + d) export DISABLE_UTI=1 + ;; + P) ppn=$OPTARG + ;; + o) omp_num_threads=$OPTARG + ;; + *) echo "invalid option -${OPT}" >&2 + exit 1 + esac +done + +if [ ${mck} -eq 1 ]; then + mcexec="${mck_dir}/bin/mcexec" + mcexecopt="--enable-uti --uti-thread-rank=$uti_thread_rank" + if [ ${use_hfi} -eq 1 ]; then + mcexecopt="--enable-hfi1 $mcexecopt" + fi + mcexecopt="-n $ppn -t $((256 / ppn + 4)) -m 1 $mcexecopt" +else + mcexec= + mcexecopt= +fi + +if [ ${mck} -eq 1 ]; then + i_mpi_pin=off +else + i_mpi_pin=on +fi + +if [ "$i_mpi_pin" == on ] ; then + i_mpi_pin_domain="export I_MPI_PIN_DOMAIN=$((omp_num_threads + 1)):scatter" +else + i_mpi_pin_domain= +fi + +if [ $async -eq 0 ] || [ "$async_progress_pin" == "" ] ; then + i_mpi_async_progress_pin= +else + i_mpi_async_progress_pin="export I_MPI_ASYNC_PROGRESS_PIN=$async_progress_pin" +fi + +if [ ${stop} -eq 1 ]; then + if [ ${mck} -eq 1 ]; then + sudo ${MCK}/sbin/mcstop+release.sh + else + : + fi +fi + +if [ ${reboot} -eq 1 ]; then + if [ ${mck} -eq 1 ]; then + if hostname | grep ofp &>/dev/null; then + sudo ${MCK}/sbin/mcreboot.sh -s -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 
2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1 + else + sudo ${MCK}/sbin/mcreboot.sh -s -c 1-15,65-79,129-143,193-207,17-31,81-95,145-159,209-223,33-47,97-111,161-175,225-239,49-63,113-127,177-191,241-255 -r 1-15:0+65-79:64+129-143:128+193-207:192+17-31:16+81-95:80+145-159:144+209-223:208+33-47:32+97-111:96+161-175:160+225-239:224+49-63:48+113-127:112+177-191:176+241-255:240 -m 12G@0,12G@1,12G@2,12G@3,3920M@4,3920M@5,3920M@6,3920M@7 + fi + else + : + fi +fi + +cd ${UTI_MPI_TOP} +( +cat < ./job.sh + +if [ ${go} -eq 1 ]; then + cd ${UTI_MPI_TOP} + make CC=gcc 008 + ./job.sh +fi + + + diff --git a/test/uti/mpi/006.c b/test/uti/mpi/006.c new file mode 100755 index 00000000..d7aa6e61 --- /dev/null +++ b/test/uti/mpi/006.c @@ -0,0 +1,625 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For SYS_xxx definitions */ +#include +#include + +#include /* required for core PSM2 functions */ +#include /* required for PSM2 MQ functions (send, recv, etc) */ + +//#define DEBUG +#ifdef DEBUG +#define dprintf printf +#else +#define dprintf {} +#endif + +#define BUFFER_LENGTH 8000000 +#define CONNECT_ARRAY_SIZE 8 +void die(char *msg, int rc) { + fprintf(stderr, "%s: %d\n", msg, rc); +} + +#define DIFFNSEC(end, start) ((end.tv_sec - start.tv_sec) * 1000000000UL + (end.tv_nsec - start.tv_nsec)) + +static inline void fixed_size_work() { + asm volatile( + "movq $0, %%rcx\n\t" + "1:\t" + "addq $1, %%rcx\n\t" + "cmpq $99, %%rcx\n\t" 
+ "jle 1b\n\t" + : + : + : "rcx", "cc"); +} + +static inline void bulk_fsw(unsigned long n) { + int j; + for (j = 0; j < (n); j++) { + fixed_size_work(); + } +} + +double nspw; /* nsec per work */ +unsigned long nsec; + +void fwq_init() { + struct timespec start, end; + int i; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); +#define N_INIT 10000000 + bulk_fsw(N_INIT); + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + nsec = DIFFNSEC(end, start); + nspw = nsec / (double)N_INIT; +} + +#if 1 +void fwq(long delay_nsec) { + if (delay_nsec < 0) { + return; + //printf("%s: delay_nsec < 0\n", __FUNCTION__); + } + bulk_fsw(delay_nsec / nspw); +} +#else /* For machines with large core-to-core performance variation (e.g. OFP) */ +void fwq(long delay_nsec) { + struct timespec start, end; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + + while (1) { + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + if (DIFFNSEC(end, start) >= delay_nsec) { + break; + } + bulk_fsw(2); /* ~150 ns per iteration on FOP */ + } +} +#endif + + +static int print_cpu_last_executed_on() { + char fn[256]; + char* result; + pid_t tid = syscall(SYS_gettid); + int fd; + int offset; + int mpi_errno = 0; + + sprintf(fn, "/proc/%d/task/%d/stat", getpid(), (int)tid); + //printf("fn=%s\n", fn); + fd = open(fn, O_RDONLY); + if(fd == -1) { + printf("open() failed\n"); + goto fn_fail; + } + + result = malloc(65536); + if(result == NULL) { + printf("malloc() failed"); + goto fn_fail; + } + + int amount = 0; + offset = 0; + while(1) { + amount = read(fd, result + offset, 65536); + // printf("amount=%d\n", amount); + if(amount == -1) { + printf("read() failed"); + goto fn_fail; + } + if(amount == 0) { + goto eof; + } + offset += amount; + } + eof:; + //printf("result:%s\n", result); + + char* next_delim = result; + char* field; + int i; + for(i = 0; i < 39; i++) { + field = strsep(&next_delim, " "); + } + + int cpu = sched_getcpu(); + if(cpu == -1) { + printf("getpu() failed\n"); + goto fn_fail; + } + + 
printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,pid=%d,tid=%d\n", atoi(getenv("PMI_RANK")), atoi(field), cpu, getpid(), tid); fflush(stdout); + fn_exit: + free(result); + return mpi_errno; + fn_fail: + mpi_errno = -1; + goto fn_exit; +} + +static inline int on_same_node(int ppn, int me, int you) { + return (me / ppn == you / ppn); +} + +/* isend-calc-wait */ +void my_send(int nproc, int ppn, int rank, double *sbuf, double *rbuf, int ndoubles, MPI_Request* reqs, long calc_nsec) { + int i; + int r = 0, s = 0; + int req = 0; + for (i = 0; i < nproc; i++) { + if (!on_same_node(ppn, rank, i)) { + MPI_Irecv(rbuf + r * ndoubles, ndoubles, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, &reqs[req]); + r++; + req++; + MPI_Isend(sbuf + s * ndoubles, ndoubles, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, &reqs[req]); + s++; + req++; + } + } + fwq(calc_nsec); + MPI_Waitall(req, reqs, MPI_STATUSES_IGNORE); +} + + +/* Helper functions to find the server's PSM2 endpoint identifier (epid). */ +psm2_epid_t find_server(int rank) { + FILE *fp = NULL; + psm2_epid_t server_epid = 0; + char fn[256]; + sprintf(fn, "psm2-demo-server-epid-%d", rank); + printf("PSM2 client waiting for epid mapping file to appear...\n"); + while (!fp) { + sleep(1); + fp = fopen(fn, "r"); + } + fscanf(fp, "%lx", &server_epid); + fclose(fp); + printf("PSM2 client found server epid = 0x%lx\n", server_epid); + return server_epid; +} + +void write_epid_to_file(int rank, psm2_epid_t myepid) { + FILE *fp; + char fn[256]; + sprintf(fn, "psm2-demo-server-epid-%d", rank); + fp = fopen(fn, "w"); + if (!fp) { + fprintf(stderr, + "Exiting, couldn't write server's epid mapping file: "); + die(strerror(errno), errno); + } + fprintf(fp, "0x%lx", myepid); + fclose(fp); + printf("PSM2 server wrote epid = 0x%lx to file.\n", myepid); + return; +} + +int psm2_sendrecv(int rank, int sender, int receiver) { + struct psm2_ep_open_opts o; + psm2_uuid_t uuid; /* 16 byte */ + psm2_ep_t myep; + psm2_epid_t myepid; + psm2_epid_t 
server_epid; + psm2_epid_t epid_array[CONNECT_ARRAY_SIZE]; + int epid_array_mask[CONNECT_ARRAY_SIZE]; + psm2_error_t epid_connect_errors[CONNECT_ARRAY_SIZE]; + psm2_epaddr_t epaddr_array[CONNECT_ARRAY_SIZE]; + int rc; + int ver_major = PSM2_VERNO_MAJOR; + int ver_minor = PSM2_VERNO_MINOR; + char msgbuf[BUFFER_LENGTH]; + psm2_mq_t q; + psm2_mq_req_t req_mq; + memset(uuid, 0, sizeof(psm2_uuid_t)); /* Use a UUID of zero */ + *((int *)&uuid) = rand(); +/* Try to initialize PSM2 with the requested library version. + * * In this example, given the use of the PSM2_VERNO_MAJOR and MINOR + * * as defined in the PSM2 headers, ensure that we are linking with + * * the same version of PSM2 as we compiled against. */ + + if ((rc = psm2_init(&ver_major, &ver_minor)) != PSM2_OK) { + die("couldn't init", rc); + return -1; + } + printf("PSM2 init done.\n"); + /* Setup the endpoint options struct */ + if ((rc = psm2_ep_open_opts_get_defaults(&o)) != PSM2_OK) { + die("couldn't set default opts", rc); + return -1; + } + printf("PSM2 opts_get_defaults done.\n"); + /* Attempt to open a PSM2 endpoint. This allocates hardware resources. */ + if ((rc = psm2_ep_open(uuid, &o, &myep, &myepid)) != PSM2_OK) { + die("couldn't psm2_ep_open()", rc); + return -1; + } + printf("PSM2 endpoint open done.\n"); + int is_server = (rank == receiver) ? 1 : 0; + if (is_server) { + write_epid_to_file(rank, myepid); + } else { + server_epid = find_server(receiver); + } + if (is_server) { + /* Server does nothing here. A connection does not have to be + * * established to receive messages. */ + printf("PSM2 server up.\n"); + } else { + /* Setup connection request info */ + /* PSM2 can connect to a single epid per request, + * * or an arbitrary number of epids in a single connect call. + * * For this example, use part of an array of + * * connection requests. 
*/ + memset(epid_array_mask, 0, sizeof(int) * CONNECT_ARRAY_SIZE); + epid_array[0] = server_epid; + epid_array_mask[0] = 1; + /* Begin the connection process. + * * note that if a requested epid is not responding, + * * the connect call will still return OK. + * * The errors array will contain the state of individual + * * connection requests. */ + if ((rc = psm2_ep_connect(myep, + CONNECT_ARRAY_SIZE, + epid_array, + epid_array_mask, + epid_connect_errors, + epaddr_array, + 0 /* no timeout */ + )) != PSM2_OK) { + die("couldn't ep_connect", rc); + return -1; + } + printf("PSM2 connect request processed.\n"); + /* Now check if our connection to the server is ready */ + if (epid_connect_errors[0] != PSM2_OK) { + die("couldn't connect to server", + epid_connect_errors[0]); + return -1; + } + printf("PSM2 client-server connection established.\n"); + } + /* Setup our PSM2 message queue */ + if ((rc = psm2_mq_init(myep, PSM2_MQ_ORDERMASK_NONE, NULL, 0, &q)) + != PSM2_OK) { + die("couldn't initialize PSM2 MQ", rc); + return -1; + } + printf("PSM2 MQ init done.\n"); + if (is_server) { + psm2_mq_tag_t t = {0xABCD}; + psm2_mq_tag_t tm = {-1}; + /* Post the receive request */ + if ((rc = psm2_mq_irecv2(q, PSM2_MQ_ANY_ADDR, + &t, /* message tag */ + &tm, /* message tag mask */ + 0, /* no flags */ + msgbuf, BUFFER_LENGTH, + NULL, /* no context to add */ + &req_mq /* track irecv status */ + )) != PSM2_OK) { + die("couldn't post psm2_mq_irecv()", rc); + return -1; + } + printf("PSM2 MQ irecv() posted\n"); + /* Wait until the message arrives */ + if ((rc = psm2_mq_wait(&req_mq, NULL)) != PSM2_OK) { + die("couldn't wait for the irecv", rc); + return -1; + } + printf("PSM2 MQ wait() done.\n"); + printf("Message from client:\n"); + printf("%s", msgbuf); + unlink("psm2-demo-server-epid"); + } else { + /* Say hello */ + snprintf(msgbuf, BUFFER_LENGTH, + "Hello world from epid=0x%lx, pid=%d.\n", + myepid, getpid()); + psm2_mq_tag_t t = {0xABCD}; + if ((rc = psm2_mq_send2(q, + 
epaddr_array[0], /* destination epaddr */ + PSM2_MQ_FLAG_SENDSYNC, /* no flags */ + &t, /* tag */ + msgbuf, BUFFER_LENGTH + )) != PSM2_OK) { + die("couldn't post psm2_mq_isend", rc); + return -1; + } + printf("PSM2 MQ send() done.\n"); + } +/* Close down the MQ */ + if ((rc = psm2_mq_finalize(q)) != PSM2_OK) { + die("couldn't psm2_mq_finalize()", rc); + return -1; + } + printf("PSM2 MQ finalized.\n"); +/* Close our ep, releasing all hardware resources. + * * Try to close all connections properly */ + if ((rc = psm2_ep_close(myep, PSM2_EP_CLOSE_GRACEFUL, + 0 /* no timeout */)) != PSM2_OK) { + die("couldn't psm2_ep_close()", rc); + return -1; + } + printf("PSM2 ep closed.\n"); + /* Release all local PSM2 resources */ + if ((rc = psm2_finalize()) != PSM2_OK) { + die("couldn't psm2_finalize()", rc); + return -1; + } + printf("PSM2 shut down, exiting.\n"); + return 0; +} + +static struct option options[] = { + { + .name = "ppn", + .has_arg = required_argument, + .flag = NULL, + .val = 'P', + }, + /* end */ + { NULL, 0, NULL, 0, }, +}; + +struct thr_arg { + volatile int bar_count; /* Barrier before entering loop */ + pthread_mutex_t bar_lock; + pthread_cond_t bar_cond; + pthread_t pthread; + int rank; + int ppn; + int nproc; +}; + +struct thr_arg thr_arg; + +void *progress_fn(void *arg) { + struct thr_arg *thr_arg = (struct thr_arg *)arg; + int rc; + int i; + + rc = syscall(732); + if (rc == -1) + fprintf(stdout, "CT09100 progress_fn running on Linux OK\n"); + else { + fprintf(stdout, "CT09100 progress_fn running on McKernel NG (%d)\n", rc); + } + + printf("progress,enter\n"); + + /* barrier */ + pthread_mutex_lock(&thr_arg->bar_lock); + thr_arg->bar_count++; + if (thr_arg->bar_count == 2) { + if ((rc = pthread_cond_broadcast(&thr_arg->bar_cond))) { + printf("pthread_cond_broadcast failed,rc=%d\n", rc); + } + } + while (thr_arg->bar_count != 2) { + if ((rc = pthread_cond_wait(&thr_arg->bar_cond, &thr_arg->bar_lock))) { + printf("pthread_cond_wait failed,rc=%d\n", rc); + 
} + } + pthread_mutex_unlock(&thr_arg->bar_lock); + +#if 0 + printf("progress,after barrier\n"); + for (i = 0; i < thr_arg->nproc; i++) { + if (!on_same_node(thr_arg->ppn, thr_arg->rank, i)) { + if (thr_arg->rank < i) { + psm2_sendrecv(thr_arg->rank, thr_arg->rank, i); + } else { + psm2_sendrecv(thr_arg->rank, i, thr_arg->rank); + } + } + } +#endif + + /* barrier */ + pthread_mutex_lock(&thr_arg->bar_lock); + thr_arg->bar_count--; + if (thr_arg->bar_count == 0) { + if ((rc = pthread_cond_broadcast(&thr_arg->bar_cond))) { + printf("pthread_cond_broadcast failed,rc=%d\n", rc); + } + } + while (thr_arg->bar_count != 0) { + if ((rc = pthread_cond_wait(&thr_arg->bar_cond, &thr_arg->bar_lock))) { + printf("pthread_cond_wait failed,rc=%d\n", rc); + } + } + pthread_mutex_unlock(&thr_arg->bar_lock); + + + printf("progress,exit\n"); + return NULL; +} + +int main(int argc, char **argv) { + int rc; + int actual; + int nproc; + int ppn = -1; + int ndoubles = -1; + int my_rank = -1, size = -1; + int i, j; + double *sbuf, *rbuf; + MPI_Request* reqs; + struct timespec start, end; + long t_pure_l, t_overall_l; + long t_pure, t_overall; + int opt; + pthread_condattr_t condattr; + pthread_mutexattr_t mutexattr; + + fwq_init(); + + while ((opt = getopt_long(argc, argv, "+d:P:", options, NULL)) != -1) { + switch (opt) { + case 'd': + ndoubles = (1ULL << atoi(optarg)); + break; + case 'P': + ppn = atoi(optarg); + break; + default: /* '?' 
*/ + printf("unknown option %c\n", optopt); + exit(1); + } + } + + if (ndoubles == -1 || ppn == -1) { + printf("specify ndoubles with -d and ppn with --ppn"); + exit(1); + } + + MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual); + if (actual != 3) { + printf("ERROR: Thread support level is %d (it should be 3)\n", actual); + exit(1); + } + + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &nproc); + + if (my_rank == 0) { + printf("tid=%d,pid=%d,ndoubles=%d,nproc=%d\n", syscall(__NR_gettid), getpid(), ndoubles, nproc); + printf("nsec=%ld, nspw=%f\n", nsec, nspw); + } + + /* Spawn a thread */ + thr_arg.rank = my_rank; + thr_arg.ppn = ppn; + thr_arg.nproc = nproc; + thr_arg.bar_count = 0; + + pthread_condattr_init(&condattr); + pthread_cond_init(&thr_arg.bar_cond, &condattr); + + pthread_mutexattr_init(&mutexattr); + pthread_mutex_init(&thr_arg.bar_lock, &mutexattr); + + char *uti_str = getenv("DISABLE_UTI"); + int uti_val = uti_str ? atoi(uti_str) : 0; + if (!uti_val) { + rc = syscall(731, 1, NULL); + if (rc) { + fprintf(stdout, "CT09003 INFO: uti not available (rc=%d)\n", rc); + } else { + fprintf(stdout, "CT09003 INFO: uti available\n"); + } + } else { + fprintf(stdout, "CT09003 INFO: uti disabled\n"); + } + + rc = pthread_create(&thr_arg.pthread, NULL, progress_fn, &thr_arg); + if (rc){ + fprintf(stdout, "pthread_create: %d\n", rc); + exit(1); + } + + /* barrier */ + pthread_mutex_lock(&thr_arg.bar_lock); + thr_arg.bar_count++; + if (thr_arg.bar_count == 2) { + if ((rc = pthread_cond_broadcast(&thr_arg.bar_cond))) { + printf("pthread_cond_broadcast failed,rc=%d\n", rc); + } + } + while (thr_arg.bar_count != 2) { + if ((rc = pthread_cond_wait(&thr_arg.bar_cond, &thr_arg.bar_lock))) { + printf("pthread_cond_wait failed,rc=%d\n", rc); + } + } + pthread_mutex_unlock(&thr_arg.bar_lock); + + printf("parent,after barrier\n"); + + + reqs = (MPI_Request*)malloc(sizeof(MPI_Request) * nproc * 2); + if(!reqs) { printf("malloc failed"); goto 
fn_fail; } + + sbuf = malloc(sizeof(double) * ndoubles * nproc); + if(!sbuf) { printf("malloc failed"); goto fn_fail; } + memset(sbuf, 0, sizeof(double) * ndoubles); + printf("tid=%d,pid=%d,sbuf=%p\n", syscall(__NR_gettid), getpid(), sbuf); + + rbuf = malloc(sizeof(double) * ndoubles * nproc); + if(!rbuf) { printf("malloc failed"); goto fn_fail; } + memset(rbuf, 0, sizeof(double) * ndoubles); + printf("tid=%d,pid=%d,rbuf=%p\n", syscall(__NR_gettid), getpid(), rbuf); + + print_cpu_last_executed_on(); + + /* Measure isend-wait time */ + MPI_Barrier(MPI_COMM_WORLD); +#define NSKIP 5 +#define NPURE 30 + for (i = 0; i < NPURE + NSKIP; i++) { + if (i == NSKIP) { + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + } + my_send(nproc, ppn, my_rank, sbuf, rbuf, ndoubles, reqs, 0); + } + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + t_pure_l = DIFFNSEC(end, start) / NPURE; + //printf("t_pure (local): %ld usec\n", t_pure_l / 1000UL); + MPI_Allreduce(&t_pure_l, &t_pure, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD); + if (my_rank == 0) printf("t_pure (max): %ld usec\n", t_pure / 1000UL); + + /* Measure isend-calc-wait time */ + MPI_Barrier(MPI_COMM_WORLD); +#define NOVERALL 30 + for (i = 0; i < NOVERALL + NSKIP; i++) { + if (i == NSKIP) { + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + } + my_send(nproc, ppn, my_rank, sbuf, rbuf, ndoubles, reqs, t_pure); + } + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + t_overall_l = DIFFNSEC(end, start) / NOVERALL; + //printf("t_overall (local): %ld usec\n", t_overall_l / 1000UL); + MPI_Allreduce(&t_overall_l, &t_overall, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD); + if (my_rank == 0) printf("t_overall (max): %ld usec\n", t_overall / 1000UL); + if (my_rank == 0) { + long t_abs = (t_pure * 2) - t_overall; + printf("overlap: %.2f %%\n", (t_abs * 100) / (double)t_pure); + } + + /* barrier */ + pthread_mutex_lock(&thr_arg.bar_lock); + thr_arg.bar_count--; + if (thr_arg.bar_count == 0) { + if ((rc = pthread_cond_broadcast(&thr_arg.bar_cond))) { + 
printf("pthread_cond_broadcast failed,rc=%d\n", rc); + } + } + while (thr_arg.bar_count != 0) { + if ((rc = pthread_cond_wait(&thr_arg.bar_cond, &thr_arg.bar_lock))) { + printf("pthread_cond_wait failed,rc=%d\n", rc); + } + } + pthread_mutex_unlock(&thr_arg.bar_lock); + + + pthread_join(thr_arg.pthread, NULL); + + fn_exit: + MPI_Finalize(); + return 0; + fn_fail: + goto fn_exit; +} diff --git a/test/uti/mpi/007.c b/test/uti/mpi/007.c new file mode 100755 index 00000000..af31c581 --- /dev/null +++ b/test/uti/mpi/007.c @@ -0,0 +1,563 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For SYS_xxx definitions */ +#include +#include + +#include /* required for core PSM2 functions */ +#include /* required for PSM2 MQ functions (send, recv, etc) */ + +//#define DEBUG +#ifdef DEBUG +#define dprintf printf +#else +#define dprintf {} +#endif + +#define BUFFER_LENGTH 8000000 +#define CONNECT_ARRAY_SIZE 8 +void die(char *msg, int rc) { + fprintf(stderr, "%s: %d\n", msg, rc); + exit(1); +} + +#define DIFFNSEC(end, start) ((end.tv_sec - start.tv_sec) * 1000000000UL + (end.tv_nsec - start.tv_nsec)) + +static inline void fixed_size_work() { + asm volatile( + "movq $0, %%rcx\n\t" + "1:\t" + "addq $1, %%rcx\n\t" + "cmpq $99, %%rcx\n\t" + "jle 1b\n\t" + : + : + : "rcx", "cc"); +} + +static inline void bulk_fsw(unsigned long n) { + int j; + for (j = 0; j < (n); j++) { + fixed_size_work(); + } +} + +double nspw; /* nsec per work */ +unsigned long nsec; + +void fwq_init() { + struct timespec start, end; + int i; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); +#define N_INIT 10000000 + bulk_fsw(N_INIT); + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + nsec = DIFFNSEC(end, start); + nspw = nsec / (double)N_INIT; +} + +#if 1 +void fwq(long delay_nsec) { + if (delay_nsec < 0) { + return; + //printf("%s: delay_nsec < 0\n", __FUNCTION__); + } + bulk_fsw(delay_nsec / nspw); +} +#else /* For machines with 
large core-to-core performance variation (e.g. OFP) */ +void fwq(long delay_nsec) { + struct timespec start, end; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + + while (1) { + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + if (DIFFNSEC(end, start) >= delay_nsec) { + break; + } + bulk_fsw(2); /* ~150 ns per iteration on FOP */ + } +} +#endif + + +static int print_cpu_last_executed_on() { + char fn[256]; + char* result; + pid_t tid = syscall(SYS_gettid); + int fd; + int offset; + int mpi_errno = 0; + + sprintf(fn, "/proc/%d/task/%d/stat", getpid(), (int)tid); + //printf("fn=%s\n", fn); + fd = open(fn, O_RDONLY); + if(fd == -1) { + printf("open() failed\n"); + goto fn_fail; + } + + result = malloc(65536); + if(result == NULL) { + printf("malloc() failed"); + goto fn_fail; + } + + int amount = 0; + offset = 0; + while(1) { + amount = read(fd, result + offset, 65536); + // printf("amount=%d\n", amount); + if(amount == -1) { + printf("read() failed"); + goto fn_fail; + } + if(amount == 0) { + goto eof; + } + offset += amount; + } + eof:; + //printf("result:%s\n", result); + + char* next_delim = result; + char* field; + int i; + for(i = 0; i < 39; i++) { + field = strsep(&next_delim, " "); + } + + int cpu = sched_getcpu(); + if(cpu == -1) { + printf("getpu() failed\n"); + goto fn_fail; + } + + printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,pid=%d,tid=%d\n", atoi(getenv("PMI_RANK")), atoi(field), cpu, getpid(), tid); fflush(stdout); + fn_exit: + free(result); + return mpi_errno; + fn_fail: + mpi_errno = -1; + goto fn_exit; +} + +static inline int on_same_node(int ppn, int me, int you) { + return (me / ppn == you / ppn); +} + +/* isend-calc-wait */ +void my_send(int nproc, int ppn, int rank, double *sbuf, double *rbuf, int ndoubles, MPI_Request* reqs, long calc_nsec) { + int i; + int r = 0, s = 0; + int req = 0; + for (i = 0; i < nproc; i++) { + if (!on_same_node(ppn, rank, i)) { + MPI_Irecv(rbuf + r * ndoubles, ndoubles, MPI_DOUBLE, i, 0, 
MPI_COMM_WORLD, &reqs[req]); + r++; + req++; + MPI_Isend(sbuf + s * ndoubles, ndoubles, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, &reqs[req]); + s++; + req++; + } + } + fwq(calc_nsec); + MPI_Waitall(req, reqs, MPI_STATUSES_IGNORE); +} + + +/* Helper functions to find the server's PSM2 endpoint identifier (epid). */ +psm2_epid_t find_server(int rank) { + FILE *fp = NULL; + psm2_epid_t server_epid = 0; + char fn[256]; + sprintf(fn, "psm2-demo-server-epid-%d", rank); + printf("PSM2 client waiting for epid mapping file to appear...\n"); + while (!fp) { + sleep(1); + fp = fopen(fn, "r"); + } + fscanf(fp, "%lx", &server_epid); + fclose(fp); + printf("PSM2 client found server epid = 0x%lx\n", server_epid); + return server_epid; +} + +void write_epid_to_file(int rank, psm2_epid_t myepid) { + FILE *fp; + char fn[256]; + sprintf(fn, "psm2-demo-server-epid-%d", rank); + fp = fopen(fn, "w"); + if (!fp) { + fprintf(stderr, + "Exiting, couldn't write server's epid mapping file: "); + die(strerror(errno), errno); + } + fprintf(fp, "0x%lx", myepid); + fclose(fp); + printf("PSM2 server wrote epid = 0x%lx to file.\n", myepid); + return; +} + +int psm2_sendrecv(int rank, int sender, int receiver) { + struct psm2_ep_open_opts o; + psm2_uuid_t uuid; + psm2_ep_t myep; + psm2_epid_t myepid; + psm2_epid_t server_epid; + psm2_epid_t epid_array[CONNECT_ARRAY_SIZE]; + int epid_array_mask[CONNECT_ARRAY_SIZE]; + psm2_error_t epid_connect_errors[CONNECT_ARRAY_SIZE]; + psm2_epaddr_t epaddr_array[CONNECT_ARRAY_SIZE]; + int rc; + int ver_major = PSM2_VERNO_MAJOR; + int ver_minor = PSM2_VERNO_MINOR; + char msgbuf[BUFFER_LENGTH]; + psm2_mq_t q; + psm2_mq_req_t req_mq; + memset(uuid, 0, sizeof(psm2_uuid_t)); /* Use a UUID of zero */ +/* Try to initialize PSM2 with the requested library version. + * * In this example, given the use of the PSM2_VERNO_MAJOR and MINOR + * * as defined in the PSM2 headers, ensure that we are linking with + * * the same version of PSM2 as we compiled against. 
*/ + + if ((rc = psm2_init(&ver_major, &ver_minor)) != PSM2_OK) { + die("couldn't init", rc); + } + printf("PSM2 init done.\n"); + /* Setup the endpoint options struct */ + if ((rc = psm2_ep_open_opts_get_defaults(&o)) != PSM2_OK) { + die("couldn't set default opts", rc); + } + printf("PSM2 opts_get_defaults done.\n"); + /* Attempt to open a PSM2 endpoint. This allocates hardware resources. */ + if ((rc = psm2_ep_open(uuid, &o, &myep, &myepid)) != PSM2_OK) { + die("couldn't psm2_ep_open()", rc); + } + printf("PSM2 endpoint open done.\n"); + int is_server = (rank == receiver) ? 1 : 0; + if (is_server) { + write_epid_to_file(rank, myepid); + } else { + server_epid = find_server(receiver); + } + if (is_server) { + /* Server does nothing here. A connection does not have to be + * * established to receive messages. */ + printf("PSM2 server up.\n"); + } else { + /* Setup connection request info */ + /* PSM2 can connect to a single epid per request, + * * or an arbitrary number of epids in a single connect call. + * * For this example, use part of an array of + * * connection requests. */ + memset(epid_array_mask, 0, sizeof(int) * CONNECT_ARRAY_SIZE); + epid_array[0] = server_epid; + epid_array_mask[0] = 1; + /* Begin the connection process. + * * note that if a requested epid is not responding, + * * the connect call will still return OK. + * * The errors array will contain the state of individual + * * connection requests. 
*/ + if ((rc = psm2_ep_connect(myep, + CONNECT_ARRAY_SIZE, + epid_array, + epid_array_mask, + epid_connect_errors, + epaddr_array, + 0 /* no timeout */ + )) != PSM2_OK) { + die("couldn't ep_connect", rc); + } + printf("PSM2 connect request processed.\n"); + /* Now check if our connection to the server is ready */ + if (epid_connect_errors[0] != PSM2_OK) { + die("couldn't connect to server", + epid_connect_errors[0]); + } + printf("PSM2 client-server connection established.\n"); + } + /* Setup our PSM2 message queue */ + if ((rc = psm2_mq_init(myep, PSM2_MQ_ORDERMASK_NONE, NULL, 0, &q)) + != PSM2_OK) { + die("couldn't initialize PSM2 MQ", rc); + } + printf("PSM2 MQ init done.\n"); + if (is_server) { + psm2_mq_tag_t t = {0xABCD}; + psm2_mq_tag_t tm = {-1}; + /* Post the receive request */ + if ((rc = psm2_mq_irecv2(q, PSM2_MQ_ANY_ADDR, + &t, /* message tag */ + &tm, /* message tag mask */ + 0, /* no flags */ + msgbuf, BUFFER_LENGTH, + NULL, /* no context to add */ + &req_mq /* track irecv status */ + )) != PSM2_OK) { + die("couldn't post psm2_mq_irecv()", rc); + } + printf("PSM2 MQ irecv() posted\n"); + /* Wait until the message arrives */ + if ((rc = psm2_mq_wait(&req_mq, NULL)) != PSM2_OK) { + die("couldn't wait for the irecv", rc); + } + printf("PSM2 MQ wait() done.\n"); + printf("Message from client:\n"); + printf("%s", msgbuf); + unlink("psm2-demo-server-epid"); + } else { + /* Say hello */ + snprintf(msgbuf, BUFFER_LENGTH, + "Hello world from epid=0x%lx, pid=%d.\n", + myepid, getpid()); + psm2_mq_tag_t t = {0xABCD}; + if ((rc = psm2_mq_send2(q, + epaddr_array[0], /* destination epaddr */ + PSM2_MQ_FLAG_SENDSYNC, /* no flags */ + &t, /* tag */ + msgbuf, BUFFER_LENGTH + )) != PSM2_OK) { + die("couldn't post psm2_mq_isend", rc); + } + printf("PSM2 MQ send() done.\n"); + } +/* Close down the MQ */ + if ((rc = psm2_mq_finalize(q)) != PSM2_OK) { + die("couldn't psm2_mq_finalize()", rc); + } + printf("PSM2 MQ finalized.\n"); +/* Close our ep, releasing all hardware 
resources. + * * Try to close all connections properly */ + if ((rc = psm2_ep_close(myep, PSM2_EP_CLOSE_GRACEFUL, + 0 /* no timeout */)) != PSM2_OK) { + die("couldn't psm2_ep_close()", rc); + } + printf("PSM2 ep closed.\n"); + /* Release all local PSM2 resources */ + if ((rc = psm2_finalize()) != PSM2_OK) { + die("couldn't psm2_finalize()", rc); + } + printf("PSM2 shut down, exiting.\n"); + return 0; +} + +static struct option options[] = { + { + .name = "ppn", + .has_arg = required_argument, + .flag = NULL, + .val = 'P', + }, + /* end */ + { NULL, 0, NULL, 0, }, +}; + +struct thr_arg { + volatile int bar_count; /* Barrier before entering loop */ + pthread_mutex_t bar_lock; + pthread_cond_t bar_cond; + pthread_t pthread; + int rank; + int ppn; + int nproc; +}; + +struct thr_arg thr_arg; + +void *progress_fn(void *arg) { + struct thr_arg *thr_arg = (struct thr_arg *)arg; + int rc; + int i; + + rc = syscall(732); + if (rc == -1) + fprintf(stdout, "CT09100 progress_fn running on Linux OK\n"); + else { + fprintf(stdout, "CT09100 progress_fn running on McKernel NG (%d)\n", rc); + } + + printf("progress,enter\n"); + + /* barrier */ + pthread_mutex_lock(&thr_arg->bar_lock); + thr_arg->bar_count++; + if (thr_arg->bar_count == 2) { + if ((rc = pthread_cond_broadcast(&thr_arg->bar_cond))) { + printf("pthread_cond_broadcast failed,rc=%d\n", rc); + } + } + while (thr_arg->bar_count != 2) { + if ((rc = pthread_cond_wait(&thr_arg->bar_cond, &thr_arg->bar_lock))) { + printf("pthread_cond_wait failed,rc=%d\n", rc); + } + } + pthread_mutex_unlock(&thr_arg->bar_lock); + + printf("progress,after barrier\n"); +#if 1 + for (i = 0; i < thr_arg->nproc; i++) { + if (!on_same_node(thr_arg->ppn, thr_arg->rank, i)) { + if (thr_arg->rank < i) { + psm2_sendrecv(thr_arg->rank, thr_arg->rank, i); + } else { + psm2_sendrecv(thr_arg->rank, i, thr_arg->rank); + } + } + } +#endif + + /* barrier */ + pthread_mutex_lock(&thr_arg->bar_lock); + thr_arg->bar_count--; + if (thr_arg->bar_count == 0) { + if 
((rc = pthread_cond_broadcast(&thr_arg->bar_cond))) { + printf("pthread_cond_broadcast failed,rc=%d\n", rc); + } + } + while (thr_arg->bar_count != 0) { + if ((rc = pthread_cond_wait(&thr_arg->bar_cond, &thr_arg->bar_lock))) { + printf("pthread_cond_wait failed,rc=%d\n", rc); + } + } + pthread_mutex_unlock(&thr_arg->bar_lock); + + + printf("progress,exit\n"); + return NULL; +} + +int main(int argc, char **argv) { + int rc; + int actual; + int nproc; + int ppn = -1; + int ndoubles = -1; + int my_rank = -1, size = -1; + int i, j; + double *sbuf, *rbuf; + MPI_Request* reqs; + struct timespec start, end; + long t_pure_l, t_overall_l; + long t_pure, t_overall; + int opt; + pthread_condattr_t condattr; + pthread_mutexattr_t mutexattr; + + fwq_init(); + + while ((opt = getopt_long(argc, argv, "+d:P:", options, NULL)) != -1) { + switch (opt) { + case 'd': + ndoubles = (1ULL << atoi(optarg)); + break; + case 'P': + ppn = atoi(optarg); + break; + default: /* '?' */ + printf("unknown option %c\n", optopt); + exit(1); + } + } + + if (ndoubles == -1 || ppn == -1) { + printf("specify ndoubles with -d and ppn with --ppn"); + exit(1); + } + + char *rank_str = getenv("PMI_RANK"); + if (!rank_str) { + printf("getenv failed\n"); + exit(1); + } + my_rank = atoi(rank_str); + nproc = 2; + + if (my_rank == 0) { + printf("tid=%d,pid=%d,ndoubles=%d,nproc=%d\n", syscall(__NR_gettid), getpid(), ndoubles, nproc); + printf("nsec=%ld, nspw=%f\n", nsec, nspw); + } + + /* Spawn a thread */ + thr_arg.rank = my_rank; + thr_arg.ppn = ppn; + thr_arg.nproc = nproc; + thr_arg.bar_count = 0; + + pthread_condattr_init(&condattr); + pthread_cond_init(&thr_arg.bar_cond, &condattr); + + pthread_mutexattr_init(&mutexattr); + pthread_mutex_init(&thr_arg.bar_lock, &mutexattr); + + char *uti_str = getenv("DISABLE_UTI"); + int uti_val = uti_str ? 
atoi(uti_str) : 0; + if (!uti_val) { + rc = syscall(731, 1, NULL); + if (rc) { + fprintf(stdout, "CT09003 INFO: uti not available (rc=%d)\n", rc); + } else { + fprintf(stdout, "CT09003 INFO: uti available\n"); + } + } else { + fprintf(stdout, "CT09003 INFO: uti disabled\n"); + } + + rc = pthread_create(&thr_arg.pthread, NULL, progress_fn, &thr_arg); + if (rc){ + fprintf(stdout, "pthread_create: %d\n", rc); + exit(1); + } + + /* barrier */ + pthread_mutex_lock(&thr_arg.bar_lock); + thr_arg.bar_count++; + if (thr_arg.bar_count == 2) { + if ((rc = pthread_cond_broadcast(&thr_arg.bar_cond))) { + printf("pthread_cond_broadcast failed,rc=%d\n", rc); + } + } + while (thr_arg.bar_count != 2) { + if ((rc = pthread_cond_wait(&thr_arg.bar_cond, &thr_arg.bar_lock))) { + printf("pthread_cond_wait failed,rc=%d\n", rc); + } + } + pthread_mutex_unlock(&thr_arg.bar_lock); + + printf("parent,after barrier\n"); + + + print_cpu_last_executed_on(); + + /* barrier */ + pthread_mutex_lock(&thr_arg.bar_lock); + thr_arg.bar_count--; + if (thr_arg.bar_count == 0) { + if ((rc = pthread_cond_broadcast(&thr_arg.bar_cond))) { + printf("pthread_cond_broadcast failed,rc=%d\n", rc); + } + } + while (thr_arg.bar_count != 0) { + if ((rc = pthread_cond_wait(&thr_arg.bar_cond, &thr_arg.bar_lock))) { + printf("pthread_cond_wait failed,rc=%d\n", rc); + } + } + pthread_mutex_unlock(&thr_arg.bar_lock); + + + pthread_join(thr_arg.pthread, NULL); + + fn_exit: + return 0; + fn_fail: + goto fn_exit; +} diff --git a/test/uti/mpi/008.c b/test/uti/mpi/008.c new file mode 100755 index 00000000..6db6e3ae --- /dev/null +++ b/test/uti/mpi/008.c @@ -0,0 +1,589 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For SYS_xxx definitions */ +#include +#include + +#include /* required for core PSM2 functions */ +#include /* required for PSM2 MQ functions (send, recv, etc) */ + +//#define DEBUG +#ifdef DEBUG +#define dprintf printf +#else +#define 
dprintf {} +#endif + +#define BUFFER_LENGTH /*8000000*/(1ULL<<12) +#define CONNECT_ARRAY_SIZE 8 +void die(char *msg, int rc) { + fprintf(stderr, "%s: %d\n", msg, rc); + fflush(stderr); +} + +#define DIFFNSEC(end, start) ((end.tv_sec - start.tv_sec) * 1000000000UL + (end.tv_nsec - start.tv_nsec)) + +static inline void fixed_size_work() { + asm volatile( + "movq $0, %%rcx\n\t" + "1:\t" + "addq $1, %%rcx\n\t" + "cmpq $99, %%rcx\n\t" + "jle 1b\n\t" + : + : + : "rcx", "cc"); +} + +static inline void bulk_fsw(unsigned long n) { + int j; + for (j = 0; j < (n); j++) { + fixed_size_work(); + } +} + +double nspw; /* nsec per work */ +unsigned long nsec; + +void fwq_init() { + struct timespec start, end; + int i; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); +#define N_INIT 10000000 + bulk_fsw(N_INIT); + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + nsec = DIFFNSEC(end, start); + nspw = nsec / (double)N_INIT; +} + +#if 1 +void fwq(long delay_nsec) { + if (delay_nsec < 0) { + return; + //printf("%s: delay_nsec < 0\n", __FUNCTION__); + } + bulk_fsw(delay_nsec / nspw); +} +#else /* For machines with large core-to-core performance variation (e.g. 
OFP) */ +void fwq(long delay_nsec) { + struct timespec start, end; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + + while (1) { + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + if (DIFFNSEC(end, start) >= delay_nsec) { + break; + } + bulk_fsw(2); /* ~150 ns per iteration on FOP */ + } +} +#endif + + +static int print_cpu_last_executed_on() { + char fn[256]; + char* result; + pid_t tid = syscall(SYS_gettid); + int fd; + int offset; + int mpi_errno = 0; + + sprintf(fn, "/proc/%d/task/%d/stat", getpid(), (int)tid); + //printf("fn=%s\n", fn); + fd = open(fn, O_RDONLY); + if(fd == -1) { + printf("open() failed\n"); + goto fn_fail; + } + + result = malloc(65536); + if(result == NULL) { + printf("malloc() failed"); + goto fn_fail; + } + + int amount = 0; + offset = 0; + while(1) { + amount = read(fd, result + offset, 65536); + // printf("amount=%d\n", amount); + if(amount == -1) { + printf("read() failed"); + goto fn_fail; + } + if(amount == 0) { + goto eof; + } + offset += amount; + } + eof:; + //printf("result:%s\n", result); + + char* next_delim = result; + char* field; + int i; + for(i = 0; i < 39; i++) { + field = strsep(&next_delim, " "); + } + + int cpu = sched_getcpu(); + if(cpu == -1) { + printf("getpu() failed\n"); + goto fn_fail; + } + + printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,pid=%d,tid=%d\n", atoi(getenv("PMI_RANK")), atoi(field), cpu, getpid(), tid); fflush(stdout); + fn_exit: + free(result); + return mpi_errno; + fn_fail: + mpi_errno = -1; + goto fn_exit; +} + +static inline int on_same_node(int ppn, int me, int you) { + return (me / ppn == you / ppn); +} + +/* Helper functions to find the server's PSM2 endpoint identifier (epid). 
*/ +psm2_epid_t find_server(int rank) { + FILE *fp = NULL; + psm2_epid_t server_epid = 0; + char fn[256]; + printf("%s: enter\n", __FUNCTION__); fflush(stdout); + + sprintf(fn, "psm2-demo-server-epid-%d", rank); + printf("PSM2 client waiting for epid mapping file to appear...\n"); fflush(stdout); + while (!fp) { + usleep(250*1000); + fp = fopen(fn, "r"); + } + fscanf(fp, "%lx", &server_epid); + fclose(fp); + printf("PSM2 client found server epid = 0x%lx\n", server_epid); + return server_epid; +} + +void write_epid_to_file(int rank, psm2_epid_t myepid) { + FILE *fp; + char fn[256]; + printf("%s: enter\n", __FUNCTION__); + sprintf(fn, "psm2-demo-server-epid-%d", rank); + fp = fopen(fn, "w"); + if (!fp) { + fprintf(stderr, + "Exiting, couldn't write server's epid mapping file: "); + die(strerror(errno), errno); + } + fprintf(fp, "0x%lx", myepid); + fclose(fp); + printf("PSM2 server wrote epid = 0x%lx to file.\n", myepid); + return; +} + +psm2_uuid_t uuid; +psm2_ep_t myep; +psm2_epid_t myepid; +psm2_epid_t server_epid; +psm2_epid_t epid_array[CONNECT_ARRAY_SIZE]; +int epid_array_mask[CONNECT_ARRAY_SIZE]; +psm2_error_t epid_connect_errors[CONNECT_ARRAY_SIZE]; +psm2_epaddr_t epaddr_array[CONNECT_ARRAY_SIZE]; + +int my_psm2_init(int my_rank, int server_rank) { + struct psm2_ep_open_opts o; + int rc; + int ver_major = PSM2_VERNO_MAJOR; + int ver_minor = PSM2_VERNO_MINOR; + memset(uuid, 0, sizeof(psm2_uuid_t)); /* Use a UUID of zero */ +/* Try to initialize PSM2 with the requested library version. + * * In this example, given the use of the PSM2_VERNO_MAJOR and MINOR + * * as defined in the PSM2 headers, ensure that we are linking with + * * the same version of PSM2 as we compiled against. 
*/ + + if ((rc = psm2_init(&ver_major, &ver_minor)) != PSM2_OK) { + die("couldn't init", rc); + } + printf("PSM2 init done.\n"); + /* Setup the endpoint options struct */ + if ((rc = psm2_ep_open_opts_get_defaults(&o)) != PSM2_OK) { + die("couldn't set default opts", rc); + } + printf("PSM2 opts_get_defaults done.\n"); + /* Attempt to open a PSM2 endpoint. This allocates hardware resources. */ + if ((rc = psm2_ep_open(uuid, &o, &myep, &myepid)) != PSM2_OK) { + die("couldn't psm2_ep_open()", rc); + } + printf("PSM2 endpoint open done.\n"); + + return 0; +} + +psm2_mq_t q; + +int my_psm2_connect(int my_rank, int server_rank) { + int rc; + int is_server = (my_rank == server_rank) ? 1 : 0; + printf("%s: enter\n", __FUNCTION__); fflush(stdout); + if (is_server) { + write_epid_to_file(my_rank, myepid); + } else { + server_epid = find_server(server_rank); + } + printf("%s: epid exchange done\n", __FUNCTION__); fflush(stdout); + if (is_server) { + /* Server does nothing here. A connection does not have to be + * * established to receive messages. */ + printf("PSM2 server up.\n"); + } else { + /* Setup connection request info */ + /* PSM2 can connect to a single epid per request, + * * or an arbitrary number of epids in a single connect call. + * * For this example, use part of an array of + * * connection requests. */ + memset(epid_array_mask, 0, sizeof(int) * CONNECT_ARRAY_SIZE); + epid_array[0] = server_epid; + epid_array_mask[0] = 1; + /* Begin the connection process. + * * note that if a requested epid is not responding, + * * the connect call will still return OK. + * * The errors array will contain the state of individual + * * connection requests. 
*/ + printf("calling ep_connect\n"); + int count = 0; + while ((rc = psm2_ep_connect(myep, + CONNECT_ARRAY_SIZE, + epid_array, + epid_array_mask, + epid_connect_errors, + epaddr_array, + 1 /* 0.5 sec timeout */ + )) != PSM2_OK) { + struct timespec ts = { .tv_sec = 0, .tv_nsec = 500*1000*1000 }; + nanosleep(&ts, NULL); + printf("."); fflush(stdout); + count++; + if (count > 30) { + break; + } + } + + if (rc != PSM2_OK) { + printf("psm2_ep_connect timed-out\n"); + return -1; + } + + printf("PSM2 connect request processed.\n"); + /* Now check if our connection to the server is ready */ + if (epid_connect_errors[0] != PSM2_OK) { + die("couldn't connect to server", epid_connect_errors[0]); + return -1; + } + printf("PSM2 client-server connection established.\n"); + } + + /* Setup our PSM2 message queue */ + if ((rc = psm2_mq_init(myep, PSM2_MQ_ORDERMASK_NONE, NULL, 0, &q)) + != PSM2_OK) { + die("couldn't initialize PSM2 MQ", rc); + } + printf("PSM2 MQ init done.\n"); + + return 0; +} +char msgbuf[BUFFER_LENGTH]; + +int my_psm2_sendrecv(int rank, int sender, int receiver) { + int is_server = (rank == receiver) ? 
1 : 0; + int rc; + psm2_mq_req_t req_mq; + //char msgbuf[BUFFER_LENGTH]; + + register long rsp asm ("rsp"); + printf("rsp=%lx.msgbuf=%p\n", rsp, msgbuf); fflush(stdout); + + memset(msgbuf, 0, BUFFER_LENGTH); + + if (is_server) { + psm2_mq_tag_t t = {0xABCD}; + psm2_mq_tag_t tm = {-1}; + /* Post the receive request */ + if ((rc = psm2_mq_irecv2(q, PSM2_MQ_ANY_ADDR, + &t, /* message tag */ + &tm, /* message tag mask */ + 0, /* no flags */ + msgbuf, BUFFER_LENGTH, + NULL, /* no context to add */ + &req_mq /* track irecv status */ + )) != PSM2_OK) { + die("couldn't post psm2_mq_irecv()", rc); + } + printf("PSM2 MQ irecv() posted\n"); + +#if 0 + /* Wait until the message arrives */ + if ((rc = psm2_mq_wait(&req_mq, NULL)) != PSM2_OK) { + die("couldn't wait for the irecv", rc); + } + printf("PSM2 MQ wait() done.\n"); + printf("Message from client:\n"); + printf("%s", msgbuf); + + if (is_server) { + char fn[256]; + sprintf(fn, "psm2-demo-server-epid-%d", rank); + unlink(fn); + } +#else + int count = 0; + while ((rc = psm2_mq_ipeek(q, &req_mq, NULL)) != PSM2_OK) { + struct timespec ts = { .tv_sec = 0, .tv_nsec = 500*1000*1000 }; + nanosleep(&ts, NULL); + printf("."); fflush(stdout); + count++; + if (count > 2) { + break; + } + } + if (rc == PSM2_OK) { + if ((rc = psm2_mq_test(&req_mq, NULL)) != PSM2_OK) { + printf("psm2_mq_test failed\n"); + } else { + printf("PSM2 MQ test() done.\n"); + printf("Message from client:\n"); + printf("%s", msgbuf); + } + char fn[256]; + sprintf(fn, "psm2-demo-server-epid-%d", rank); + unlink(fn); + } else { + printf("PSM2 MQ test() timed-out.\n"); + } +#endif + } else { + /* Say hello */ + snprintf(msgbuf, BUFFER_LENGTH, + "Hello world from epid=0x%lx, pid=%d.\n", + myepid, getpid()); + psm2_mq_tag_t t = {0xABCD}; +#if 0 + if ((rc = psm2_mq_send2(q, + epaddr_array[0], /* destination epaddr */ + PSM2_MQ_FLAG_SENDSYNC, /* no flags */ + &t, /* tag */ + msgbuf, BUFFER_LENGTH + )) != PSM2_OK) { + die("couldn't post psm2_mq_isend", rc); + } + 
printf("PSM2 MQ send() done.\n"); +#else + if ((rc = psm2_mq_isend2(q, + epaddr_array[0], /* destination epaddr */ + PSM2_MQ_FLAG_SENDSYNC, /* no flags */ + &t, /* tag */ + msgbuf, BUFFER_LENGTH, + NULL, /* no context to add */ + &req_mq /* track irecv status */ + )) != PSM2_OK) { + die("couldn't post psm2_mq_isend", rc); + } + printf("PSM2 MQ isend() posted\n"); + + int count = 0; + while ((rc = psm2_mq_ipeek2(q, &req_mq, NULL)) != PSM2_OK) { + struct timespec ts = { .tv_sec = 0, .tv_nsec = 500*1000*1000 }; + nanosleep(&ts, NULL); + printf("."); fflush(stdout); + count++; + if (count > 30) { + break; + } + } + if (rc == PSM2_OK) { + if ((rc = psm2_mq_test2(&req_mq, NULL)) != PSM2_OK) { + printf("PSM2 MQ test() failed.\n"); + } else { + printf("PSM2 MQ test() done.\n"); + } + } else { + printf("PSM2 MQ test() timeout.\n"); + } +#endif + } +/* Close down the MQ */ + if ((rc = psm2_mq_finalize(q)) != PSM2_OK) { + die("couldn't psm2_mq_finalize()", rc); + } + printf("PSM2 MQ finalized.\n"); +/* Close our ep, releasing all hardware resources. 
+ * * Try to close all connections properly */ + if ((rc = psm2_ep_close(myep, PSM2_EP_CLOSE_GRACEFUL, + 0 /* no timeout */)) != PSM2_OK) { + die("couldn't psm2_ep_close()", rc); + } + printf("PSM2 ep closed.\n"); + /* Release all local PSM2 resources */ + if ((rc = psm2_finalize()) != PSM2_OK) { + die("couldn't psm2_finalize()", rc); + } + printf("PSM2 shut down, exiting.\n"); + return 0; +} + +static struct option options[] = { + { + .name = "ppn", + .has_arg = required_argument, + .flag = NULL, + .val = 'P', + }, + /* end */ + { NULL, 0, NULL, 0, }, +}; + +struct thr_arg { + pthread_barrier_t bar; + pthread_t pthread; + int rank; + int ppn; + int nproc; +}; + +struct thr_arg thr_arg; + +void *progress_fn(void *arg) { + struct thr_arg *thr_arg = (struct thr_arg *)arg; + int rc; + int i; + + rc = syscall(732); + if (rc == -1) + fprintf(stdout, "CT09100 progress_fn running on Linux OK\n"); + else { + fprintf(stdout, "CT09100 progress_fn running on McKernel NG (%d)\n", rc); + } + + printf("progress,enter\n"); + + pthread_barrier_wait(&thr_arg->bar); + +#if 1 + for (i = 0; i < thr_arg->nproc; i++) { + if (!on_same_node(thr_arg->ppn, thr_arg->rank, i)) { + if (thr_arg->rank < i) { + my_psm2_sendrecv(thr_arg->rank, thr_arg->rank, i); + } else { + my_psm2_sendrecv(thr_arg->rank, i, thr_arg->rank); + } + } + } +#endif + + pthread_barrier_wait(&thr_arg->bar); + + +#if 0 + printf("progress,entering infinite loop\n"); + while(1) { } +#endif + printf("progress,returning\n"); + return NULL; +} + +int main(int argc, char **argv) { + int rc; + int actual; + int nproc; + int ppn = -1; + int my_rank = -1, size = -1; + int i, j; + struct timespec start, end; + long t_pure_l, t_overall_l; + long t_pure, t_overall; + int opt; + pthread_barrierattr_t barrierattr; + + fwq_init(); + + while ((opt = getopt_long(argc, argv, "+P:", options, NULL)) != -1) { + switch (opt) { + case 'P': + ppn = atoi(optarg); + break; + default: /* '?' 
*/ + printf("unknown option %c\n", optopt); + exit(1); + } + } + + if (ppn == -1) { + printf("specify ppn with --ppn"); + exit(1); + } + + char *rank_str = getenv("PMI_RANK"); + if (!rank_str) { + printf("getenv failed\n"); + exit(1); + } + my_rank = atoi(rank_str); + printf("my_rank=%d\n", my_rank); fflush(stdout); + + nproc = 2; + + if (my_rank == 0) { + printf("tid=%d,pid=%d,nproc=%d\n", syscall(__NR_gettid), getpid(), nproc); + printf("nsec=%ld, nspw=%f\n", nsec, nspw); + } + + int server_rank = ppn + (my_rank % ppn); + my_psm2_init(my_rank, server_rank); + my_psm2_connect(my_rank, server_rank); + + /* Spawn a thread */ + thr_arg.rank = my_rank; + thr_arg.ppn = ppn; + thr_arg.nproc = nproc; + + pthread_barrierattr_init(&barrierattr); + pthread_barrier_init(&thr_arg.bar, &barrierattr, nproc); + + char *uti_str = getenv("DISABLE_UTI"); + int uti_val = uti_str ? atoi(uti_str) : 0; + if (!uti_val) { + rc = syscall(731, 1, NULL); + if (rc) { + fprintf(stdout, "CT09003 INFO: uti not available (rc=%d)\n", rc); + } else { + fprintf(stdout, "CT09003 INFO: uti available\n"); + } + } else { + fprintf(stdout, "CT09003 INFO: uti disabled\n"); + } + + rc = pthread_create(&thr_arg.pthread, NULL, progress_fn, &thr_arg); + if (rc){ + fprintf(stdout, "pthread_create: %d\n", rc); + exit(1); + } + + pthread_barrier_wait(&thr_arg.bar); + + pthread_barrier_wait(&thr_arg.bar); + + pthread_join(thr_arg.pthread, NULL); + + fn_exit: + return 0; + fn_fail: + goto fn_exit; +} diff --git a/test/uti/mpi/008.sh b/test/uti/mpi/008.sh new file mode 100755 index 00000000..24a310cf --- /dev/null +++ b/test/uti/mpi/008.sh @@ -0,0 +1,89 @@ +#!/usr/bin/bash + +#!/usr/bin/bash -x + +MYHOME=$HOME +UTI_MPI_TOP=${MYHOME}/project/os/mckernel/test/uti/mpi + +MCK=${MYHOME}/project/os/install +unset DISABLE_UTI + +cmdline="./008" + +stop=0 +reboot=0 +go=0 + +mck=0 +nloops=1 + +while getopts srgac:n:mdl: OPT +do + case ${OPT} in + s) stop=1 + ;; + r) reboot=1 + ;; + g) go=1 + ;; + a) async=1 + ;; + c) 
cmdline=$OPTARG + ;; + n) ndoubles=$OPTARG + ;; + m) + mck=1 + ;; + d) export DISABLE_UTI=1 + ;; + l) nloops=$OPTARG + ;; + *) echo "invalid option -${OPT}" >&2 + exit 1 + esac +done + +if [ ${mck} -eq 1 ]; then + MCEXEC="${MCK}/bin/mcexec" +else + MCEXEC= +fi + +if [ ${stop} -eq 1 ]; then + if [ ${mck} -eq 1 ]; then + sudo ${MCK}/sbin/mcstop+release.sh + else + : + fi +fi + +if [ ${reboot} -eq 1 ]; then + if [ ${mck} -eq 1 ]; then + if hostname | grep ofp &>/dev/null; then + sudo ${MCK}/sbin/mcreboot.sh -s -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1 + else + sudo ${MCK}/sbin/mcreboot.sh -s -c 1-15,65-79,129-143,193-207,17-31,81-95,145-159,209-223,33-47,97-111,161-175,225-239,49-63,113-127,177-191,241-255 -r 1-15:0+65-79:64+129-143:128+193-207:192+17-31:16+81-95:80+145-159:144+209-223:208+33-47:32+97-111:96+161-175:160+225-239:224+49-63:48+113-127:112+177-191:176+241-255:240 -m 12G@0,12G@1,12G@2,12G@3,3920M@4,3920M@5,3920M@6,3920M@7 + fi + else + : + fi +fi + +if [ ${go} -eq 1 ]; then + cd ${UTI_MPI_TOP} + make CC=gcc 008 + for i in `seq 1 ${nloops}`; do + rm -f psm2-demo-server-epid-* + #PSM2_RCVTHREAD=0 PMI_RANK=0 DISABLE_UTI=1 ${MCK}/bin/mcexec --enable-uti taskset -c 2 ./008 --ppn 1 & + PSM2_RCVTHREAD=0 PMI_RANK=1 DISABLE_UTI=0 ${MCK}/bin/mcexec --enable-uti taskset -c 3 ./008 --ppn 1 + #wait + echo =====; + echo $i; + echo =====; i=$((i+1)); + #sleep 2 + done +fi 
+ + + diff --git a/test/uti/mpi/009.c b/test/uti/mpi/009.c new file mode 100755 index 00000000..3a1209a5 --- /dev/null +++ b/test/uti/mpi/009.c @@ -0,0 +1,537 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For SYS_xxx definitions */ +#include +#include + +#include /* required for core PSM2 functions */ +#include /* required for PSM2 MQ functions (send, recv, etc) */ + +//#define DEBUG +#ifdef DEBUG +#define dprintf printf +#else +#define dprintf {} +#endif + +#define BUFFER_LENGTH /*8000000*/(1ULL<<12) +#define CONNECT_ARRAY_SIZE 8 +void die(char *msg, int rc) { + fprintf(stderr, "%s: %d\n", msg, rc); + fflush(stderr); +} + +#define DIFFNSEC(end, start) ((end.tv_sec - start.tv_sec) * 1000000000UL + (end.tv_nsec - start.tv_nsec)) + +static inline void fixed_size_work() { + asm volatile( + "movq $0, %%rcx\n\t" + "1:\t" + "addq $1, %%rcx\n\t" + "cmpq $99, %%rcx\n\t" + "jle 1b\n\t" + : + : + : "rcx", "cc"); +} + +static inline void bulk_fsw(unsigned long n) { + int j; + for (j = 0; j < (n); j++) { + fixed_size_work(); + } +} + +double nspw; /* nsec per work */ +unsigned long nsec; + +void fwq_init() { + struct timespec start, end; + int i; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); +#define N_INIT 10000000 + bulk_fsw(N_INIT); + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + nsec = DIFFNSEC(end, start); + nspw = nsec / (double)N_INIT; +} + +#if 1 +void fwq(long delay_nsec) { + if (delay_nsec < 0) { + return; + //printf("%s: delay_nsec < 0\n", __FUNCTION__); + } + bulk_fsw(delay_nsec / nspw); +} +#else /* For machines with large core-to-core performance variation (e.g. 
OFP) */ +void fwq(long delay_nsec) { + struct timespec start, end; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + + while (1) { + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + if (DIFFNSEC(end, start) >= delay_nsec) { + break; + } + bulk_fsw(2); /* ~150 ns per iteration on FOP */ + } +} +#endif + + +static int print_cpu_last_executed_on() { + char fn[256]; + char* result; + pid_t tid = syscall(SYS_gettid); + int fd; + int offset; + int mpi_errno = 0; + + sprintf(fn, "/proc/%d/task/%d/stat", getpid(), (int)tid); + //printf("fn=%s\n", fn); + fd = open(fn, O_RDONLY); + if(fd == -1) { + printf("open() failed\n"); + goto fn_fail; + } + + result = malloc(65536); + if(result == NULL) { + printf("malloc() failed"); + goto fn_fail; + } + + int amount = 0; + offset = 0; + while(1) { + amount = read(fd, result + offset, 65536); + // printf("amount=%d\n", amount); + if(amount == -1) { + printf("read() failed"); + goto fn_fail; + } + if(amount == 0) { + goto eof; + } + offset += amount; + } + eof:; + //printf("result:%s\n", result); + + char* next_delim = result; + char* field; + int i; + for(i = 0; i < 39; i++) { + field = strsep(&next_delim, " "); + } + + int cpu = sched_getcpu(); + if(cpu == -1) { + printf("getpu() failed\n"); + goto fn_fail; + } + + printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,pid=%d,tid=%d\n", atoi(getenv("PMI_RANK")), atoi(field), cpu, getpid(), tid); fflush(stdout); + fn_exit: + free(result); + return mpi_errno; + fn_fail: + mpi_errno = -1; + goto fn_exit; +} + +static inline int on_same_node(int ppn, int me, int you) { + return (me / ppn == you / ppn); +} + +/* isend-calc-wait */ +void my_send(int nproc, int ppn, int rank, double *sbuf, double *rbuf, int ndoubles, MPI_Request* reqs, long calc_nsec) { + int i; + int r = 0, s = 0; + int req = 0; + for (i = 0; i < nproc; i++) { + if (!on_same_node(ppn, rank, i)) { + MPI_Irecv(rbuf + r * ndoubles, ndoubles, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, &reqs[req]); + r++; + req++; + 
MPI_Isend(sbuf + s * ndoubles, ndoubles, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, &reqs[req]); + s++; + req++; + } + } + fwq(calc_nsec); + MPI_Waitall(req, reqs, MPI_STATUSES_IGNORE); +} + + +/* Helper functions to find the server's PSM2 endpoint identifier (epid). */ +psm2_epid_t find_server(int rank) { + FILE *fp = NULL; + psm2_epid_t server_epid = 0; + char fn[256]; + printf("%s: enter\n", __FUNCTION__); fflush(stdout); + + sprintf(fn, "psm2-demo-server-epid-%d", rank); + printf("PSM2 client waiting for epid mapping file to appear...\n"); fflush(stdout); + while (!fp) { + sleep(1); + fp = fopen(fn, "r"); + } + fscanf(fp, "%lx", &server_epid); + fclose(fp); + printf("PSM2 client found server epid = 0x%lx\n", server_epid); + return server_epid; +} + +void write_epid_to_file(int rank, psm2_epid_t myepid) { + FILE *fp; + char fn[256]; + printf("%s: enter\n", __FUNCTION__); + sprintf(fn, "psm2-demo-server-epid-%d", rank); + fp = fopen(fn, "w"); + if (!fp) { + fprintf(stderr, + "Exiting, couldn't write server's epid mapping file: "); + die(strerror(errno), errno); + } + fprintf(fp, "0x%lx", myepid); + fclose(fp); + printf("PSM2 server wrote epid = 0x%lx to file.\n", myepid); + return; +} + +psm2_uuid_t uuid; +psm2_ep_t myep; +psm2_epid_t myepid; +psm2_epid_t server_epid; +psm2_epid_t epid_array[CONNECT_ARRAY_SIZE]; +int epid_array_mask[CONNECT_ARRAY_SIZE]; +psm2_error_t epid_connect_errors[CONNECT_ARRAY_SIZE]; +psm2_epaddr_t epaddr_array[CONNECT_ARRAY_SIZE]; + +int my_psm2_init(int my_rank, int server_rank) { + struct psm2_ep_open_opts o; + int rc; + int ver_major = PSM2_VERNO_MAJOR; + int ver_minor = PSM2_VERNO_MINOR; + + printf("%s: my_rank=%d,server_rank=%d\n", __FUNCTION__, my_rank, server_rank); fflush(stdout); + memset(uuid, 0, sizeof(psm2_uuid_t)); /* Use a UUID of zero */ +/* Try to initialize PSM2 with the requested library version. 
+ * * In this example, given the use of the PSM2_VERNO_MAJOR and MINOR + * * as defined in the PSM2 headers, ensure that we are linking with + * * the same version of PSM2 as we compiled against. */ + + if ((rc = psm2_init(&ver_major, &ver_minor)) != PSM2_OK) { + die("couldn't init", rc); + } + printf("PSM2 init done.\n"); + /* Setup the endpoint options struct */ + if ((rc = psm2_ep_open_opts_get_defaults(&o)) != PSM2_OK) { + die("couldn't set default opts", rc); + } + printf("PSM2 opts_get_defaults done.\n"); + /* Attempt to open a PSM2 endpoint. This allocates hardware resources. */ + if ((rc = psm2_ep_open(uuid, &o, &myep, &myepid)) != PSM2_OK) { + die("couldn't psm2_ep_open()", rc); + } + printf("PSM2 endpoint open done.\n"); + + return 0; +} +int my_psm2_connect(int my_rank, int server_rank) { + int rc; + int is_server = (my_rank == server_rank) ? 1 : 0; + printf("%s: my_rank=%d,server_rank=%d\n", __FUNCTION__, my_rank, server_rank); fflush(stdout); + if (is_server) { + write_epid_to_file(my_rank, myepid); + } else { + server_epid = find_server(server_rank); + } + printf("%s: epid exchange done\n", __FUNCTION__); fflush(stdout); + if (is_server) { + /* Server does nothing here. A connection does not have to be + * * established to receive messages. */ + printf("PSM2 server up.\n"); + } else { + /* Setup connection request info */ + /* PSM2 can connect to a single epid per request, + * * or an arbitrary number of epids in a single connect call. + * * For this example, use part of an array of + * * connection requests. */ + memset(epid_array_mask, 0, sizeof(int) * CONNECT_ARRAY_SIZE); + epid_array[0] = server_epid; + epid_array_mask[0] = 1; + /* Begin the connection process. + * * note that if a requested epid is not responding, + * * the connect call will still return OK. + * * The errors array will contain the state of individual + * * connection requests. 
*/ + if ((rc = psm2_ep_connect(myep, + CONNECT_ARRAY_SIZE, + epid_array, + epid_array_mask, + epid_connect_errors, + epaddr_array, + 0 /* no timeout */ + )) != PSM2_OK) { + die("couldn't ep_connect", rc); + return -1; + } + printf("PSM2 connect request processed.\n"); + /* Now check if our connection to the server is ready */ + if (epid_connect_errors[0] != PSM2_OK) { + die("couldn't connect to server", epid_connect_errors[0]); + return -1; + } + printf("PSM2 client-server connection established.\n"); + } + return 0; +} +char msgbuf[BUFFER_LENGTH]; + +int my_psm2_sendrecv(int rank, int sender, int receiver) { + int is_server = (rank == receiver) ? 1 : 0; + int rc; + psm2_mq_t q; + psm2_mq_req_t req_mq; + //char msgbuf[BUFFER_LENGTH]; + + register long rsp asm ("rsp"); + printf("rsp=%lx.msgbuf=%p\n", rsp, msgbuf); fflush(stdout); + + memset(msgbuf, 0, BUFFER_LENGTH); + + /* Setup our PSM2 message queue */ + if ((rc = psm2_mq_init(myep, PSM2_MQ_ORDERMASK_NONE, NULL, 0, &q)) + != PSM2_OK) { + die("couldn't initialize PSM2 MQ", rc); + } + printf("PSM2 MQ init done.\n"); + if (is_server) { + psm2_mq_tag_t t = {0xABCD}; + psm2_mq_tag_t tm = {-1}; + /* Post the receive request */ + if ((rc = psm2_mq_irecv2(q, PSM2_MQ_ANY_ADDR, + &t, /* message tag */ + &tm, /* message tag mask */ + 0, /* no flags */ + msgbuf, BUFFER_LENGTH, + NULL, /* no context to add */ + &req_mq /* track irecv status */ + )) != PSM2_OK) { + die("couldn't post psm2_mq_irecv()", rc); + } + printf("PSM2 MQ irecv() posted\n"); + /* Wait until the message arrives */ + if ((rc = psm2_mq_wait(&req_mq, NULL)) != PSM2_OK) { + die("couldn't wait for the irecv", rc); + } + printf("PSM2 MQ wait() done.\n"); + printf("Message from client:\n"); + printf("%s", msgbuf); + + if (is_server) { + char fn[256]; + sprintf(fn, "psm2-demo-server-epid-%d", rank); + unlink(fn); + } + } else { + /* Say hello */ + snprintf(msgbuf, BUFFER_LENGTH, + "Hello world from epid=0x%lx, pid=%d.\n", + myepid, getpid()); + psm2_mq_tag_t t = 
{0xABCD}; + if ((rc = psm2_mq_send2(q, + epaddr_array[0], /* destination epaddr */ + PSM2_MQ_FLAG_SENDSYNC, /* no flags */ + &t, /* tag */ + msgbuf, BUFFER_LENGTH + )) != PSM2_OK) { + die("couldn't post psm2_mq_isend", rc); + } + printf("PSM2 MQ send() done.\n"); + } +/* Close down the MQ */ + if ((rc = psm2_mq_finalize(q)) != PSM2_OK) { + die("couldn't psm2_mq_finalize()", rc); + } + printf("PSM2 MQ finalized.\n"); +/* Close our ep, releasing all hardware resources. + * * Try to close all connections properly */ + if ((rc = psm2_ep_close(myep, PSM2_EP_CLOSE_GRACEFUL, + 0 /* no timeout */)) != PSM2_OK) { + die("couldn't psm2_ep_close()", rc); + } + printf("PSM2 ep closed.\n"); + /* Release all local PSM2 resources */ + if ((rc = psm2_finalize()) != PSM2_OK) { + die("couldn't psm2_finalize()", rc); + } + printf("PSM2 shut down, exiting.\n"); + return 0; +} + +static struct option options[] = { + { + .name = "ppn", + .has_arg = required_argument, + .flag = NULL, + .val = 'P', + }, + /* end */ + { NULL, 0, NULL, 0, }, +}; + +struct thr_arg { + pthread_barrier_t bar; + pthread_t pthread; + int rank; + int ppn; + int nproc; + int server_rank; +}; + +struct thr_arg thr_arg; + +void *progress_fn(void *arg) { + struct thr_arg *thr_arg = (struct thr_arg *)arg; + int rc; + int i; + + rc = syscall(732); + if (rc == -1) + fprintf(stdout, "CT09100 progress_fn running on Linux OK\n"); + else { + fprintf(stdout, "CT09100 progress_fn running on McKernel NG (%d)\n", rc); + } + + printf("progress,enter\n"); + + pthread_barrier_wait(&thr_arg->bar); + +#if 1 + my_psm2_init(thr_arg->rank, thr_arg->server_rank); + my_psm2_connect(thr_arg->rank, thr_arg->server_rank); + + for (i = 0; i < thr_arg->nproc; i++) { + if (!on_same_node(thr_arg->ppn, thr_arg->rank, i)) { + if (thr_arg->rank < i) { + my_psm2_sendrecv(thr_arg->rank, thr_arg->rank, i); + } else { + my_psm2_sendrecv(thr_arg->rank, i, thr_arg->rank); + } + } + } +#endif + + pthread_barrier_wait(&thr_arg->bar); + + + 
printf("progress,exit\n"); + return NULL; +} + +int main(int argc, char **argv) { + int rc; + int actual; + int nproc; + int ppn = -1; + int ndoubles = -1; + int my_rank = -1, size = -1; + int i, j; + double *sbuf, *rbuf; + MPI_Request* reqs; + struct timespec start, end; + long t_pure_l, t_overall_l; + long t_pure, t_overall; + int opt; + pthread_barrierattr_t barrierattr; + + fwq_init(); + + while ((opt = getopt_long(argc, argv, "+d:P:", options, NULL)) != -1) { + switch (opt) { + case 'd': + ndoubles = (1ULL << atoi(optarg)); + break; + case 'P': + ppn = atoi(optarg); + break; + default: /* '?' */ + printf("unknown option %c\n", optopt); + exit(1); + } + } + + if (ndoubles == -1 || ppn == -1) { + printf("specify ndoubles with -d and ppn with --ppn"); + exit(1); + } + + char *rank_str = getenv("PMI_RANK"); + if (!rank_str) { + printf("getenv failed\n"); + exit(1); + } + my_rank = atoi(rank_str); + printf("my_rank=%d\n", my_rank); fflush(stdout); + + nproc = 2; + + if (my_rank == 0) { + printf("tid=%d,pid=%d,ndoubles=%d,nproc=%d\n", syscall(__NR_gettid), getpid(), ndoubles, nproc); + printf("nsec=%ld, nspw=%f\n", nsec, nspw); + } + + + /* Spawn a thread */ + thr_arg.rank = my_rank; + thr_arg.ppn = ppn; + thr_arg.nproc = nproc; + thr_arg.server_rank = ppn + (my_rank % ppn); + + pthread_barrierattr_init(&barrierattr); + pthread_barrier_init(&thr_arg.bar, &barrierattr, nproc); + + char *uti_str = getenv("DISABLE_UTI"); + int uti_val = uti_str ? 
atoi(uti_str) : 0; + if (!uti_val) { + rc = syscall(731, 1, NULL); + if (rc) { + fprintf(stdout, "CT09003 INFO: uti not available (rc=%d)\n", rc); + } else { + fprintf(stdout, "CT09003 INFO: uti available\n"); + } + } else { + fprintf(stdout, "CT09003 INFO: uti disabled\n"); + } + + rc = pthread_create(&thr_arg.pthread, NULL, progress_fn, &thr_arg); + if (rc){ + fprintf(stdout, "pthread_create: %d\n", rc); + exit(1); + } + + pthread_barrier_wait(&thr_arg.bar); + + pthread_barrier_wait(&thr_arg.bar); + + pthread_join(thr_arg.pthread, NULL); + + fn_exit: + return 0; + fn_fail: + goto fn_exit; +} diff --git a/test/uti/mpi/010.c b/test/uti/mpi/010.c new file mode 100755 index 00000000..65ed6d55 --- /dev/null +++ b/test/uti/mpi/010.c @@ -0,0 +1,508 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For SYS_xxx definitions */ +#include +#include + +#include /* required for core PSM2 functions */ +#include /* required for PSM2 MQ functions (send, recv, etc) */ + +//#define DEBUG +#ifdef DEBUG +#define dprintf printf +#else +#define dprintf {} +#endif + +#define BUFFER_LENGTH /*8000000*/(1ULL<<12) +#define CONNECT_ARRAY_SIZE 8 +void die(char *msg, int rc) { + fprintf(stderr, "%s: %d\n", msg, rc); + fflush(stderr); +} + +#define DIFFNSEC(end, start) ((end.tv_sec - start.tv_sec) * 1000000000UL + (end.tv_nsec - start.tv_nsec)) + +static inline void fixed_size_work() { + asm volatile( + "movq $0, %%rcx\n\t" + "1:\t" + "addq $1, %%rcx\n\t" + "cmpq $99, %%rcx\n\t" + "jle 1b\n\t" + : + : + : "rcx", "cc"); +} + +static inline void bulk_fsw(unsigned long n) { + int j; + for (j = 0; j < (n); j++) { + fixed_size_work(); + } +} + +double nspw; /* nsec per work */ +unsigned long nsec; + +void fwq_init() { + struct timespec start, end; + int i; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); +#define N_INIT 10000000 + bulk_fsw(N_INIT); + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + nsec = DIFFNSEC(end, 
start); + nspw = nsec / (double)N_INIT; +} + +#if 1 +void fwq(long delay_nsec) { + if (delay_nsec < 0) { + return; + //printf("%s: delay_nsec < 0\n", __FUNCTION__); + } + bulk_fsw(delay_nsec / nspw); +} +#else /* For machines with large core-to-core performance variation (e.g. OFP) */ +void fwq(long delay_nsec) { + struct timespec start, end; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + + while (1) { + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + if (DIFFNSEC(end, start) >= delay_nsec) { + break; + } + bulk_fsw(2); /* ~150 ns per iteration on FOP */ + } +} +#endif + + +static int print_cpu_last_executed_on() { + char fn[256]; + char* result; + pid_t tid = syscall(SYS_gettid); + int fd; + int offset; + int mpi_errno = 0; + + sprintf(fn, "/proc/%d/task/%d/stat", getpid(), (int)tid); + //printf("fn=%s\n", fn); + fd = open(fn, O_RDONLY); + if(fd == -1) { + printf("open() failed\n"); + goto fn_fail; + } + + result = malloc(65536); + if(result == NULL) { + printf("malloc() failed"); + goto fn_fail; + } + + int amount = 0; + offset = 0; + while(1) { + amount = read(fd, result + offset, 65536); + // printf("amount=%d\n", amount); + if(amount == -1) { + printf("read() failed"); + goto fn_fail; + } + if(amount == 0) { + goto eof; + } + offset += amount; + } + eof:; + //printf("result:%s\n", result); + + char* next_delim = result; + char* field; + int i; + for(i = 0; i < 39; i++) { + field = strsep(&next_delim, " "); + } + + int cpu = sched_getcpu(); + if(cpu == -1) { + printf("getpu() failed\n"); + goto fn_fail; + } + + printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,pid=%d,tid=%d\n", atoi(getenv("PMI_RANK")), atoi(field), cpu, getpid(), tid); fflush(stdout); + fn_exit: + free(result); + return mpi_errno; + fn_fail: + mpi_errno = -1; + goto fn_exit; +} + +static inline int on_same_node(int ppn, int me, int you) { + return (me / ppn == you / ppn); +} + +/* Helper functions to find the server's PSM2 endpoint identifier (epid). 
*/ +psm2_epid_t find_server(int rank) { + FILE *fp = NULL; + psm2_epid_t server_epid = 0; + char fn[256]; + printf("%s: enter\n", __FUNCTION__); fflush(stdout); + + sprintf(fn, "psm2-demo-server-epid-%d", rank); + printf("PSM2 client waiting for epid mapping file to appear...\n"); fflush(stdout); + while (!fp) { + sleep(1); + fp = fopen(fn, "r"); + } + fscanf(fp, "%lx", &server_epid); + fclose(fp); + printf("PSM2 client found server epid = 0x%lx\n", server_epid); + return server_epid; +} + +void write_epid_to_file(int rank, psm2_epid_t myepid) { + FILE *fp; + char fn[256]; + printf("%s: enter\n", __FUNCTION__); + sprintf(fn, "psm2-demo-server-epid-%d", rank); + fp = fopen(fn, "w"); + if (!fp) { + fprintf(stderr, + "Exiting, couldn't write server's epid mapping file: "); + die(strerror(errno), errno); + } + fprintf(fp, "0x%lx", myepid); + fclose(fp); + printf("PSM2 server wrote epid = 0x%lx to file.\n", myepid); + return; +} + +psm2_uuid_t uuid; +psm2_ep_t myep; +psm2_epid_t myepid; +psm2_epid_t server_epid; +psm2_epid_t epid_array[CONNECT_ARRAY_SIZE]; +int epid_array_mask[CONNECT_ARRAY_SIZE]; +psm2_error_t epid_connect_errors[CONNECT_ARRAY_SIZE]; +psm2_epaddr_t epaddr_array[CONNECT_ARRAY_SIZE]; + +int my_psm2_init(int my_rank, int server_rank) { + struct psm2_ep_open_opts o; + int rc; + int ver_major = PSM2_VERNO_MAJOR; + int ver_minor = PSM2_VERNO_MINOR; + + printf("%s: my_rank=%d,server_rank=%d\n", __FUNCTION__, my_rank, server_rank); fflush(stdout); + memset(uuid, 0, sizeof(psm2_uuid_t)); /* Use a UUID of zero */ +/* Try to initialize PSM2 with the requested library version. + * * In this example, given the use of the PSM2_VERNO_MAJOR and MINOR + * * as defined in the PSM2 headers, ensure that we are linking with + * * the same version of PSM2 as we compiled against. 
*/ + + if ((rc = psm2_init(&ver_major, &ver_minor)) != PSM2_OK) { + die("couldn't init", rc); + } + printf("PSM2 init done.\n"); + /* Setup the endpoint options struct */ + if ((rc = psm2_ep_open_opts_get_defaults(&o)) != PSM2_OK) { + die("couldn't set default opts", rc); + } + printf("PSM2 opts_get_defaults done.\n"); + /* Attempt to open a PSM2 endpoint. This allocates hardware resources. */ + if ((rc = psm2_ep_open(uuid, &o, &myep, &myepid)) != PSM2_OK) { + die("couldn't psm2_ep_open()", rc); + } + printf("PSM2 endpoint open done.\n"); + + return 0; +} +int my_psm2_connect(int my_rank, int server_rank) { + int rc; + int is_server = (my_rank == server_rank) ? 1 : 0; + printf("%s: my_rank=%d,server_rank=%d\n", __FUNCTION__, my_rank, server_rank); fflush(stdout); + if (is_server) { + write_epid_to_file(my_rank, myepid); + } else { + server_epid = find_server(server_rank); + } + printf("%s: epid exchange done\n", __FUNCTION__); fflush(stdout); + if (is_server) { + /* Server does nothing here. A connection does not have to be + * * established to receive messages. */ + printf("PSM2 server up.\n"); + } else { + /* Setup connection request info */ + /* PSM2 can connect to a single epid per request, + * * or an arbitrary number of epids in a single connect call. + * * For this example, use part of an array of + * * connection requests. */ + memset(epid_array_mask, 0, sizeof(int) * CONNECT_ARRAY_SIZE); + epid_array[0] = server_epid; + epid_array_mask[0] = 1; + /* Begin the connection process. + * * note that if a requested epid is not responding, + * * the connect call will still return OK. + * * The errors array will contain the state of individual + * * connection requests. 
*/ + if ((rc = psm2_ep_connect(myep, + CONNECT_ARRAY_SIZE, + epid_array, + epid_array_mask, + epid_connect_errors, + epaddr_array, + 0 /* no timeout */ + )) != PSM2_OK) { + die("couldn't ep_connect", rc); + return -1; + } + printf("PSM2 connect request processed.\n"); + /* Now check if our connection to the server is ready */ + if (epid_connect_errors[0] != PSM2_OK) { + die("couldn't connect to server", epid_connect_errors[0]); + return -1; + } + printf("PSM2 client-server connection established.\n"); + } + return 0; +} +char msgbuf[BUFFER_LENGTH]; + +int my_psm2_sendrecv(int rank, int sender, int receiver) { + int is_server = (rank == receiver) ? 1 : 0; + int rc; + psm2_mq_t q; + psm2_mq_req_t req_mq; + //char msgbuf[BUFFER_LENGTH]; + + register long rsp asm ("rsp"); + printf("rsp=%lx.msgbuf=%p\n", rsp, msgbuf); fflush(stdout); + + memset(msgbuf, 0, BUFFER_LENGTH); + + /* Setup our PSM2 message queue */ + if ((rc = psm2_mq_init(myep, PSM2_MQ_ORDERMASK_NONE, NULL, 0, &q)) + != PSM2_OK) { + die("couldn't initialize PSM2 MQ", rc); + } + printf("PSM2 MQ init done.\n"); + if (is_server) { + psm2_mq_tag_t t = {0xABCD}; + psm2_mq_tag_t tm = {-1}; + /* Post the receive request */ + if ((rc = psm2_mq_irecv2(q, PSM2_MQ_ANY_ADDR, + &t, /* message tag */ + &tm, /* message tag mask */ + 0, /* no flags */ + msgbuf, BUFFER_LENGTH, + NULL, /* no context to add */ + &req_mq /* track irecv status */ + )) != PSM2_OK) { + die("couldn't post psm2_mq_irecv()", rc); + } + printf("PSM2 MQ irecv() posted\n"); + /* Wait until the message arrives */ + if ((rc = psm2_mq_wait(&req_mq, NULL)) != PSM2_OK) { + die("couldn't wait for the irecv", rc); + } + printf("PSM2 MQ wait() done.\n"); + printf("Message from client:\n"); + printf("%s", msgbuf); + + if (is_server) { + char fn[256]; + sprintf(fn, "psm2-demo-server-epid-%d", rank); + unlink(fn); + } + } else { + /* Say hello */ + snprintf(msgbuf, BUFFER_LENGTH, + "Hello world from epid=0x%lx, pid=%d.\n", + myepid, getpid()); + psm2_mq_tag_t t = 
{0xABCD}; + if ((rc = psm2_mq_send2(q, + epaddr_array[0], /* destination epaddr */ + PSM2_MQ_FLAG_SENDSYNC, /* no flags */ + &t, /* tag */ + msgbuf, BUFFER_LENGTH + )) != PSM2_OK) { + die("couldn't post psm2_mq_isend", rc); + } + printf("PSM2 MQ send() done.\n"); + } +/* Close down the MQ */ + if ((rc = psm2_mq_finalize(q)) != PSM2_OK) { + die("couldn't psm2_mq_finalize()", rc); + } + printf("PSM2 MQ finalized.\n"); +/* Close our ep, releasing all hardware resources. + * * Try to close all connections properly */ + if ((rc = psm2_ep_close(myep, PSM2_EP_CLOSE_GRACEFUL, + 0 /* no timeout */)) != PSM2_OK) { + die("couldn't psm2_ep_close()", rc); + } + printf("PSM2 ep closed.\n"); + /* Release all local PSM2 resources */ + if ((rc = psm2_finalize()) != PSM2_OK) { + die("couldn't psm2_finalize()", rc); + } + printf("PSM2 shut down, exiting.\n"); + return 0; +} + +static struct option options[] = { + { + .name = "ppn", + .has_arg = required_argument, + .flag = NULL, + .val = 'P', + }, + /* end */ + { NULL, 0, NULL, 0, }, +}; + +struct thr_arg { + pthread_barrier_t bar; + pthread_t pthread; + int rank; + int ppn; + int nproc; + int server_rank; +}; + +struct thr_arg thr_arg; + +void *progress_fn(void *arg) { + struct thr_arg *thr_arg = (struct thr_arg *)arg; + int rc; + int i; + + rc = syscall(732); + if (rc == -1) + fprintf(stdout, "CT09100 progress_fn running on Linux OK\n"); + else { + fprintf(stdout, "CT09100 progress_fn running on McKernel NG (%d)\n", rc); + } + + printf("progress,enter\n"); + + pthread_barrier_wait(&thr_arg->bar); + + pthread_barrier_wait(&thr_arg->bar); + + + printf("progress,exit\n"); + return NULL; +} + +int main(int argc, char **argv) { + int rc; + int actual; + int nproc; + int ppn = -1; + int my_rank = -1, size = -1; + int i, j; + struct timespec start, end; + long t_pure_l, t_overall_l; + long t_pure, t_overall; + int opt; + pthread_barrierattr_t barrierattr; + + fwq_init(); + + while ((opt = getopt_long(argc, argv, "+P:", options, NULL)) != 
-1) { + switch (opt) { + case 'P': + ppn = atoi(optarg); + break; + default: /* '?' */ + printf("unknown option %c\n", optopt); + exit(1); + } + } + + if (ppn == -1) { + printf("specify ppn with --ppn"); + exit(1); + } + + char *rank_str = getenv("PMI_RANK"); + if (!rank_str) { + printf("getenv failed\n"); + exit(1); + } + my_rank = atoi(rank_str); + printf("my_rank=%d\n", my_rank); fflush(stdout); + + nproc = 2; + + if (my_rank == 0) { + printf("tid=%d,pid=%d,nproc=%d\n", syscall(__NR_gettid), getpid(), nproc); + printf("nsec=%ld, nspw=%f\n", nsec, nspw); + } + + + /* Spawn a thread */ + thr_arg.rank = my_rank; + thr_arg.ppn = ppn; + thr_arg.nproc = nproc; + thr_arg.server_rank = ppn + (my_rank % ppn); + + pthread_barrierattr_init(&barrierattr); + pthread_barrier_init(&thr_arg.bar, &barrierattr, nproc); + + char *uti_str = getenv("DISABLE_UTI"); + int uti_val = uti_str ? atoi(uti_str) : 0; + if (!uti_val) { + rc = syscall(731, 1, NULL); + if (rc) { + fprintf(stdout, "CT09003 INFO: uti not available (rc=%d)\n", rc); + } else { + fprintf(stdout, "CT09003 INFO: uti available\n"); + } + } else { + fprintf(stdout, "CT09003 INFO: uti disabled\n"); + } + + rc = pthread_create(&thr_arg.pthread, NULL, progress_fn, &thr_arg); + if (rc){ + fprintf(stdout, "pthread_create: %d\n", rc); + exit(1); + } + + pthread_barrier_wait(&thr_arg.bar); + + my_psm2_init(thr_arg.rank, thr_arg.server_rank); + my_psm2_connect(thr_arg.rank, thr_arg.server_rank); + + for (i = 0; i < thr_arg.nproc; i++) { + if (!on_same_node(thr_arg.ppn, thr_arg.rank, i)) { + if (thr_arg.rank < i) { + my_psm2_sendrecv(thr_arg.rank, thr_arg.rank, i); + } else { + my_psm2_sendrecv(thr_arg.rank, i, thr_arg.rank); + } + } + } + + pthread_barrier_wait(&thr_arg.bar); + + pthread_join(thr_arg.pthread, NULL); + + fn_exit: + return 0; + fn_fail: + goto fn_exit; +} diff --git a/test/uti/mpi/011.c b/test/uti/mpi/011.c new file mode 100755 index 00000000..0cc48cb3 --- /dev/null +++ b/test/uti/mpi/011.c @@ -0,0 +1,220 @@ 
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For SYS_xxx definitions */ +#include + +//#define DEBUG +#ifdef DEBUG +#define dprintf printf +#else +#define dprintf {} +#endif + +#define SZENTRY_DEFAULT (65536) /* Size of one slot */ +#define NENTRY_DEFAULT 10000 /* Number of slots */ + +#define DIFFNSEC(end, start) ((end.tv_sec - start.tv_sec) * 1000000000UL + (end.tv_nsec - start.tv_nsec)) + +static int print_cpu_last_executed_on() { + char fn[256]; + char* result; + pid_t tid = syscall(SYS_gettid); + int fd; + int offset; + int mpi_errno = 0; + + sprintf(fn, "/proc/%d/task/%d/stat", getpid(), (int)tid); + //printf("fn=%s\n", fn); + fd = open(fn, O_RDONLY); + if(fd == -1) { + printf("open() failed\n"); + goto fn_fail; + } + + result = malloc(65536); + if(result == NULL) { + printf("malloc() failed"); + goto fn_fail; + } + + int amount = 0; + offset = 0; + while(1) { + amount = read(fd, result + offset, 65536); + // printf("amount=%d\n", amount); + if(amount == -1) { + printf("read() failed"); + goto fn_fail; + } + if(amount == 0) { + goto eof; + } + offset += amount; + } + eof:; + //printf("result:%s\n", result); + + char* next_delim = result; + char* field; + int i; + for(i = 0; i < 39; i++) { + field = strsep(&next_delim, " "); + } + + int cpu = sched_getcpu(); + if(cpu == -1) { + printf("getpu() failed\n"); + goto fn_fail; + } + + printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,tid=%d\n", atoi(getenv("PMI_RANK")), atoi(field), cpu, tid); fflush(stdout); + fn_exit: + free(result); + return mpi_errno; + fn_fail: + mpi_errno = -1; + goto fn_exit; +} + +void sendrecv(int rank, int nentry, char **sendv, char **recvv, int szentry, int src, int dest, MPI_Request* reqs, MPI_Status* status, double usec) { + int i; + if(rank == 1) { + for(i = 0; i < nentry; i++) { + MPI_Isend(sendv[i], szentry, MPI_CHAR, dest, 0, MPI_COMM_WORLD, &reqs[i]); + if (nentry > 10 && i % (nentry / 
10) == 0) { + printf("s"); fflush(stdout); + } + } + MPI_Waitall(nentry, reqs, status); + printf("w\n"); fflush(stdout); + } else { + for(i = 0; i < nentry; i++) { + MPI_Irecv(recvv[i], szentry, MPI_CHAR, src, 0, MPI_COMM_WORLD, &reqs[i]); + if (nentry > 10 && i % (nentry / 10) == 0) { + printf("r"); fflush(stdout); + } + } + usleep(usec); + MPI_Waitall(nentry, reqs, status); + printf("W\n"); fflush(stdout); + } +} + +int main(int argc, char **argv) { + int my_rank = -1, size = -1; + int i, j; + char **sendv, **recvv; + MPI_Status* status; + MPI_Request* reqs; + long szentry; + long nentry; + int src, dest; + struct timespec start, end; + double diffusec; + + if(argc == 3) { + szentry = atoi(argv[1]); + nentry = atoi(argv[2]); + } else { + szentry = SZENTRY_DEFAULT; + nentry = NENTRY_DEFAULT; + } + printf("szentry=%ld,nentry=%ld\n", szentry, nentry); + + status = (MPI_Status*)malloc(sizeof(MPI_Status) * nentry); + reqs = (MPI_Request*)malloc(sizeof(MPI_Request) * nentry); + + int actual; + + MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual); + printf("Thread support level is %d\n", actual); + + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + src = (size + my_rank - 1) % size; + dest = (my_rank + 1) % size; + + printf("rank=%d, size=%d, src=%d, dest=%d\n", my_rank, size, src, dest); + + sendv = malloc(sizeof(char *) * nentry); + if(!sendv) { printf("malloc failed"); goto fn_fail; } + for (i = 0; i < nentry; i++) { +#if 0 + int fd; + fd = open("./file", O_RDWR); + if(fd == -1) { printf("open failed\n"); goto fn_fail; } + sendv[i] = (char*)mmap(0, szentry, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); +#else + sendv[i] = (char*)mmap(0, szentry, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); +#endif + if(sendv[i] == MAP_FAILED) { printf("mmap failed"); goto fn_fail; } + dprintf("[%d] sendv[%d]=%p\n", my_rank, i, sendv[i]); + memset(sendv[i], 0xaa, szentry); + } + + recvv = malloc(sizeof(char *) * nentry); + 
if(!recvv) { printf("malloc failed"); goto fn_fail; } + for (i = 0; i < nentry; i++) { +#if 0 + int fd; + fd = open("./file", O_RDWR); + if(fd == -1) { printf("open failed\n"); goto fn_fail; } + recvv[i] = (char*)mmap(0, szentry, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0); +#else + recvv[i] = (char*)mmap(0, szentry, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); +#endif + if(recvv[i] == MAP_FAILED) { printf("mmap failed"); goto fn_fail; } + dprintf("[%d] recvv[%d]=%p\n", my_rank, i, recvv[i]); + memset(recvv[i], 0, szentry); + } + + printf("after memset\n"); + + print_cpu_last_executed_on(); + +#pragma omp parallel for + for (i = 0; i < omp_get_num_threads(); i++) { + printf("thread_num=%d,tid=%d\n", i, syscall(SYS_gettid)); + } + + for (i = 0; i < 1; i++) { + MPI_Barrier(MPI_COMM_WORLD); + if(my_rank == 0) { + clock_gettime(CLOCK_REALTIME, &start); + } + sendrecv(my_rank, nentry, sendv, recvv, szentry, src, dest, reqs, status, 0); + MPI_Barrier(MPI_COMM_WORLD); + if(my_rank == 0) { + clock_gettime(CLOCK_REALTIME, &end); + diffusec = DIFFNSEC(end, start) / (double)1000; + printf("%4.4f sec\n", DIFFNSEC(end, start) / (double)1000000000); fflush(stdout); + } + + MPI_Barrier(MPI_COMM_WORLD); + if(my_rank == 0) { + clock_gettime(CLOCK_REALTIME, &start); + } + sendrecv(my_rank, nentry, sendv, recvv, szentry, src, dest, reqs, status, diffusec); + MPI_Barrier(MPI_COMM_WORLD); + if(my_rank == 0) { + clock_gettime(CLOCK_REALTIME, &end); + printf("%4.4f sec\n", DIFFNSEC(end, start) / (double)1000000000); fflush(stdout); + } + } + + fn_exit: + MPI_Finalize(); + return 0; + fn_fail: + goto fn_exit; +} diff --git a/test/uti/mpi/012.c b/test/uti/mpi/012.c new file mode 100755 index 00000000..9510de5e --- /dev/null +++ b/test/uti/mpi/012.c @@ -0,0 +1,338 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For SYS_xxx definitions */ +#include + +//#define DEBUG +#ifdef DEBUG +#define 
dprintf printf +#else +#define dprintf {} +#endif + +#define DIFFNSEC(end, start) ((end.tv_sec - start.tv_sec) * 1000000000UL + (end.tv_nsec - start.tv_nsec)) + +#define BEGIN_EPOCH(win) do { MPI_Win_lock_all(0, win); } while(0) +#define END_EPOCH(win) do { MPI_Win_unlock_all(win); } while(0) +#define FLUSH(win) do { MPI_Win_flush_local_all(win); } while(0) + + +static inline void fixed_size_work() { + asm volatile( + "movq $0, %%rcx\n\t" + "1:\t" + "addq $1, %%rcx\n\t" + "cmpq $99, %%rcx\n\t" + "jle 1b\n\t" + : + : + : "rcx", "cc"); +} + +static inline void bulk_fsw(unsigned long n) { + int j; + for (j = 0; j < (n); j++) { + fixed_size_work(); + } +} + +double nspw; /* nsec per work */ +unsigned long nsec; + +void fwq_init() { + struct timespec start, end; + int i; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); +#define N_INIT 10000000 + bulk_fsw(N_INIT); + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + nsec = DIFFNSEC(end, start); + nspw = nsec / (double)N_INIT; +} + +#if 0 +void fwq(long delay_nsec) { + if (delay_nsec < 0) { + return; + //printf("%s: delay_nsec < 0\n", __FUNCTION__); + } + bulk_fsw(delay_nsec / nspw); +} +#else /* For machines with large core-to-core performance variation (e.g. 
OFP) */ +void fwq(long delay_nsec) { + struct timespec start, end; + + if (delay_nsec < 0) { return; } + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + + while (1) { + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + if (DIFFNSEC(end, start) >= delay_nsec) { + break; + } + bulk_fsw(2); /* ~150 ns per iteration on FOP */ + } +} +#endif + + +static int print_cpu_last_executed_on() { + char fn[256]; + char* result; + pid_t tid = syscall(SYS_gettid); + int fd; + int offset; + int mpi_errno = 0; + + sprintf(fn, "/proc/%d/task/%d/stat", getpid(), (int)tid); + //printf("fn=%s\n", fn); + fd = open(fn, O_RDONLY); + if(fd == -1) { + printf("open() failed\n"); + goto fn_fail; + } + + result = malloc(65536); + if(result == NULL) { + printf("malloc() failed"); + goto fn_fail; + } + + int amount = 0; + offset = 0; + while(1) { + amount = read(fd, result + offset, 65536); + // printf("amount=%d\n", amount); + if(amount == -1) { + printf("read() failed"); + goto fn_fail; + } + if(amount == 0) { + goto eof; + } + offset += amount; + } + eof:; + //printf("result:%s\n", result); + + char* next_delim = result; + char* field; + int i; + for(i = 0; i < 39; i++) { + field = strsep(&next_delim, " "); + } + + int cpu = sched_getcpu(); + if(cpu == -1) { + printf("getpu() failed\n"); + goto fn_fail; + } + + printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,tid=%d\n", atoi(getenv("PMI_RANK")), atoi(field), cpu, tid); fflush(stdout); + fn_exit: + free(result); + return mpi_errno; + fn_fail: + mpi_errno = -1; + goto fn_exit; +} + +static inline int on_same_node(int ppn, int me, int you) { + return (me / ppn == you / ppn); +} + +/* get_acc-calc-flush_local */ +void rma(int nproc, int ppn, int rank, double *wbuf, double *rbuf, double *result, int ndoubles, MPI_Win win, long calc_nsec) { + int i, j; + int r = 0, s = 0; + int req = 0; + for (i = 0; i < nproc; i++) { + if (!on_same_node(ppn, rank, i)) { + for (j = 0; j < ndoubles; j++) { + //printf("i=%d,j=%d,rbuf=%f,wbuf=%f\n", 
i, j, rbuf[i * ndoubles + j], wbuf[i * ndoubles + j]); + MPI_Get_accumulate(rbuf + i * ndoubles + j, 1, MPI_DOUBLE, + result + i * ndoubles + j, 1, MPI_DOUBLE, + i, i * ndoubles + j, 1, MPI_DOUBLE, + MPI_SUM, win); + } + } + } + fwq(calc_nsec); + FLUSH(win); +} + +static struct option options[] = { + { + .name = "ppn", + .has_arg = required_argument, + .flag = NULL, + .val = 'P', + }, + /* end */ + { NULL, 0, NULL, 0, }, +}; + +int main(int argc, char **argv) { + int rc; + int actual; + int ppn = -1; + int nproc; + int ndoubles = -1; + int my_rank = -1, size = -1; + int i, j; + double *wbuf, *rbuf, *result; + MPI_Win win; + struct timespec start, end; + long t_flush_l, t_pure_l, t_overall_l; + long t_flush, t_pure, t_overall; + int opt; + + fwq_init(); + + while ((opt = getopt_long(argc, argv, "+d:P:", options, NULL)) != -1) { + switch (opt) { + case 'd': + ndoubles = (1ULL << atoi(optarg)); + break; + case 'P': + ppn = atoi(optarg); + break; + default: /* '?' */ + printf("unknown option %c\n", optopt); + exit(1); + } + } + + if (ndoubles == -1 || ppn == -1) { + printf("specify ndoubles with -d and ppn with --ppn"); + exit(1); + } + + MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual); + if (actual != 3) { + printf("ERROR: MPI_THREAD_MULTIPLE not available (level was set to %d)\n", actual); + exit(1); + } + + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &nproc); + + if (my_rank == 0) { + printf("ndoubles=%d,nproc=%d\n", ndoubles, nproc); + printf("nsec=%ld, nspw=%f\n", nsec, nspw); + } + + /* accumulate-to buffer */ + wbuf = malloc(sizeof(double) * ndoubles * nproc); + if(!wbuf) { printf("malloc failed"); goto fn_fail; } + memset(wbuf, 0, sizeof(double) * ndoubles * nproc); + + /* read-from buffer */ + rbuf = malloc(sizeof(double) * ndoubles * nproc); + if(!rbuf) { printf("malloc failed"); goto fn_fail; } + memset(rbuf, 0, sizeof(double) * ndoubles * nproc); + + /* fetch-to buffer */ + result = malloc(sizeof(double) * ndoubles * 
nproc); + if(!result) { printf("malloc failed"); goto fn_fail; } + memset(result, 0, sizeof(double) * ndoubles * nproc); + + /* Expose accumulate-to buffer*/ + if (rc = MPI_Win_create(wbuf, sizeof(double) * ndoubles * nproc, sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &win)) { + printf("MPI_Win_create failed,rc=%d\n", rc); + } + + //print_cpu_last_executed_on(); + + for (i = 0; i < nproc; i++) { + for (j = 0; j < ndoubles; j++) { + wbuf[i * ndoubles + j] = (i + 1) * 1000 + (j + 1); + rbuf[i * ndoubles + j] = (i + 1) * 10000 + (j + 1); + result[i * ndoubles + j] = (i + 1) * 100000 + (j + 1); + } + } + +#if 0 + for (i = 0; i < nproc; i++) { + for (j = 0; j < ndoubles; j++) { + printf("wbuf,proc=%d,j=%d,val=%f\n", i, j, wbuf[i * ndoubles + j]); + printf("rbuf,proc=%d,j=%d,val=%f\n", i, j, rbuf[i * ndoubles + j]); + printf("result,proc=%d,j=%d,val=%f\n", i, j, result[i * ndoubles + j]); + } + } +#endif + /* Measure flush time */ + MPI_Barrier(MPI_COMM_WORLD); +#define NFENCE 10 + BEGIN_EPOCH(win); + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + for (i = 0; i < NFENCE; i++) { + FLUSH(win); + } + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + END_EPOCH(win); + t_flush_l = DIFFNSEC(end, start) / NFENCE; + //printf("t_flush (local): %ld usec\n", t_flush_l / 1000UL); + MPI_Allreduce(&t_flush_l, &t_flush, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD); + if (my_rank == 0) printf("t_flush (max): %ld usec\n", t_flush / 1000UL); + + /* Measure get_acc-flush time */ + MPI_Barrier(MPI_COMM_WORLD); +#define NPURE 10 + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + for (i = 0; i < NPURE; i++) { + BEGIN_EPOCH(win); + rma(nproc, ppn, my_rank, wbuf, rbuf, result, ndoubles, win, 0); + END_EPOCH(win); + } + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + t_pure_l = DIFFNSEC(end, start) / NPURE; + //printf("t_pure (local): %ld usec\n", t_pure_l / 1000UL); + MPI_Allreduce(&t_pure_l, &t_pure, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD); + if (my_rank == 0) printf("t_pure (max): %ld usec\n", 
t_pure / 1000UL); + +#if 0 + for (i = 0; i < nproc; i++) { + for (j = 0; j < ndoubles; j++) { + printf("wbuf,proc=%d,j=%d,val=%f\n", i, j, wbuf[i * ndoubles + j]); + printf("rbuf,proc=%d,j=%d,val=%f\n", i, j, rbuf[i * ndoubles + j]); + printf("result,proc=%d,j=%d,val=%f\n", i, j, result[i * ndoubles + j]); + } + } +#endif + + /* Measure get_acc-calc-flush time */ + MPI_Barrier(MPI_COMM_WORLD); +#define NOVERALL 10 + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + for (i = 0; i < NOVERALL; i++) { + BEGIN_EPOCH(win); + rma(nproc, ppn, my_rank, wbuf, rbuf, result, ndoubles, win, t_pure - t_flush); + END_EPOCH(win); + } + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + t_overall_l = DIFFNSEC(end, start) / NOVERALL; + //printf("t_overall (local): %ld usec\n", t_overall_l / 1000UL); + MPI_Allreduce(&t_overall_l, &t_overall, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD); + if (my_rank == 0) printf("t_overall (max): %ld usec\n", t_overall / 1000UL); + if (my_rank == 0) { + long t_abs = (t_pure * 2) - t_overall; + printf("overlap: %.2f %%\n", (t_abs * 100) / (double)t_pure); +} + + fn_exit: + MPI_Finalize(); + return 0; + fn_fail: + goto fn_exit; +} diff --git a/test/uti/mpi/012.sh b/test/uti/mpi/012.sh new file mode 100755 index 00000000..e5ff9bb6 --- /dev/null +++ b/test/uti/mpi/012.sh @@ -0,0 +1,174 @@ +#!/usr/bin/bash + +#!/usr/bin/bash -x + +MYHOME=/work/gg10/e29005 +UTI_MPI_TOP=${MYHOME}/project/os/mckernel/test/uti/mpi + +MCK=${MYHOME}/project/os/install +unset DISABLE_UTI + +stop=0 +reboot=0 +go=0 + +async=0 +mck=0 +nnodes=2 +LASTNODE=8200 +ndoubles=10 #12-15 +omp_num_threads=1 +ppn=1 #16 +async_progress_pin=64,132,200,268,65,133,201,269,66,134,202,270,67,135,203,271 +lpp=4 # logical-per-physical +ncpu_mt=256 # number of CPUs for main-thread + +while getopts srga:c:n:mdl:N:P:o: OPT +do + case ${OPT} in + s) stop=1 + ;; + r) reboot=1 + ;; + g) go=1 + ;; + a) async=$OPTARG + ;; + n) ndoubles=$OPTARG + ;; + m) mck=1 + ;; + d) export DISABLE_UTI=1 + ;; + N) nnodes=$OPTARG + 
;; + P) ppn=$OPTARG + ;; + o) omp_num_threads=$OPTARG + ;; + *) echo "invalid option -${OPT}" >&2 + exit 1 + esac +done + +nprocs=$((ppn * nnodes)) +nodes=`echo $(seq -s ",c" $(($LASTNODE + 1 - $nnodes)) $LASTNODE) | sed 's/^/c/'` +echo nprocs=$nprocs nnodes=$nnodes ppn=$ppn nodes=$nodes + +if [ ${mck} -eq 1 ]; then + mcexec="${mck_dir}/bin/mcexec" + nmcexecthr=$((omp_num_threads + 4)) + mcexecopt="--uti-thread-rank=$uti_thread_rank" + if [ ${use_hfi} -eq 1 ]; then + mcexecopt="--enable-hfi1 $mcexecopt" + fi + mcexecopt="-n $ppn -t $nmcexecthr $mcexecopt" +else + mcexec= + mcexecopt= +fi + +if [ ${mck} -eq 1 ]; then + i_mpi_pin=off + i_mpi_pin_domain= + i_mpi_pin_order= +else + # Let each domain have all logical cores and use KMP_AFFINITY=scatter if you want to use only physical cores + i_mpi_pin=on + if [ $((omp_num_threads * lpp * ppn)) -le $ncpu_mt ]; then + domain=$((omp_num_threads * lpp)) # Prefer physical but adjacent physicals share L1 + else + domain=$((ncpu_mt / ppn)) # Use logical as well + fi + i_mpi_pin_domain="export I_MPI_PIN_DOMAIN=$domain" + i_mpi_pin_order="export I_MPI_PIN_ORDER=compact" +fi + +if [ $async -eq 0 ] || [ "$async_progress_pin" == "" ] ; then + i_mpi_async_progress_pin= +else + i_mpi_async_progress_pin="export I_MPI_ASYNC_PROGRESS_PIN=$async_progress_pin" +fi + +if [ ${stop} -eq 1 ]; then + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes sudo mount /work + + if [ ${mck} -eq 1 ]; then + sudo ${MCK}/sbin/mcstop+release.sh + else + : + fi +fi + +if [ ${reboot} -eq 1 ]; then + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes sudo mount /work + + if [ ${mck} -eq 1 ]; then + if hostname | grep ofp &>/dev/null; then + sudo ${MCK}/sbin/mcreboot.sh -s -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 
2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1 + else + sudo ${MCK}/sbin/mcreboot.sh -s -c 1-15,65-79,129-143,193-207,17-31,81-95,145-159,209-223,33-47,97-111,161-175,225-239,49-63,113-127,177-191,241-255 -r 1-15:0+65-79:64+129-143:128+193-207:192+17-31:16+81-95:80+145-159:144+209-223:208+33-47:32+97-111:96+161-175:160+225-239:224+49-63:48+113-127:112+177-191:176+241-255:240 -m 12G@0,12G@1,12G@2,12G@3,3920M@4,3920M@5,3920M@6,3920M@7 + fi + else + : + fi +fi + +cd ${UTI_MPI_TOP} +( +cat < ./job.sh +chmod u+x ./job.sh + +if [ ${go} -eq 1 ]; then + cd ${UTI_MPI_TOP} + if [ $mck -eq 1 ]; then + make clean && make 012 + else + make clean && make CC=mpiicc 012 + fi + ./job.sh +fi + + + diff --git a/test/uti/mpi/013.c b/test/uti/mpi/013.c new file mode 100755 index 00000000..0f3bc2b1 --- /dev/null +++ b/test/uti/mpi/013.c @@ -0,0 +1,335 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For SYS_xxx definitions */ +#include + +//#define DEBUG +#ifdef DEBUG +#define dprintf printf +#else +#define dprintf {} +#endif + +#define DIFFNSEC(end, start) ((end.tv_sec - start.tv_sec) * 1000000000UL + (end.tv_nsec - start.tv_nsec)) + +#define BEGIN_EPOCH(win) do { MPI_Win_lock_all(0, win); } while(0) +#define END_EPOCH(win) do { MPI_Win_unlock_all(win); } while(0) + + +static inline void fixed_size_work() { + asm volatile( + "movq $0, %%rcx\n\t" + "1:\t" + "addq $1, %%rcx\n\t" + "cmpq $99, %%rcx\n\t" + "jle 1b\n\t" + : + : + 
: "rcx", "cc"); +} + +static inline void bulk_fsw(unsigned long n) { + int j; + for (j = 0; j < (n); j++) { + fixed_size_work(); + } +} + +double nspw; /* nsec per work */ +unsigned long nsec; + +void fwq_init() { + struct timespec start, end; + int i; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); +#define N_INIT 10000000 + bulk_fsw(N_INIT); + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + nsec = DIFFNSEC(end, start); + nspw = nsec / (double)N_INIT; +} + +#if 0 +void fwq(long delay_nsec) { + if (delay_nsec < 0) { + return; + //printf("%s: delay_nsec < 0\n", __FUNCTION__); + } + bulk_fsw(delay_nsec / nspw); +} +#else /* For machines with large core-to-core performance variation (e.g. OFP) */ +void fwq(long delay_nsec) { + struct timespec start, end; + + if (delay_nsec < 0) { return; } + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + + while (1) { + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + if (DIFFNSEC(end, start) >= delay_nsec) { + break; + } + bulk_fsw(2); /* ~150 ns per iteration on OFP */ + } +} +#endif + + +static int print_cpu_last_executed_on() { + char fn[256]; + char* result; + pid_t tid = syscall(SYS_gettid); + int fd; + int offset; + int mpi_errno = 0; + + sprintf(fn, "/proc/%d/task/%d/stat", getpid(), (int)tid); + //printf("fn=%s\n", fn); + fd = open(fn, O_RDONLY); + if(fd == -1) { + printf("open() failed\n"); + goto fn_fail; + } + + result = malloc(65536); + if(result == NULL) { + printf("malloc() failed"); + goto fn_fail; + } + + int amount = 0; + offset = 0; + while(1) { + amount = read(fd, result + offset, 65536); + // printf("amount=%d\n", amount); + if(amount == -1) { + printf("read() failed"); + goto fn_fail; + } + if(amount == 0) { + goto eof; + } + offset += amount; + } + eof:; + //printf("result:%s\n", result); + + char* next_delim = result; + char* field; + int i; + for(i = 0; i < 39; i++) { + field = strsep(&next_delim, " "); + } + + int cpu = sched_getcpu(); + if(cpu == -1) { + printf("getpu() failed\n"); + goto fn_fail; + } + 
+ printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,tid=%d\n", atoi(getenv("PMI_RANK")), atoi(field), cpu, tid); fflush(stdout); + fn_exit: + free(result); + return mpi_errno; + fn_fail: + mpi_errno = -1; + goto fn_exit; +} + +static inline int on_same_node(int ppn, int me, int you) { + return (me / ppn == you / ppn); +} + +/* get_acc-calc-flush_local */ +void rma(int nproc, int ppn, int rank, double *wbuf, double *rbuf, int ndoubles, MPI_Win win, long calc_nsec, int flush_only) { + int i, j; + int r = 0, s = 0; + int req = 0; + for (i = 0; i < nproc; i++) { + if (!on_same_node(ppn, rank, i)) { + for (j = 0; j < ndoubles; j++) { + //printf("i=%d,j=%d,rbuf=%f,wbuf=%f\n", i, j, rbuf[i * ndoubles + j], wbuf[i * ndoubles + j]); + if (!flush_only) { + MPI_Accumulate(rbuf + i * ndoubles + j, 1, MPI_DOUBLE, + i, i * ndoubles + j, 1, MPI_DOUBLE, + MPI_SUM, win); + } + MPI_Win_flush_local(i, win); + } + } + } + fwq(calc_nsec); +} + +static struct option options[] = { + { + .name = "ppn", + .has_arg = required_argument, + .flag = NULL, + .val = 'P', + }, + /* end */ + { NULL, 0, NULL, 0, }, +}; + +int main(int argc, char **argv) { + int rc; + int actual; + int ppn = -1; + int nproc; + int ndoubles = -1; + int my_rank = -1, size = -1; + int i, j; + double *wbuf, *rbuf; + MPI_Win win; + struct timespec start, end; + long t_flush_l, t_pure_l, t_overall_l; + long t_flush, t_pure, t_overall; + int opt; + + fwq_init(); + + while ((opt = getopt_long(argc, argv, "+d:P:", options, NULL)) != -1) { + switch (opt) { + case 'd': + ndoubles = (1ULL << atoi(optarg)); + break; + case 'P': + ppn = atoi(optarg); + break; + default: /* '?' 
*/ + printf("unknown option %c\n", optopt); + exit(1); + } + } + + if (ndoubles == -1 || ppn == -1) { + printf("specify ndoubles with -d and ppn with --ppn"); + exit(1); + } + + MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual); + if (actual != 3) { + printf("ERROR: MPI_THREAD_MULTIPLE not available (level was set to %d)\n", actual); + exit(1); + } + + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &nproc); + + if (my_rank == 0) { + printf("ndoubles=%d,nproc=%d\n", ndoubles, nproc); + printf("nsec=%ld, nspw=%f\n", nsec, nspw); + } + + /* accumulate-to buffer */ + wbuf = malloc(sizeof(double) * ndoubles * nproc); + if(!wbuf) { printf("malloc failed"); goto fn_fail; } + memset(wbuf, 0, sizeof(double) * ndoubles * nproc); + + /* read-from buffer */ + rbuf = malloc(sizeof(double) * ndoubles * nproc); + if(!rbuf) { printf("malloc failed"); goto fn_fail; } + memset(rbuf, 0, sizeof(double) * ndoubles * nproc); + + /* Expose accumulate-to buffer*/ + if (rc = MPI_Win_create(wbuf, sizeof(double) * ndoubles * nproc, sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &win)) { + printf("MPI_Win_create failed,rc=%d\n", rc); + } + + //print_cpu_last_executed_on(); + + for (i = 0; i < nproc; i++) { + for (j = 0; j < ndoubles; j++) { + wbuf[i * ndoubles + j] = (i + 1) * 1000 + (j + 1); + rbuf[i * ndoubles + j] = (i + 1) * 10000 + (j + 1); + } + } + +#if 0 + for (i = 0; i < nproc; i++) { + for (j = 0; j < ndoubles; j++) { + printf("wbuf,proc=%d,j=%d,val=%f\n", i, j, wbuf[i * ndoubles + j]); + printf("rbuf,proc=%d,j=%d,val=%f\n", i, j, rbuf[i * ndoubles + j]); + } + } +#endif + + /* Measure flush time */ + MPI_Barrier(MPI_COMM_WORLD); +#define NFENCE 10 + BEGIN_EPOCH(win); + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + for (i = 0; i < NFENCE; i++) { + rma(nproc, ppn, my_rank, wbuf, rbuf, ndoubles, win, 0, 1); + } + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + END_EPOCH(win); + t_flush_l = DIFFNSEC(end, start) / NFENCE; + //printf("t_flush 
(local): %ld usec\n", t_flush_l / 1000UL); + MPI_Allreduce(&t_flush_l, &t_flush, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD); + if (my_rank == 0) printf("t_flush (max): %ld usec\n", t_flush / 1000UL); + + /* Measure get_acc-flush time */ + MPI_Barrier(MPI_COMM_WORLD); +#define NPURE 10 + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + //MPI_Pcontrol(1, "rma"); + for (i = 0; i < NPURE; i++) { + BEGIN_EPOCH(win); + rma(nproc, ppn, my_rank, wbuf, rbuf, ndoubles, win, 0, 0); + END_EPOCH(win); + } + //MPI_Pcontrol(-1, "rma"); + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + t_pure_l = DIFFNSEC(end, start) / NPURE; + //printf("t_pure (local): %ld usec\n", t_pure_l / 1000UL); + MPI_Allreduce(&t_pure_l, &t_pure, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD); + if (my_rank == 0) printf("t_pure (max): %ld usec\n", t_pure / 1000UL); + +#if 0 + for (i = 0; i < nproc; i++) { + for (j = 0; j < ndoubles; j++) { + printf("wbuf,proc=%d,j=%d,val=%f\n", i, j, wbuf[i * ndoubles + j]); + printf("rbuf,proc=%d,j=%d,val=%f\n", i, j, rbuf[i * ndoubles + j]); + } + } +#endif + + /* Measure get_acc-calc-flush time */ + MPI_Barrier(MPI_COMM_WORLD); +#define NOVERALL 10 + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + //MPI_Pcontrol(1, "rma-calc"); + for (i = 0; i < NOVERALL; i++) { + BEGIN_EPOCH(win); + rma(nproc, ppn, my_rank, wbuf, rbuf, ndoubles, win, t_pure - t_flush, 0); + END_EPOCH(win); + } + //MPI_Pcontrol(-1, "rma-calc"); + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + t_overall_l = DIFFNSEC(end, start) / NOVERALL; + //printf("t_overall (local): %ld usec\n", t_overall_l / 1000UL); + MPI_Allreduce(&t_overall_l, &t_overall, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD); + if (my_rank == 0) printf("t_overall (max): %ld usec\n", t_overall / 1000UL); + if (my_rank == 0) { + long t_abs = (t_pure * 2) - t_overall; + printf("overlap: %.2f %%\n", (t_abs * 100) / (double)t_pure); +} + + fn_exit: + MPI_Finalize(); + return 0; + fn_fail: + goto fn_exit; +} diff --git a/test/uti/mpi/013.sh 
b/test/uti/mpi/013.sh new file mode 100755 index 00000000..56edfe86 --- /dev/null +++ b/test/uti/mpi/013.sh @@ -0,0 +1,176 @@ +#!/usr/bin/bash + +#!/usr/bin/bash -x + +MYHOME=/work/gg10/e29005 +UTI_MPI_TOP=${MYHOME}/project/os/mckernel/test/uti/mpi + +MCK=${MYHOME}/project/os/install +unset DISABLE_UTI + +stop=0 +reboot=0 +go=0 + +async=0 +mck=0 +nnodes=4 +LASTNODE=8200 +ndoubles=10 #12-15 +omp_num_threads=1 +ppn=16 #16 +async_progress_pin=64,132,200,268,65,133,201,269,66,134,202,270,67,135,203,271 +lpp=4 # logical-per-physical +ncpu_mt=256 # number of CPUs for main-thread +exe=`basename $0 | sed 's/\.sh$//'` + +while getopts srga:c:n:mdl:N:P:o: OPT +do + case ${OPT} in + s) stop=1 + ;; + r) reboot=1 + ;; + g) go=1 + ;; + a) async=$OPTARG + ;; + n) ndoubles=$OPTARG + ;; + m) mck=1 + ;; + d) export DISABLE_UTI=1 + ;; + N) nnodes=$OPTARG + ;; + P) ppn=$OPTARG + ;; + o) omp_num_threads=$OPTARG + ;; + *) echo "invalid option -${OPT}" >&2 + exit 1 + esac +done + +nprocs=$((ppn * nnodes)) +nodes=`echo $(seq -s ",c" $(($LASTNODE + 1 - $nnodes)) $LASTNODE) | sed 's/^/c/'` +echo nprocs=$nprocs nnodes=$nnodes ppn=$ppn nodes=$nodes + +if [ ${mck} -eq 1 ]; then + mcexec="${mck_dir}/bin/mcexec" + nmcexecthr=$((omp_num_threads + 4)) + mcexecopt="--uti-thread-rank=$uti_thread_rank" + if [ ${use_hfi} -eq 1 ]; then + mcexecopt="--enable-hfi1 $mcexecopt" + fi + mcexecopt="-n $ppn -t $nmcexecthr $mcexecopt" +else + mcexec= + mcexecopt= +fi + +if [ ${mck} -eq 1 ]; then + i_mpi_pin=off + i_mpi_pin_domain= + i_mpi_pin_order= +else + # Let each domain have all logical cores and use KMP_AFFINITY=scatter if you want to use only physical cores + i_mpi_pin=on + if [ $((omp_num_threads * lpp * ppn)) -le $ncpu_mt ]; then + domain=$((omp_num_threads * lpp)) # Prefer physical but adjacent physicals share L1 + else + domain=$((ncpu_mt / ppn)) # Use logical as well + fi + i_mpi_pin_domain="export I_MPI_PIN_DOMAIN=$domain" + i_mpi_pin_order="export I_MPI_PIN_ORDER=compact" +fi + +if [ $async -eq 0 
] || [ "$async_progress_pin" == "" ] ; then + i_mpi_async_progress_pin= +else + i_mpi_async_progress_pin="export I_MPI_ASYNC_PROGRESS_PIN=$async_progress_pin" +fi + +if [ ${stop} -eq 1 ]; then + + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes sudo mount /work + + if [ ${mck} -eq 1 ]; then + sudo ${MCK}/sbin/mcstop+release.sh + else + : + fi +fi + +if [ ${reboot} -eq 1 ]; then + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes sudo mount /work + + if [ ${mck} -eq 1 ]; then + if hostname | grep ofp &>/dev/null; then + sudo ${MCK}/sbin/mcreboot.sh -s -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1 + else + sudo ${MCK}/sbin/mcreboot.sh -s -c 1-15,65-79,129-143,193-207,17-31,81-95,145-159,209-223,33-47,97-111,161-175,225-239,49-63,113-127,177-191,241-255 -r 1-15:0+65-79:64+129-143:128+193-207:192+17-31:16+81-95:80+145-159:144+209-223:208+33-47:32+97-111:96+161-175:160+225-239:224+49-63:48+113-127:112+177-191:176+241-255:240 -m 12G@0,12G@1,12G@2,12G@3,3920M@4,3920M@5,3920M@6,3920M@7 + fi + else + : + fi +fi + +cd ${UTI_MPI_TOP} +( +cat < ./job.sh +chmod u+x ./job.sh + +if [ ${go} -eq 1 ]; then + cd ${UTI_MPI_TOP} + if [ $mck -eq 1 ]; then + make clean && make $exe + else + make clean && make CC=mpiicc $exe + fi + ./job.sh +fi + + + diff --git a/test/uti/mpi/014.c b/test/uti/mpi/014.c new file mode 100755 index 00000000..6fa95045 --- /dev/null +++ b/test/uti/mpi/014.c @@ 
-0,0 +1,242 @@ +#define _GNU_SOURCE /* See feature_test_macros(7) */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For SYS_xxx definitions */ +#include +#include +#include +#include +#include "async_progress.h" +#include "util.h" + +//#define DEBUG +#ifdef DEBUG +#define dprintf printf +#else +#define dprintf {} +#endif + +static struct option options[] = { + /* end */ + { NULL, 0, NULL, 0, }, +}; + +int main(int argc, char **argv) { + int rc; + int actual; + int nproc; + int nsamples = -1; + int my_rank = -1, size = -1; + int i, j, k, l, m; + double *wbuf, *rbuf, *result; + MPI_Win win; + long start, end; + long t_pure_l, t_pure, t_pure0 = 0; + int opt; + int szbuf = 8; + struct rusage ru_start, ru_end; + struct timeval tv_start, tv_end; + + fwq_init(); + + while ((opt = getopt_long(argc, argv, "+n:", options, NULL)) != -1) { + switch (opt) { + case 'n': + nsamples = atoi(optarg); + break; + default: /* '?' */ + printf("unknown option %c\n", optopt); + exit(1); + } + } + + if (nsamples == -1) { + printf("specify nsamples with -n"); + exit(1); + } + + MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual); + if (actual != 3) { + printf("ERROR: MPI_THREAD_MULTIPLE not available (level was set to %d)\n", actual); + exit(1); + } + + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &nproc); + + if (my_rank == 0) { + printf("nsamples=%d,nproc=%d\n", nsamples, nproc); + } + + /* accumulate-to buffer */ + wbuf = malloc(sizeof(double) * szbuf); + if(!wbuf) { printf("malloc failed"); goto fn_fail; } + memset(wbuf, 0, sizeof(double) * szbuf); + + /* read-from buffer */ + rbuf = malloc(sizeof(double) * szbuf); + if(!rbuf) { printf("malloc failed"); goto fn_fail; } + memset(rbuf, 0, sizeof(double) * szbuf); + + /* fetch-to buffer */ + result = malloc(sizeof(double) * szbuf); + if(!result) { printf("malloc failed"); goto fn_fail; } + memset(result, 0, sizeof(double) * szbuf); + + /* 
Expose accumulate-to buffer*/ + if (rc = MPI_Win_create(wbuf, sizeof(double) * szbuf, sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &win)) { + printf("MPI_Win_create failed,rc=%d\n", rc); + } + + for (j = 0; j < szbuf; j++) { + wbuf[j] = j + 1; + rbuf[j] = 10000 + j + 1; + result[j] = 100000 + j + 1; + } + +#if 0 + for (j = 0; j < szbuf; j++) { + printf("wbuf,j=%d,val=%f\n", j, wbuf[j]); + printf("rbuf,j=%d,val=%f\n", j, rbuf[j]); + printf("result,j=%d,val=%f\n", j, result[j]); + } + } +#endif + + for (k = 0; k < 2; k++) { + + if (k == 1) { + + print_cpu_last_executed_on("main"); + + INIT_ASYNC_THREAD_(); + + if ((rc = getrusage(RUSAGE_THREAD, &ru_start))) { + printf("%s: ERROR: getrusage failed (%d)\n", __FUNCTION__, rc); + } + + if ((rc = gettimeofday(&tv_start, NULL))) { + printf("%s: ERROR: gettimeofday failed (%d)\n", __FUNCTION__, rc); + } + + syscall(701, 1 | 2 | 0x80000000); + } + + for (m = 0; m < 3; m++) { + + for (l = 0; l <= 10; l++) { + long calc_cyc = /*(k == 1 && l == 0) ? 
(double)t_pure0 * 0.1 :*/ t_pure0 / 10 * l; + + MPI_Barrier(MPI_COMM_WORLD); + MPI_Win_lock_all(0, win); + //clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + + start = rdtsc_light(); + for (j = 0; j < nsamples; j++) { + for (i = 0; i < nproc; i++) { + int target = j % nproc; + if (target == my_rank) { + continue; + } +#if 0 + MPI_Get_accumulate(rbuf + j % szbuf, 1, MPI_DOUBLE, + result + j % szbuf, 1, MPI_DOUBLE, + i, + j % szbuf, 1, MPI_DOUBLE, + MPI_SUM, win); +#endif +#if 1 + MPI_Get_accumulate(rbuf, szbuf, MPI_DOUBLE, + result, szbuf, MPI_DOUBLE, + i, + 0, szbuf, MPI_DOUBLE, + MPI_SUM, win); +#endif +#if 0 + MPI_Accumulate(rbuf, szbuf, MPI_DOUBLE, + i, + 0, szbuf, MPI_DOUBLE, + MPI_SUM, win); +#endif +#if 0 + MPI_Get(rbuf + j % szbuf, 1, MPI_DOUBLE, + i, + j % szbuf, 1, MPI_DOUBLE, + win); +#endif + } + } + fwq(calc_cyc * nsamples); + MPI_Win_flush_local_all(win); + end = rdtsc_light(); + + //clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + MPI_Win_unlock_all(win); + MPI_Barrier(MPI_COMM_WORLD); + t_pure_l = (end - start) / nsamples; + //t_pure_l = DIFFNSEC(end, start) / nsamples; + + if (1||m == 2) { + MPI_Allreduce(&t_pure_l, &t_pure, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD); + if (my_rank == 0) { + if (l == 0) { + printf("async: %d, trial: %d\n", k, m); + } + if (k == 0) { + printf("%ld\t%ld\n", calc_cyc, t_pure); + } else { + printf("%ld\n", t_pure); + } + } + } + + if (k == 0 && l == 0) { + t_pure0 = t_pure; + } +#if 0 + for (i = 0; i < nproc; i++) { + for (j = 0; j < sbuf; j++) { + printf("wbuf,j=%d,val=%f\n", j, wbuf[j]); + printf("rbuf,j=%d,val=%f\n", j, rbuf[j]); + printf("result,j=%d,val=%f\n", j, result[j]); + } + } +#endif + } + } + + if (k == 1) { + FINALIZE_ASYNC_THREAD_(); + +#if 0 + if ((rc = getrusage(RUSAGE_THREAD, &ru_end))) { + printf("%s: ERROR: getrusage failed (%d)\n", __FUNCTION__, rc); + } + + if ((rc = gettimeofday(&tv_end, NULL))) { + printf("%s: ERROR: gettimeofday failed (%d)\n", __FUNCTION__, rc); + } + + printf("%s: wall: %ld, user: 
%ld, sys: %ld\n", __FUNCTION__, + DIFFUSEC(tv_end, tv_start), + DIFFUSEC(ru_end.ru_utime, ru_start.ru_utime), + DIFFUSEC(ru_end.ru_stime, ru_start.ru_stime)); + syscall(701, 4 | 8 | 0x80000000); +#endif + } + } + + fn_exit: + MPI_Finalize(); + return 0; + fn_fail: + goto fn_exit; +} diff --git a/test/uti/mpi/014.sh b/test/uti/mpi/014.sh new file mode 100755 index 00000000..371e3e21 --- /dev/null +++ b/test/uti/mpi/014.sh @@ -0,0 +1,191 @@ +#!/usr/bin/bash + +#!/usr/bin/bash -x + +MYHOME=/work/gg10/e29005 +UTI_MPI_TOP=${MYHOME}/project/os/mckernel/test/uti/mpi + +mck_dir=${MYHOME}/project/os/install + +exe=`basename $0 | sed 's/\.sh//'` + +stop=0 +reboot=0 +go=0 + +async=0 +mck=0 +nnodes=2 +LASTNODE=8200 +nsamples=100 #2^12-15 +use_hfi=0 +omp_num_threads=1 +ppn=4 +lpp=4 # logical-per-physical +ncpu_mt=256 # number of CPUs for main-thread +myasync=1 +async_in_mck=0 + +while getopts srga:c:n:ml:N:P:ho:A:M: OPT +do + case ${OPT} in + s) stop=1 + ;; + r) reboot=1 + ;; + g) go=1 + ;; + a) async=$OPTARG + ;; + n) nsamples=$OPTARG + ;; + m) mck=1 + ;; + N) nnodes=$OPTARG + ;; + P) ppn=$OPTARG + ;; + h) use_hfi=1 + ;; + o) omp_num_threads=$OPTARG + ;; + A) myasync=$OPTARG + ;; + M) async_in_mck=$OPTARG + ;; + *) echo "invalid option -${OPT}" >&2 + exit 1 + esac +done + +nprocs=$((ppn * nnodes)) +nodes=`echo $(seq -s ",c" $(($LASTNODE + 1 - $nnodes)) $LASTNODE) | sed 's/^/c/'` +echo nprocs=$nprocs nnodes=$nnodes ppn=$ppn nodes=$nodes + +PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes bash -c \'if \[ \"\`cat /etc/mtab \| while read line\; do cut -d\" \" -f 2\; done \| grep /work\`\" == \"\" \]\; then sudo mount /work\; fi\' + +if [ $mck -eq 0 ] || [ $async_in_mck -eq 1 ]; then + export DISABLE_UTI=1 +else + unset DISABLE_UTI +fi + +if [ $mck -eq 0 ]; then + async_progress_pin=64,65,66,67,132,133,134,135,200,201,202,203,268,269,270,271 +else + async_progress_pin=`(for ((i=0;i/dev/null; then + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \ + sudo 
${mck_dir}/sbin/mcreboot.sh -s -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1 + else + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \ + sudo ${mck_dir}/sbin/mcreboot.sh -s -c 1-15,65-79,129-143,193-207,17-31,81-95,145-159,209-223,33-47,97-111,161-175,225-239,49-63,113-127,177-191,241-255 -r 1-15:0+65-79:64+129-143:128+193-207:192+17-31:16+81-95:80+145-159:144+209-223:208+33-47:32+97-111:96+161-175:160+225-239:224+49-63:48+113-127:112+177-191:176+241-255:240 -m 12G@0,12G@1,12G@2,12G@3,3920M@4,3920M@5,3920M@6,3920M@7 + fi + else + : + fi +fi + +cd ${UTI_MPI_TOP} +( +cat < ./job.sh +chmod u+x ./job.sh + +if [ ${go} -eq 1 ]; then + . /home/opt/local/cores/intel/compilers_and_libraries_2018.1.163/linux/bin/compilervars.sh intel64 + cd ${UTI_MPI_TOP} + make ./$exe + bash -c '. 
/home/opt/local/cores/intel/compilers_and_libraries_2018.1.163/linux/bin/compilervars.sh intel64; ./job.sh' +fi diff --git a/test/uti/mpi/015.c b/test/uti/mpi/015.c new file mode 100755 index 00000000..cde43202 --- /dev/null +++ b/test/uti/mpi/015.c @@ -0,0 +1,346 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For SYS_xxx definitions */ +#include +#include "async_progress.h" + +//#define DEBUG +#ifdef DEBUG +#define dprintf printf +#else +#define dprintf {} +#endif + +#define DIFFNSEC(end, start) ((end.tv_sec - start.tv_sec) * 1000000000UL + (end.tv_nsec - start.tv_nsec)) + +#define BEGIN_EPOCH(win) do { MPI_Win_lock_all(0, win); } while(0) +#define END_EPOCH(win) do { MPI_Win_unlock_all(win); } while(0) + +static inline uint64_t rdtsc_light(void ) +{ + uint64_t x; + __asm__ __volatile__("rdtscp;" /* rdtscp don't jump over earlier instructions */ + "shl $32, %%rdx;" + "or %%rdx, %%rax" : + "=a"(x) : + : + "%rcx", "%rdx", "memory"); + return x; +} + +static inline void fixed_size_work() { + asm volatile( + "movq $0, %%rcx\n\t" + "1:\t" + "addq $1, %%rcx\n\t" + "cmpq $99, %%rcx\n\t" + "jle 1b\n\t" + : + : + : "rcx", "cc"); +} + +static inline void bulk_fsw(unsigned long n) { + int j; + for (j = 0; j < (n); j++) { + fixed_size_work(); + } +} + +long cyc, cycpw; /* cycles per work */ + +void fwq_init() { + long start, end; + int i; + start = rdtsc_light(); +#define N_INIT 10000000 + bulk_fsw(N_INIT); + end = rdtsc_light(); + cyc = end - start; + cycpw = cyc / (double)N_INIT; +} + +#if 0 +void fwq(long delay_cyc) { + if (delay_cyc < 0) { + return; + //printf("%s: delay_cyc < 0\n", __FUNCTION__); + } + bulk_fsw(delay_cyc / cycpw); +} +#else /* For machines with large core-to-core performance variation (e.g. 
OFP) */ +void fwq(long delay_cyc) { + long start, end; + + if (delay_cyc < 0) { return; } + start = rdtsc_light(); + + while (1) { + end = rdtsc_light(); + if (end - start >= delay_cyc) { + break; + } + bulk_fsw(2); /* ~150 ns per iteration on FOP */ + } +} +#endif + + +static int print_cpu_last_executed_on() { + char fn[256]; + char* result; + pid_t tid = syscall(SYS_gettid); + int fd; + int offset; + int mpi_errno = 0; + + sprintf(fn, "/proc/%d/task/%d/stat", getpid(), (int)tid); + //printf("fn=%s\n", fn); + fd = open(fn, O_RDONLY); + if(fd == -1) { + printf("open() failed\n"); + goto fn_fail; + } + + result = malloc(65536); + if(result == NULL) { + printf("malloc() failed"); + goto fn_fail; + } + + int amount = 0; + offset = 0; + while(1) { + amount = read(fd, result + offset, 65536); + // printf("amount=%d\n", amount); + if(amount == -1) { + printf("read() failed"); + goto fn_fail; + } + if(amount == 0) { + goto eof; + } + offset += amount; + } + eof:; + //printf("result:%s\n", result); + + char* next_delim = result; + char* field; + int i; + for(i = 0; i < 39; i++) { + field = strsep(&next_delim, " "); + } + + int cpu = sched_getcpu(); + if(cpu == -1) { + printf("getpu() failed\n"); + goto fn_fail; + } + + printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,tid=%d\n", atoi(getenv("PMI_RANK")), atoi(field), cpu, tid); fflush(stdout); + fn_exit: + free(result); + return mpi_errno; + fn_fail: + mpi_errno = -1; + goto fn_exit; +} + +/* ga_acc per rank:ga_sync=40:1 */ +void rma(int nproc, int my_rank, double *wbuf, double *rbuf, int ndoubles, MPI_Win win, long calc_nsec) { + int i, j; + int r = 0, s = 0; + int req = 0; + for (i = 0; i < nproc; i++) { + if (i != my_rank) { + for (j = 0; j < ndoubles; j++) { + MPI_Accumulate(rbuf + i * ndoubles + j, 1, MPI_DOUBLE, + i, i * ndoubles + j, 1, MPI_DOUBLE, + MPI_SUM, win); + MPI_Win_flush_local(i, win); /* ga_acc() calls flush_local() immediately */ + } + } + } + fwq(calc_nsec); +} + +static struct option 
options[] = { + { + .name = "ppn", + .has_arg = required_argument, + .flag = NULL, + .val = 'P', + }, + /* end */ + { NULL, 0, NULL, 0, }, +}; + +int main(int argc, char **argv) { + int rc; + int actual; + int ppn = -1; + int nproc; + int ndoubles = -1; + double add_rate = 1.0; + int my_rank = -1, size = -1; + int i, j, k, l; + double *wbuf, *rbuf, *result; + MPI_Win win; + long start, end; + //struct timespec start, end; + long t_pure_l, t_overall_l; + long t_pure, t_overall; + int opt; + + fwq_init(); + + while ((opt = getopt_long(argc, argv, "+d:P:R:", options, NULL)) != -1) { + switch (opt) { + case 'd': + ndoubles = atoi(optarg); + break; + case 'P': + ppn = atoi(optarg); + break; + case 'R': + add_rate = atof(optarg); + break; + default: /* '?' */ + printf("unknown option %c\n", optopt); + exit(1); + } + } + + if (ndoubles == -1 || ppn == -1) { + printf("specify ndoubles with -d and ppn with --ppn"); + exit(1); + } + + MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual); + if (actual != 3) { + printf("ERROR: MPI_THREAD_MULTIPLE not available (level was set to %d)\n", actual); + exit(1); + } + + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &nproc); + + if (my_rank == 0) { + printf("ndoubles=%d,nproc=%d,add_rate=%f\n", ndoubles, nproc, add_rate); + printf("cyc=%ld, cycpw=%ld\n", cyc, cycpw); + } + + /* accumulate-to buffer */ + wbuf = malloc(sizeof(double) * ndoubles * nproc); + if(!wbuf) { printf("malloc failed"); goto fn_fail; } + memset(wbuf, 0, sizeof(double) * ndoubles * nproc); + + /* read-from buffer */ + rbuf = malloc(sizeof(double) * ndoubles * nproc); + if(!rbuf) { printf("malloc failed"); goto fn_fail; } + memset(rbuf, 0, sizeof(double) * ndoubles * nproc); + + /* Expose accumulate-to buffer*/ + if (rc = MPI_Win_create(wbuf, sizeof(double) * ndoubles * nproc, sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &win)) { + printf("MPI_Win_create failed,rc=%d\n", rc); + } + + //print_cpu_last_executed_on(); + + for (i = 0; 
i < nproc; i++) { + for (j = 0; j < ndoubles; j++) { + wbuf[i * ndoubles + j] = (i + 1) * 1000 + (j + 1); + rbuf[i * ndoubles + j] = (i + 1) * 10000 + (j + 1); + } + } + +#if 0 + for (i = 0; i < nproc; i++) { + for (j = 0; j < ndoubles; j++) { + printf("wbuf,proc=%d,j=%d,val=%f\n", i, j, wbuf[i * ndoubles + j]); + printf("rbuf,proc=%d,j=%d,val=%f\n", i, j, rbuf[i * ndoubles + j]); + } + } +#endif + + for (k = 0; k < 2; k++) { + if (k == 1) { + INIT_ASYNC_THREAD_(); + } + + /* Measure get_acc-flush time */ + MPI_Barrier(MPI_COMM_WORLD); +#define NPURE 10 + //clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); + start = rdtsc_light(); + MPI_Pcontrol(1, "rma"); + syscall(701, 1); + syscall(701, 2); + for (i = 0; i < NPURE; i++) { + BEGIN_EPOCH(win); + rma(nproc, my_rank, wbuf, rbuf, ndoubles, win, 0); + END_EPOCH(win); + } + MPI_Pcontrol(-1, "rma"); + syscall(701, 4); + syscall(701, 8); + end = rdtsc_light(); + //clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end); + MPI_Barrier(MPI_COMM_WORLD); + t_pure_l = (end - start) / NPURE; + //t_pure_l = DIFFNSEC(end, start) / NPURE; + //printf("t_pure (local): %ld usec\n", t_pure_l / 1000UL); + MPI_Allreduce(&t_pure_l, &t_pure, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD); + if (my_rank == 0) printf("t_pure (max): %ld cycles\n", t_pure); + + +#if 1 + for (l = 1; l <= 10; l++) { + MPI_Barrier(MPI_COMM_WORLD); +#define NOVERALL 10 + start = rdtsc_light(); + for (i = 0; i < NOVERALL; i++) { + BEGIN_EPOCH(win); + rma(nproc, my_rank, wbuf, rbuf, ndoubles, win, 100UL * 1000000 * l); + END_EPOCH(win); + } + end = rdtsc_light(); + MPI_Barrier(MPI_COMM_WORLD); + t_overall_l = (end - start) / NOVERALL; + MPI_Allreduce(&t_overall_l, &t_overall, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD); + if (my_rank == 0) printf("t_overall (max): %ld cycle\n", t_overall); + } +#endif + + if (k == 1) { + FINALIZE_ASYNC_THREAD_(); + } + +#if 0 + for (i = 0; i < nproc; i++) { + for (j = 0; j < ndoubles; j++) { + printf("wbuf,proc=%d,j=%d,val=%f\n", i, j, wbuf[i * ndoubles + 
j]); + printf("rbuf,proc=%d,j=%d,val=%f\n", i, j, rbuf[i * ndoubles + j]); + printf("result,proc=%d,j=%d,val=%f\n", i, j, result[i * ndoubles + j]); + } + } +#endif + } + + fn_exit: + MPI_Finalize(); + return 0; + fn_fail: + goto fn_exit; +} diff --git a/test/uti/mpi/015.sh b/test/uti/mpi/015.sh new file mode 100755 index 00000000..719cd6ba --- /dev/null +++ b/test/uti/mpi/015.sh @@ -0,0 +1,189 @@ +#!/usr/bin/bash + +#!/usr/bin/bash -x + +MYHOME=/work/gg10/e29005 +UTI_MPI_TOP=${MYHOME}/project/os/mckernel/test/uti/mpi + +mck_dir=${MYHOME}/project/os/install + +exe=`basename $0 | sed 's/\.sh//'` + +stop=0 +reboot=0 +go=0 + +async=0 +mck=0 +nnodes=2 +LASTNODE=8200 +ndoubles=16 #2^12-15 +add_rate="1.0" +disable_uti=0 +omp_num_threads=1 +ppn=16 #16 +async_progress_pin=64,132,200,268,65,133,201,269,66,134,202,270,67,135,203,271 +lpp=4 # logical-per-physical +ncpu_mt=256 # number of CPUs for main-thread +myasync=1 +use_hfi=0 + +while getopts srga:c:n:md:l:N:P:o:A:R: OPT +do + case ${OPT} in + s) stop=1 + ;; + r) reboot=1 + ;; + g) go=1 + ;; + a) async=$OPTARG + ;; + n) ndoubles=$OPTARG + ;; + m) mck=1 + ;; + d) disable_uti=$OPTARG + ;; + N) nnodes=$OPTARG + ;; + P) ppn=$OPTARG + ;; + o) omp_num_threads=$OPTARG + ;; + A) myasync=$OPTARG + ;; + R) add_rate=$OPTARG + ;; + *) echo "invalid option -${OPT}" >&2 + exit 1 + esac +done + +nprocs=$((ppn * nnodes)) +nodes=`echo $(seq -s ",c" $(($LASTNODE + 1 - $nnodes)) $LASTNODE) | sed 's/^/c/'` +echo nprocs=$nprocs nnodes=$nnodes ppn=$ppn nodes=$nodes + +PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes bash -c \'if \[ \"\`cat /etc/mtab \| while read line\; do cut -d\" \" -f 2\; done \| grep /work\`\" == \"\" \]\; then sudo mount /work\; fi\' + +if [ $disable_uti -eq 1 ]; then + export DISABLE_UTI=1 +else + unset DISABLE_UTI +fi + +if [ ${mck} -eq 1 ]; then + mcexec="${mck_dir}/bin/mcexec" + nmcexecthr=$((omp_num_threads + 4)) + mcexecopt="--uti-thread-rank=$uti_thread_rank" + if [ ${use_hfi} -eq 1 ]; then + 
mcexecopt="--enable-hfi1 $mcexecopt" + fi + mcexecopt="-n $ppn -t $nmcexecthr $mcexecopt" +else + mcexec= + mcexecopt= +fi + +if [ ${mck} -eq 1 ]; then + i_mpi_pin=off + i_mpi_pin_domain= + i_mpi_pin_order= +else + # Let each domain have all logical cores and use KMP_AFFINITY=scatter if you want to use only physical cores + i_mpi_pin=on + if [ $((omp_num_threads * lpp * ppn)) -le $ncpu_mt ]; then + domain=$((omp_num_threads * lpp)) # Prefer physical but adjacent physicals share L1 + else + domain=$((ncpu_mt / ppn)) # Use logical as well + fi + i_mpi_pin_domain="export I_MPI_PIN_DOMAIN=$domain" + i_mpi_pin_order="export I_MPI_PIN_ORDER=compact" +fi + +if [[ ($async -eq 1 && "$async_progress_pin" != "" ) || $myasync -eq 1 ]]; then + i_mpi_async_progress_pin="export I_MPI_ASYNC_PROGRESS_PIN=$async_progress_pin" +else + i_mpi_async_progress_pin= +fi + +if [ ${stop} -eq 1 ]; then + if [ ${mck} -eq 1 ]; then + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes sudo ${mck_dir}/sbin/mcstop+release.sh + else + : + fi +fi + +if [ ${reboot} -eq 1 ]; then + if [ ${mck} -eq 1 ]; then + if hostname | grep ofp &>/dev/null; then + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes sudo ${mck_dir}/sbin/mcreboot.sh -s -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1 + else + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes sudo ${mck_dir}/sbin/mcreboot.sh -s -c 
1-15,65-79,129-143,193-207,17-31,81-95,145-159,209-223,33-47,97-111,161-175,225-239,49-63,113-127,177-191,241-255 -r 1-15:0+65-79:64+129-143:128+193-207:192+17-31:16+81-95:80+145-159:144+209-223:208+33-47:32+97-111:96+161-175:160+225-239:224+49-63:48+113-127:112+177-191:176+241-255:240 -m 12G@0,12G@1,12G@2,12G@3,3920M@4,3920M@5,3920M@6,3920M@7 + fi + else + : + fi +fi + +cd ${UTI_MPI_TOP} +( +cat < ./job.sh +chmod u+x ./job.sh + +if [ ${go} -eq 1 ]; then + cd ${UTI_MPI_TOP} + if [ $mck -eq 1 ]; then + make $exe + else + . /home/opt/local/cores/intel/compilers_and_libraries_2018.1.163/linux/bin/compilervars.sh intel64 + make CC=mpiicc $exe + fi + ./job.sh +fi + + + diff --git a/test/uti/mpi/016.c b/test/uti/mpi/016.c new file mode 100755 index 00000000..fc83c198 --- /dev/null +++ b/test/uti/mpi/016.c @@ -0,0 +1,349 @@ +#define _GNU_SOURCE /* See feature_test_macros(7) */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For SYS_xxx definitions */ +#include +#include +#include +#include +#include "async_progress.h" +#include "util.h" + +#define MYTIME_UNIT "usec" +#define MYTIME_TOUSEC 1000000 +#define MYTIME_TONSEC 1000000000 + +#define NROW 16 /* 0%, 10%, ..., 140% */ +#define NCOL 4 + +#define NSAMPLES_DROP 5/*10*/ +#define NSAMPLES_COMM 10/*20*/ +#define NSAMPLES_TOTAL 10/*20*/ +#define NSAMPLES_INNER 5 + +#define PROGRESS_CALC_PHASE_ONLY + +static inline double mytime() { + return /*rdtsc_light()*/MPI_Wtime(); +} + +static int ppn = -1; + +void init_buf(double *origin_buf, double *result, double *target_buf, int szbuf, int rank, int id) { + int j; + for (j = 0; j < szbuf; j++) { + origin_buf[j] = (rank + 1) * 100.0 + (j + 1); + result[j] = (id + 1) * 100000000.0 + (rank + 1) * 10000.0 + (j + 1); + target_buf[j] = (rank + 1) * 1000000.0 + (j + 1); + } +} + +void pr_buf(double *origin_buf, double *result, double *target_buf, int szbuf, int rank, int nproc) { + int i, j; + for (i = 0; i < nproc; i++) { 
+ MPI_Barrier(MPI_COMM_WORLD); + + if (i != rank) { + usleep(100000); + continue; + } + + for (j = 0; j < szbuf; j++) { + pr_debug("[%d] origin_buf,j=%d,val=%f\n", rank, j, origin_buf[j]); + pr_debug("[%d] result,j=%d,val=%f\n", rank, j, result[j]); + pr_debug("[%d] target_buf,j=%d,val=%f\n", rank, j, target_buf[j]); + } + } +} + +void rma(int rank, int nproc, MPI_Win win, double *origin_buf, double *result, int szbuf, long nsec_calc, int async_progress, int sync_progress, double pct_calc) { + int i, j, target_rank; + int completed, ret; + + for (j = 0; j < NSAMPLES_INNER; j++) { + for (i = 1; i < nproc; i++) { + target_rank = (rank + i) % nproc; + + MPI_Get_accumulate(origin_buf, szbuf, MPI_DOUBLE, + result, szbuf, MPI_DOUBLE, + target_rank, + 0, szbuf, MPI_DOUBLE, + MPI_NO_OP, win); +#if 0 + if (sync_progress) { + if ((ret = MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &completed, MPI_STATUS_IGNORE)) != MPI_SUCCESS) { + pr_err("%s: error: MPI_Iprobe: %d\n", __func__, ret); + } + } +#endif + } + } + + if (async_progress) { +#ifdef PROGRESS_CALC_PHASE_ONLY + progress_start(); +#endif + } + + ndelay(nsec_calc); + + if (async_progress) { +#ifdef PROGRESS_CALC_PHASE_ONLY + progress_stop(); +#endif + } + +#define MAX2(x,y) ((x) > (y) ? (x) : (y)) + +#if 1 + /* iprobe is 10 times faster than win_flush_local_all, + 20679 usec / (8*63*5) messages for 8-ppn 8-node case */ + if (1/*!sync_progress*/) + for (j = 0; j < (async_progress ? 
MAX2(NSAMPLES_INNER * (nproc - 1) * (1.0 - pct_calc), nproc - 1) : NSAMPLES_INNER * (nproc - 1)); j++) { + //for (j = 0; j < MAX2(NSAMPLES_INNER * (nproc - 1) * (1.0 - pct_calc), nproc - 1); j++) { + if ((ret = MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &completed, MPI_STATUS_IGNORE)) != MPI_SUCCESS) { + pr_err("%s: error: MPI_Iprobe: %d\n", __func__, ret); + } + } +#endif + + MPI_Win_flush_local_all(win); +} + +double measure(int rank, int nproc, MPI_Win win, double *origin_buf, double* result, double *target_buf, int szbuf, long nsec_calc, int async_progress, int sync_progress, int nsamples, int nsamples_drop, double pct_calc) { + int i; + double t_l, t_g, t_sum = 0; + double start, end; + + for (i = 0; i < nsamples + nsamples_drop; i++) { + MPI_Barrier(MPI_COMM_WORLD); + MPI_Win_lock_all(0, win); + + /* Set parameter based on current IPC and frequency */ + ndelay_init(0); + + start = mytime(); + rma(rank, nproc, win, origin_buf, result, szbuf, nsec_calc, async_progress, sync_progress, pct_calc); + end = mytime(); + + MPI_Win_unlock_all(win); + MPI_Barrier(MPI_COMM_WORLD); + + t_l = end - start; + MPI_Allreduce(&t_l, &t_g, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); + + if (i < nsamples_drop) { + continue; + } + + t_sum += t_g; + } + return t_sum / nsamples; +} + +int main(int argc, char **argv) +{ + int ret; + int actual; + int rank = -1; + int nproc; + int i, j, progress, l, m; + double *target_buf, *origin_buf, *result; + MPI_Win win; + double t_comm_l, t_comm_g, t_comm_sum, t_comm_ave; + double t_total_l, t_total_g, t_total_sum, t_total_ave; + double t_table[NROW][NCOL]; + int opt; + int szbuf = 1; /* Number of doubles to send */ + struct rusage ru_start, ru_end; + struct timeval tv_start, tv_end; + int disable_syscall_intercept = 0; + + cpu_set_t cpuset; + + //test_set_loglevel(TEST_LOGLEVEL_WARN); + ndelay_init(1); + + while ((opt = getopt(argc, argv, "+p:I:")) != -1) { + switch (opt) { + case 'p': + ppn = atoi(optarg); + break; + case 'I': + 
disable_syscall_intercept = atoi(optarg); + break; + default: /* '?' */ + printf("unknown option %c\n", optopt); + ret = -1; + goto out; + } + } + + if (ppn == -1) { + pr_err("Error: Specify processes-per-rank with -p"); + ret = -1; + goto out; + } + + MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual); + if (actual != MPI_THREAD_MULTIPLE) { + pr_err("Error: MPI_THREAD_MULTIPLE is not available\n"); + ret = -1; + goto out; + } + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nproc); + + if (rank == 0) { + printf("ndoubles=%d,nproc=%d\n", szbuf, nproc); + +#pragma omp parallel + { + //printf("%d cpu\n", sched_getcpu()); + if (omp_get_thread_num() == 0) { + printf("#threads=%d\n", omp_get_num_threads()); + } + } + } + + /* accumulate-to buffer */ + target_buf = malloc(sizeof(double) * szbuf); + if (!target_buf) { + pr_err("Error: allocating target_buf"); + ret = -1; + goto out; + } + memset(target_buf, 0, sizeof(double) * szbuf); + + /* read-from buffer */ + origin_buf = malloc(sizeof(double) * szbuf); + if (!origin_buf) { + pr_err("Error: alloacting origin_buf"); + ret = -1; + goto out; + } + memset(origin_buf, 0, sizeof(double) * szbuf); + + /* fetch-to buffer */ + result = malloc(sizeof(double) * szbuf); + if (!result) { + pr_err("Error: allocating result"); + ret = -1; + goto out; + } + memset(result, 0, sizeof(double) * szbuf); + + /* Expose accumulate-to buffer*/ + ret = MPI_Win_create(target_buf, sizeof(double) * szbuf, sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &win); + if (ret != 0) { + pr_err("Error: MPI_Win_create returned %d\n", ret); + ret = -1; + goto out; + } + + /* Measure RMA-only time */ + init_buf(origin_buf, result, target_buf, szbuf, rank, 99); + t_comm_ave = measure(rank, nproc, win, origin_buf, result, target_buf, szbuf, 0, 0, 1, NSAMPLES_COMM, NSAMPLES_DROP, 0); + + if (rank == 0) { + printf("t_comm_ave: %.0f %s\n", t_comm_ave * MYTIME_TOUSEC, MYTIME_UNIT); + } + +#ifdef PROFILE + syscall(701, 1 | 2 | 
0x80000000); /* syscall profile start */ +#endif + + /* 0: no progress, 1: progress, no uti, 2: progress, uti */ + for (progress = 0; progress <= (disable_syscall_intercept ? 0 : 2); progress += 1) { + + if (progress == 1) { + setenv("DISABLE_UTI", "1", 1); /* Don't use uti_attr and pin to Linux/McKernel CPUs */ + progress_init(); + } else if (progress == 2) { + progress_finalize(); + unsetenv("DISABLE_UTI"); + progress_init(); + } + + if (progress == 1 || progress == 2) { +#ifndef PROGRESS_CALC_PHASE_ONLY + //progress_start(); +#endif + } + + /* RMA-start, compute for T_{RMA} * l / 10, RMA-flush */ + for (l = 0; l <= NROW - 1; l += 1) { + long nsec_calc = (t_comm_ave * MYTIME_TONSEC * l) / 10; + + init_buf(origin_buf, result, target_buf, szbuf, rank, l); + //pr_buf(origin_buf, result, target_buf, szbuf, rank, nproc); + t_total_ave = measure(rank, nproc, win, origin_buf, result, target_buf, szbuf, nsec_calc, progress, 0, NSAMPLES_TOTAL, NSAMPLES_DROP, l / 10.0); + //pr_buf(origin_buf, result, target_buf, szbuf, rank, nproc); + + if (rank == 0) { + + if (l == 0) { + pr_debug("progress=%d\n", progress); + if (progress == 0) { + pr_debug("calc\ttotal\n"); + } else { + pr_debug("total\n"); + } + } + + t_table[l][0] = nsec_calc * (MYTIME_TOUSEC / (double)MYTIME_TONSEC); + if (progress == 0) { + pr_debug("%.0f\t%.0f\n", nsec_calc * (MYTIME_TOUSEC / (double)MYTIME_TONSEC), t_total_ave * MYTIME_TOUSEC); + t_table[l][progress + 1] = t_total_ave * MYTIME_TOUSEC; + } else { + pr_debug("%.0f\n", t_total_ave * MYTIME_TOUSEC); + t_table[l][progress + 1] = t_total_ave * MYTIME_TOUSEC; + } + } + } + + if (progress == 1 || progress == 2) { +#ifndef PROGRESS_CALC_PHASE_ONLY + //progress_stop(); +#endif + } + + } + +#ifdef PROFILE + syscall(701, 4 | 8 | 0x80000000); /* syscall profile report */ +#endif + + if (rank == 0) { + printf("calc,no prog,prog and no uti, prog and uti\n"); + for (l = 0; l <= NROW - 1; l++) { + for (i = 0; i < NCOL; i++) { + if (i > 0) { + printf(","); + } + 
printf("%.0f", t_table[l][i]); + } + printf("\n"); + } + } + + MPI_Barrier(MPI_COMM_WORLD); + + if (progress >= 1) { + progress_finalize(); + } + + MPI_Finalize(); + ret = 0; +out: + return ret; +} diff --git a/test/uti/mpi/016.sh b/test/uti/mpi/016.sh new file mode 100755 index 00000000..90d87107 --- /dev/null +++ b/test/uti/mpi/016.sh @@ -0,0 +1,272 @@ +#!/usr/bin/bash + +#!/usr/bin/bash -x + +MYHOME=/home/e29005 +test_dir=`pwd -P` +mck_dir=${MYHOME}/project/os/install +uti_dir_lin=${MYHOME}/project/uti/install_linux +uti_dir_mck=${MYHOME}/project/uti/install_mckernel + +exe=`basename $0 | sed 's/\.sh//'` + +stop=0 +reboot=0 +go=0 + +interactive=0 +pjsub=0 +gdb=0 +disable_syscall_intercept=0 +mck=0 +nnodes=2 +LASTNODE=8196 +use_hfi=0 +omp_num_threads=32 +ppn=4 + +while getopts srgc:ml:N:P:o:hGI:ipL: OPT +do + case ${OPT} in + s) stop=1 + ;; + r) reboot=1 + ;; + g) go=1 + ;; + m) mck=1 + ;; + N) nnodes=$OPTARG + ;; + P) ppn=$OPTARG + ;; + o) omp_num_threads=$OPTARG + ;; + h) use_hfi=1 + ;; + G) gdb=1 + ;; + I) disable_syscall_intercept=$OPTARG + ;; + i) interactive=1 + ;; + p) pjsub=1 + ;; + L) LASTNODE=$OPTARG + ;; + *) echo "invalid option -${OPT}" >&2 + exit 1 + esac +done + +nprocs=$((ppn * nnodes)) +nodes=`echo $(seq -s ",c" $(($LASTNODE + 1 - $nnodes)) $LASTNODE) | sed 's/^/c/'` + +# vertical cut, excluding phys loaded with Linux tasks +uti_cpu_set_lin=1,69,137,205,18-19,86-87,154-155,222-223 +exclude_list=0-1,68-69,136-137,204-205,18-19,86-87,154-155,222-223 +#64-67,132-135,200-203,268-271 + +uti_cpu_set_mck=1,69,137,205,18-19,86-87,154-155,222-223 + +# horizontal cut, excluding phys loaded with Linux tasks for mckernel +#uti_cpu_set_lin=204-271 +#uti_cpu_set_mck=1-67 + +if [ $mck -eq 0 ]; then + uti_cpu_set_str="export UTI_CPU_SET=$uti_cpu_set_lin" + i_mpi_pin_processor_exclude_list="export I_MPI_PIN_PROCESSOR_EXCLUDE_LIST=$exclude_list" +else + uti_cpu_set_str="export UTI_CPU_SET=$uti_cpu_set_mck" + i_mpi_pin_processor_exclude_list= +fi + +if [ ${mck} -eq 
1 ]; then + i_mpi_pin=off + i_mpi_pin_domain= + i_mpi_pin_order= +# if [ $omp_num_threads -eq 1 ]; then +# # Avoid binding main thread and uti thread to one CPU + kmp_affinity="export KMP_AFFINITY=disabled" +# else +# # Bind rank to OMP_NUM_THREAD-sized CPU-domain +# kmp_affinity="export KMP_AFFINITY=granularity=thread,scatter" +# fi +else + i_mpi_pin=on + domain=$omp_num_threads # Use 32 when you want to match mck's -n division + i_mpi_pin_domain="export I_MPI_PIN_DOMAIN=$domain" + i_mpi_pin_order="export I_MPI_PIN_ORDER=compact" + kmp_affinity="export KMP_AFFINITY=granularity=thread,scatter" +fi + +echo nprocs=$nprocs nnodes=$nnodes ppn=$ppn nodes=$nodes domain=$domain + +if [ ${mck} -eq 1 ]; then + makeopt="UTI_DIR=$uti_dir_mck" + use_mck="#PJM -x MCK=$mck_dir" + mck_mem="#PJM -x MCK_MEM=32G@0,8G@1" + mcexec="${mck_dir}/bin/mcexec" + nmcexecthr=$((omp_num_threads + 4)) + mcexecopt="-n $ppn --uti-use-last-cpu" # -t $nmcexecthr + + if [ ${use_hfi} -eq 1 ]; then + mcexecopt="--enable-hfi1 $mcexecopt" + fi + + if [ $disable_syscall_intercept -eq 0 ]; then + mcexecopt="--enable-uti $mcexecopt" + fi + +else + offline=`PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes lscpu \| grep Off 2>&1 | dshbak -c | grep Off` + if [ "$offline" != "" ]; then + echo "Error: Some CPUs are offline: $offline" + exit + fi + + makeopt="UTI_DIR=$uti_dir_lin" + use_mck= + mck_mem= + mcexec= + mcexecopt= +fi + +if [ $gdb -eq 1 ]; then + enable_x="-enable-x" + gdbcmd="xterm -display localhost:11 -hold -e gdb -ex run --args" +fi + +if [ $interactive -eq 1 ]; then + i_mpi_hydra_bootstrap_exec= + i_mpi_hydra_bootstrap= + hosts= + opt_dir=/opt/intel + ssh= +else +# PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes bash -c \'if \[ \"\`cat /etc/mtab \| while read line\; do cut -d\" \" -f 2\; done \| grep /work\`\" == \"\" \]\; then sudo mount /work\; fi\' + i_mpi_hydra_bootstrap_exec="export I_MPI_HYDRA_BOOTSTRAP_EXEC=/usr/bin/ssh" + i_mpi_hydra_bootstrap="export I_MPI_HYDRA_BOOTSTRAP=ssh" + 
hosts="-hosts $nodes" + opt_dir=/home/opt/local/cores/intel + ssh="ssh -A c$LASTNODE" +fi + +# If using ssh +# Latest versions are: 1.163, 2.199, 3.222 +if [ $pjsub -eq 0 ] && [ $interactive -eq 0 ]; then + compilervars=". ${opt_dir}/compilers_and_libraries_2018.2.199/linux/bin/compilervars.sh intel64" +else + compilervars= +fi + +if [ ${stop} -eq 1 ]; then + if [ ${mck} -eq 1 ]; then + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \ + /usr/sbin/pidof mcexec \| xargs -r sudo kill -9 + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \ + /usr/sbin/pidof $exe \| xargs -r sudo kill -9 + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \ + sudo ${mck_dir}/sbin/mcstop+release.sh + else + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \ + /usr/sbin/pidof $exe \| xargs -r sudo kill -9 + fi +fi + +if [ ${reboot} -eq 1 ]; then + if [ ${mck} -eq 1 ]; then + if hostname | grep ofp &>/dev/null; then + + # -h: Hide idle thread to prevent KNL CPU from mux-ing resource and halving throughput + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \ + sudo ${mck_dir}/sbin/mcreboot.sh -h -O -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1 + +# perl -e 'for ($i=0;$i<68;$i++){if($i>0){print "+";}printf("%d,%d,%d:%d", $i+68,$i+136,$i+204,$i);}' + +# PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \ +# sudo ${mck_dir}/sbin/mcreboot.sh -O -c 68-271 -r 
68,136,204:0+69,137,205:1+70,138,206:2+71,139,207:3+72,140,208:4+73,141,209:5+74,142,210:6+75,143,211:7+76,144,212:8+77,145,213:9+78,146,214:10+79,147,215:11+80,148,216:12+81,149,217:13+82,150,218:14+83,151,219:15+84,152,220:16+85,153,221:17+86,154,222:18+87,155,223:19+88,156,224:20+89,157,225:21+90,158,226:22+91,159,227:23+92,160,228:24+93,161,229:25+94,162,230:26+95,163,231:27+96,164,232:28+97,165,233:29+98,166,234:30+99,167,235:31+100,168,236:32+101,169,237:33+102,170,238:34+103,171,239:35+104,172,240:36+105,173,241:37+106,174,242:38+107,175,243:39+108,176,244:40+109,177,245:41+110,178,246:42+111,179,247:43+112,180,248:44+113,181,249:45+114,182,250:46+115,183,251:47+116,184,252:48+117,185,253:49+118,186,254:50+119,187,255:51+120,188,256:52+121,189,257:53+122,190,258:54+123,191,259:55+124,192,260:56+125,193,261:57+126,194,262:58+127,195,263:59+128,196,264:60+129,197,265:61+130,198,266:62+131,199,267:63+132,200,268:64+133,201,269:65+134,202,270:66+135,203,271:67 -m 32G@0,12G@1 + else + echo "unkwon host type" + exit 1 + fi + else + : + fi +fi + +( +cat < ./job.sh +chmod u+x ./job.sh + +if [ ${go} -eq 1 ]; then + if [ $pjsub -eq 1 ]; then + pjsub ./job.sh + else + if [ $interactive -eq 0 ]; then + . 
# Build rules for the uti MPI tests.
#
# Every *.c file in this directory is one test program, except the helper
# translation units listed in LIB_SRCS, which are linked into the tests that
# name them as prerequisites below.

.SUFFIXES:	# Clear suffixes

MYHOME=/home/e29005

# Specify it via 016.sh
UTI_DIR=${MYHOME}/project/uti/install_linux

CC=mpiicc
LD=$(CC)

CFLAGS = -g -O0 -Wall
LDFLAGS = -lpthread -lpsm2 -L$(UTI_DIR)/lib -Wl,-rpath -Wl,$(UTI_DIR)/lib -luti
SRCS = $(wildcard *.c)
OBJS = $(SRCS:.c=.o)
# Helper sources have no main(); they must not be linked as programs of
# their own by the generic "%: %.o" rule.
LIB_SRCS = async_progress.c util.c
EXES = $(filter-out $(LIB_SRCS:.c=),$(SRCS:.c=))
TMPFILES = $(wildcard psm2-demo-*)

.PHONY: all clean

all: $(EXES) file

file: $(TMPFILES)
	rm -f $(TMPFILES)
	dd if=/dev/zero of=./file bs=1M count=1

async_progress.o:: async_progress.c util.h
	$(CC) $(CFLAGS) -I$(UTI_DIR)/include -c $<

util.o:: util.c util.h
	$(CC) $(CFLAGS) -qopenmp -c $<

# util.o is compiled with -qopenmp, so programs linking it need it as well
014: 014.o async_progress.o util.o
	$(LD) -o $@ $^ $(LDFLAGS) -qopenmp

015: 015.o async_progress.o
	$(LD) -o $@ $^ $(LDFLAGS)

016: 016.o async_progress.o util.o
	$(LD) -o $@ $^ $(LDFLAGS) -qopenmp

016.o::016.c
	$(CC) $(CFLAGS) -qopenmp -c $<

011: 011.o
	$(LD) -o $@ $^ $(LDFLAGS) -qopenmp

011.o::011.c
	$(CC) $(CFLAGS) -qopenmp -c $<

%: %.o
	$(LD) -o $@ $^ $(LDFLAGS)

%.o::%.c
	$(CC) $(CFLAGS) -c $<

clean:
	rm -f core $(EXES) $(OBJS)
psm2-connect, child psm2-send/recv --> receiver side crash +009 parent does nothing, child psm2-init, psm2-connect, psm2-send/recv --> receiver side crash +010 parent psm2-init, psm2-connect, psm2-send/recv, child does nothing +011 001にopenmpスレッドを追加 +012 get_acc-calc-flush_local_all, all-to-all. Execute ./012.sh +013 acc-flush_local-calc, all-to-all, acc:flush_local=1:1 +014 012 + async progress thread. +015 013 + async progress thread + +016 MPI_Get_accumulate()のオーバーラップ + +* 通信パターンは全対全、 +* CPUはいくつかをprogress thread専用に割く +* ステップは以下の通り + (1) MPI_Get_accumulate() + (2) MPI_Get_accumulate()とMPI_Flush_local_all()だけを行った場合の +   時間の0.i倍の計算を実行 + (3) MPI_Flush_local_all() diff --git a/test/uti/mpi/async_progress.c b/test/uti/mpi/async_progress.c new file mode 100644 index 00000000..3034ee28 --- /dev/null +++ b/test/uti/mpi/async_progress.c @@ -0,0 +1,530 @@ +#define _GNU_SOURCE /* See feature_test_macros(7) */ +#include +#include /* For SYS_xxx definitions */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "util.h" +#include "async_progress.h" + +//#define PROFILE + +#define STOP_BY_MPI 0 +#define STOP_BY_MEM 1 +#define STOP_TYPE STOP_BY_MEM/*STOP_BY_MPI*/ + +#define POLL_BY_PROBE 0 +#define POLL_BY_WAIT 1 +#define POLL_BY_TEST 2 +#define POLL_TYPE POLL_BY_PROBE/*POLL_BY_WAIT*/ + +static int progress_rank, progress_world_rank, progress_world_nproc; +static pthread_t progress_thr; +static pthread_mutex_t progress_mutex; +static pthread_cond_t progress_cond_down; +static volatile int progress_flag_up, progress_flag_down; + +static enum progress_state progress_state; +static int progress_stop_flag; +static MPI_Comm progress_comm; +static int progress_refc; +#define WAKE_TAG 100 + +#define NROW_STAT 10 +#define NRANK_STAT 1 +#define RECORD_STAT(count, array, end, start) do { \ + if (count < NROW_STAT) { \ + array[count++] += (end - start); \ + } \ +} while(0) + +static int cyc_prog1_count, cyc_prog2_count, 
cyc_init1_count, cyc_init2_count, cyc_start_count, cyc_stop1_count, cyc_stop2_count, cyc_stop3_count, cyc_finalize_count; +static unsigned long cyc_prog1[NROW_STAT]; +static unsigned long cyc_prog2[NROW_STAT]; +static unsigned long cyc_init1[NROW_STAT]; +static unsigned long cyc_init2[NROW_STAT]; +static unsigned long cyc_start[NROW_STAT]; +static unsigned long cyc_stop1[NROW_STAT]; +static unsigned long cyc_stop2[NROW_STAT]; +static unsigned long cyc_stop3[NROW_STAT]; +static unsigned long cyc_finalize[NROW_STAT]; + +#define MIN2(x,y) ((x) < (y) ? (x) : (y)) + +void pr_stat(char *name, int count, unsigned long *array) { + int i; + + pr_debug("[%d] %s: ", progress_world_rank, name); + for (i = 0; i < MIN2(count, NROW_STAT); i++) { + if (i > 0) pr_debug(","); + pr_debug("%ld", array[i]); + } + pr_debug("\n"); +} + +static void *progress_fn(void* data) +{ + int ret; + MPI_Request req; + struct rusage ru_start, ru_end; + struct timeval tv_start, tv_end; + unsigned long start, end; + +#if 0 + ret = syscall(732); + if (ret == -1) { + pr_debug("Progress is running on Linux\n"); + } else { + pr_debug("Progress is running on McKernel\n"); + } + + if ((ret = getrusage(RUSAGE_THREAD, &ru_start))) { + pr_err("%s: error: getrusage failed (%d)\n", __func__, ret); + } + + if ((ret = gettimeofday(&tv_start, NULL))) { + pr_err("%s: error: gettimeofday failed (%d)\n", __func__, ret); + } + +#endif + +#if STOP_TYPE == STOP_BY_MEM && POLL_TYPE == POLL_BY_TEST + + if ((ret = MPI_Irecv(NULL, 0, MPI_CHAR, progress_rank, WAKE_TAG, progress_comm, &req)) != MPI_SUCCESS) { + pr_err("%s: error: MPI_Irecv: %d\n", __func__, ret); + } + +#endif + +init: +#ifdef PROFILE + start = rdtsc_light(); +#endif + + /* Wait for state transition */ + pthread_mutex_lock(&progress_mutex); + while (!progress_flag_down) { + pthread_cond_wait(&progress_cond_down, &progress_mutex); + } + progress_flag_down = 0; + + if (progress_state == PROGRESS_FINALIZE) { + pthread_mutex_unlock(&progress_mutex); + goto 
finalize; + } + + if (progress_state != PROGRESS_START) { + pr_err("%s: error: unexpected state: %d\n", __func__, progress_state); + pthread_mutex_unlock(&progress_mutex); + goto finalize; + } + + pthread_mutex_unlock(&progress_mutex); + +#ifdef PROFILE + end = rdtsc_light(); + RECORD_STAT(cyc_prog1_count, cyc_prog1, end, start); +#endif + + //if (progress_world_rank < 2) pr_debug("[%d] poll,cpu=%d\n", progress_world_rank, sched_getcpu()); + +#ifdef PROFILE + start = rdtsc_light(); +#endif + +#if STOP_TYPE == STOP_BY_MEM + +#if POLL_TYPE == POLL_BY_PROBE + + int completed = 0; + while (!progress_stop_flag) { + if ((ret = MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &completed, MPI_STATUS_IGNORE)) != MPI_SUCCESS) { + pr_err("%s: error: MPI_Iprobe: %d\n", __func__, ret); + break; + } + //usleep(1); + } + +#elif POLL_TYPE == POLL_BY_TEST + + int completed = 0; + while (!completed && !progress_stop_flag) { + if ((ret = MPI_Test(&req, &completed, MPI_STATUS_IGNORE)) != MPI_SUCCESS) { + pr_err("%s: error: MPI_Iprobe: %d\n", __func__, ret); + break; + } + //usleep(1); + } + +#endif /* POLL_TYPE */ + +#elif STOP_TYPE == STOP_BY_MPI + + +#if POLL_TYPE == POLL_BY_WAIT + + if ((ret = MPI_Irecv(NULL, 0, MPI_CHAR, progress_rank, WAKE_TAG, progress_comm, &req)) != MPI_SUCCESS) { + pr_err("%s: error: MPI_Irecv: %d\n", __func__, ret); + } + + if ((ret = MPI_Wait(&req, MPI_STATUS_IGNORE)) != MPI_SUCCESS) { + pr_err("%s: error: MPI_Wait failed (%d)\n", __func__, ret); + } + +#elif POLL_TYPE == POLL_BY_PROBE + + int completed = 0; + while (!completed) { + if ((ret = MPI_Iprobe(progress_rank, WAKE_TAG, progress_comm, &completed, MPI_STATUS_IGNORE)) != MPI_SUCCESS) { + pr_err("%s: error: MPI_Iprobe: %d\n", __func__, ret); + break; + } + usleep(1); + } + + if ((ret = MPI_Recv(NULL, 0, MPI_CHAR, progress_rank, WAKE_TAG, progress_comm, MPI_STATUS_IGNORE)) != MPI_SUCCESS) { + pr_err("%s: error: MPI_Irecv: %d\n", __func__, ret); + } + +#endif /* POLL_TYPE */ +#endif /* STOP_TYPE 
*/ + + progress_state = PROGRESS_INIT; + __sync_synchronize(); /* st-st barrier */ + progress_flag_up = 1; + +#ifdef PROFILE + end = rdtsc_light(); + RECORD_STAT(cyc_prog2_count, cyc_prog2, end, start); +#endif + goto init; + + finalize: + + if ((ret = getrusage(RUSAGE_THREAD, &ru_end))) { + pr_err("%s: error: getrusage failed (%d)\n", __func__, ret); + } + + if ((ret = gettimeofday(&tv_end, NULL))) { + pr_err("%s: error: gettimeofday failed (%d)\n", __func__, ret); + } + +#if 0 + pr_debug("%s: wall: %ld, user: %ld, sys: %ld\n", __func__, + DIFFUSEC(tv_end, tv_start), + DIFFUSEC(ru_end.ru_utime, ru_start.ru_utime), + DIFFUSEC(ru_end.ru_stime, ru_start.ru_stime)); +#endif + + progress_state = PROGRESS_INIT; + __sync_synchronize(); /* st-st barrier */ + progress_flag_up = 1; + + return NULL; +} + +void progress_init() +{ + int ret = 0; + pthread_attr_t pthread_attr; + uti_attr_t uti_attr; + unsigned long start, end; + +#ifdef PROFILE + start = rdtsc_light(); +#endif + MPI_Comm_rank(MPI_COMM_WORLD, &progress_world_rank); + MPI_Comm_size(MPI_COMM_WORLD, &progress_world_nproc); + + if (__sync_val_compare_and_swap(&progress_refc, 0, 1) == 1) { + return; + } + + /* printf costs much in MPI */ + uti_set_loglevel(UTI_LOGLEVEL_ERR); + + if ((ret = MPI_Comm_dup(MPI_COMM_SELF, &progress_comm))) { + pr_err("%s: error: MPI_Comm_dup failed (%d)\n", __func__, ret); + goto out; + } + + MPI_Comm_rank(progress_comm, &progress_rank); + + if ((ret = pthread_mutex_init(&progress_mutex, NULL))) { + pr_err("%s: error: pthread_mutex_init failed (%d)\n", __func__, ret); + goto out; + } + + if ((ret = pthread_cond_init(&progress_cond_down, NULL))) { + pr_err("%s: error: pthread_cond_init failed (%d)\n", __func__, ret); + goto out; + } + + if ((ret = pthread_attr_init(&pthread_attr))) { + pr_err("%s: error: pthread_attr_init failed (%d)\n", __func__, ret); + goto out; + } + + if ((ret = uti_attr_init(&uti_attr))) { + pr_err("%s: error: uti_attr_init failed (%d)\n", __func__, ret); + goto out; 
+ } + +#if 0 + if ((ret = UTI_ATTR_SAME_L1(&uti_attr))) { + pr_err("%s: error: UTI_ATTR_SAME_L1 failed\n", __func__); + } +#endif + +#if 1 /* Expecting round-robin binding */ + if ((ret = UTI_ATTR_CPU_INTENSIVE(&uti_attr))) { + pr_err("%s: error: UTI_ATTR_CPU_INTENSIVE failed\n", __func__); + } + +#endif + +#ifdef PROFILE + end = rdtsc_light(); + RECORD_STAT(cyc_init1_count, cyc_init1, end, start); +#endif + +#ifdef PROFILE + start = rdtsc_light(); +#endif + + if ((ret = uti_pthread_create(&progress_thr, &pthread_attr, progress_fn, NULL, &uti_attr))) { + pr_err("%s: error: uti_pthread_create failed (%d)\n", __func__, ret); + goto out; + } + + ret = 0; + out: + if (ret) { + __sync_fetch_and_sub(&progress_refc, 1); + } + +#ifdef PROFILE + end = rdtsc_light(); + RECORD_STAT(cyc_init2_count, cyc_init2, end, start); +#endif +} + +void progress_start() +{ + unsigned long start, end; + + if (progress_refc == 0) { + progress_init(); + } + +#ifdef PROFILE + start = rdtsc_light(); +#endif + pthread_mutex_lock(&progress_mutex); + + if (progress_state == PROGRESS_FINALIZE) { + pr_warn("%s: warning: FINALIZE\n", __func__); + pthread_mutex_unlock(&progress_mutex); + return; + } + + if (progress_state == PROGRESS_START) { + //pr_warn("%s: warning: START\n", __func__); + pthread_mutex_unlock(&progress_mutex); + return; + } + + if (progress_state != PROGRESS_INIT) { + pr_err("%s: error: unexpected state: %d\n", __func__, progress_state); + pthread_mutex_unlock(&progress_mutex); + return; + } + + progress_state = PROGRESS_START; +#if STOP_TYPE == STOP_BY_MEM + progress_stop_flag = 0; +#endif + __sync_synchronize(); /* memory barrier instruction */ + progress_flag_down = 1; + pthread_cond_signal(&progress_cond_down); + pthread_mutex_unlock(&progress_mutex); + +#ifdef PROFILE + end = rdtsc_light(); + RECORD_STAT(cyc_start_count, cyc_start, end, start); +#endif +} + +void do_progress_stop() +{ + int ret; + unsigned long start, end; + + //if (progress_world_rank < 2) pr_debug("[%d] 
stop,cpu=%d\n", progress_world_rank, sched_getcpu()); + +#ifdef PROFILE + start = rdtsc_light(); +#endif + +#if STOP_TYPE == STOP_BY_MEM + + progress_stop_flag = 1; + __sync_synchronize(); /* st-st barrier */ + +#elif STOP_TYPE == STOP_BY_MPI + + if ((ret = MPI_Send(NULL, 0, MPI_CHAR, progress_rank, WAKE_TAG, progress_comm)) != MPI_SUCCESS) { + pr_err("%s: error: MPI_Send failed (%d)\n", __func__, ret); + return; + } + + +#endif /* STOP_TYPE */ + +#ifdef PROFILE + end = rdtsc_light(); + RECORD_STAT(cyc_stop2_count, cyc_stop2, end, start); + start = rdtsc_light(); +#endif + + /* Make sure the following command will observe INIT */ + while (!progress_flag_up) { + } + progress_flag_up = 0; + +#ifdef PROFILE + end = rdtsc_light(); + RECORD_STAT(cyc_stop3_count, cyc_stop3, end, start); +#endif +} + +void progress_stop() +{ + unsigned long start, end; + +#ifdef PROFILE + start = rdtsc_light(); +#endif + + if (progress_refc == 0) { + return; + } + + pthread_mutex_lock(&progress_mutex); + + if (progress_state == PROGRESS_INIT) { + pthread_mutex_unlock(&progress_mutex); + return; + } + + if (progress_state == PROGRESS_FINALIZE) { + pthread_mutex_unlock(&progress_mutex); + return; + } + + if (progress_state != PROGRESS_START) { + pr_err("%s: error: unexpected state: %d\n", __func__, progress_state); + pthread_mutex_unlock(&progress_mutex); + return; + } + + pthread_mutex_unlock(&progress_mutex); + +#ifdef PROFILE + end = rdtsc_light(); + RECORD_STAT(cyc_stop1_count, cyc_stop1, end, start); +#endif + + do_progress_stop(); +} + +void progress_finalize() +{ + int ret; + int i, j; + MPI_Request req; + unsigned long start, end; + int nproc; + + MPI_Comm_size(MPI_COMM_WORLD, &nproc); + +#ifdef PROFILE + start = rdtsc_light(); +#endif + + if (progress_refc == 0) { + return; + } + + retry: + pthread_mutex_lock(&progress_mutex); + + if (progress_state == PROGRESS_START) { + pthread_mutex_unlock(&progress_mutex); + do_progress_stop(); + goto retry; + } + + if (progress_state == 
PROGRESS_FINALIZE) { + pthread_mutex_unlock(&progress_mutex); + return; + } + + if (progress_state != PROGRESS_INIT) { + pr_err("%s: error: unexpected state: %d\n", __func__, progress_state); + pthread_mutex_unlock(&progress_mutex); + return; + } + + progress_state = PROGRESS_FINALIZE; + __sync_synchronize(); /* st-st barrier */ + progress_flag_down = 1; + pthread_cond_signal(&progress_cond_down); + pthread_mutex_unlock(&progress_mutex); + + /* Make sure the following command will observe INIT */ + while (!progress_flag_up) { + } + progress_flag_up = 0; + + pthread_join(progress_thr, NULL); + + if ((ret = MPI_Comm_free(&progress_comm)) != MPI_SUCCESS) { + pr_err("%s: error: MPI_Comm_free failed (%d)\n", __func__, ret); + return; + } + + progress_refc = 0; + +#ifdef PROFILE + end = rdtsc_light(); + RECORD_STAT(cyc_finalize_count, cyc_finalize, end, start); + + for (j = 0; j < NRANK_STAT; j++) { + + MPI_Barrier(MPI_COMM_WORLD); + + if (j != progress_world_rank) { + usleep(1000000); + continue; + } + + pr_stat("cyc_prog1", cyc_prog1_count, cyc_prog1); + pr_stat("cyc_prog2", cyc_prog2_count, cyc_prog2); + pr_stat("cyc_init1", cyc_init1_count, cyc_init1); + pr_stat("cyc_init2", cyc_init2_count, cyc_init2); + pr_stat("cyc_start", cyc_start_count, cyc_start); + pr_stat("cyc_stop1", cyc_stop1_count, cyc_stop1); + pr_stat("cyc_stop2", cyc_stop2_count, cyc_stop2); + pr_stat("cyc_stop3", cyc_stop3_count, cyc_stop3); + pr_stat("cyc_finalize", cyc_finalize_count, cyc_finalize); + } +#endif +} diff --git a/test/uti/mpi/async_progress.h b/test/uti/mpi/async_progress.h new file mode 100644 index 00000000..bd0d39a2 --- /dev/null +++ b/test/uti/mpi/async_progress.h @@ -0,0 +1,15 @@ +#ifndef _ASYNC_PROGRESS_INCLUDED_ +#define _ASYNC_PROGRESS_INCLUDED_ + +enum progress_state { + PROGRESS_INIT = 0, + PROGRESS_START, + PROGRESS_FINALIZE +}; + +void progress_init(); +void progress_start(); +void progress_stop(); +void progress_finalize(); + +#endif diff --git a/test/uti/mpi/env_intel.sh 
#!/usr/bin/perl

# Usage: ./mpi_progress_thread.pl <nprocs> <nnodes> (mck|lin) (mpich|intel)
#
# Builds the tests, creates a time-stamped run directory, instantiates
# ../<script name>.sh.in into ./job.sh by replacing its @...@ placeholders
# (the @env@ line is replaced by the contents of ../env_<mpi>.sh) and
# submits the job with pjsub.

use File::Basename;
use File::Copy "cp";

($nprocs, $nnodes, $os, $mpi) = @ARGV;

# All four arguments are used below; nnodes is also a divisor
if (@ARGV < 4 || $nnodes !~ /^\d+$/ || $nnodes == 0 || $nprocs !~ /^\d+$/) {
    die "Usage: $0 <nprocs> <nnodes> (mck|lin) (mpich|intel)\n";
}

$ppn = $nprocs / $nnodes;

@command = split /\s+/, basename($0);
@fn = split /\./, $command[0];

# Resource group depends on the number of nodes
if($nnodes <= 16) {
    $rg = 'MCK-FLAT-QUADRANT';
} elsif($nnodes <= 128) {
    $rg = 'debug-flat';
} else {
    $rg = 'regular-flat';
}

# Elapsed-time limit per node count
%elapse = (
'1', '00:10:00',
'2', '00:10:00',
'4', '00:10:00',
'8', '00:10:00',
'16', '00:10:00',
'32', '00:10:00',
'64', '00:05:00',
'128', '00:05:00',
'256', '00:10:00',
'512', '00:15:00',
'1024', '00:15:00',
'2048', '00:30:00',
    );

# Without an entry the job script would get an empty "elapse=" limit
exists $elapse{$nnodes} or die "no elapse time defined for $nnodes nodes\n";

if ($os eq 'lin') {
    $use_mck = '';
    $mck_mem = '';
    $mcexec = '';
    $mcexecopt = '';
} else {
    $path_to_mck = '/work/gg10/e29005/project/os/install';
    $use_mck = '#PJM -x MCK='.$path_to_mck;
    $mck_mem = '#PJM -x MCK_MEM=32G@0,8G@1';
    $mcexec = $path_to_mck.'/bin/mcexec';
    $mcexecopt = '-n '.$ppn;
}

if ($mpi eq 'intel') {
    $cc = 'mpiicc';
    $mpiexec = 'mpiexec';
    $genv = '';
    $progress = '-genv I_MPI_ASYNC_PROGRESS 1'; # -genv I_MPI_ASYNC_PROGRESS_PIN 1
} else {
    $mpi_lib = '/work/gg10/e29005/project/mpich/install';
    $cc = $mpi_lib.'/bin/mpicc';
    $mpiexec = $mpi_lib.'/bin/mpiexec';
    $genv = '-genv LD_LIBRARY_PATH '.$mpi_lib.'/lib:$LD_LIBRARY_PATH';
    $progress = '-genv MPIR_CVAR_ASYNC_PROGRESS 1';
}

system("make clean; make CC=$cc") == 0 or die "build failed\n";

$dir=$os.'_'.$nprocs.'_'.$nnodes.'_'.`date +%Y%m%d_%H%M%S`;
chomp($dir);
print 'less '.$dir.'/job.sh.o*'."\n";

mkdir $dir or die "mkdir $dir failed: $!\n";
chdir $dir or die "chdir $dir failed: $!\n";
cp('../001', './001') or die 'copy failed';
open(IN, "../$fn[0].sh.in") or die "cannot open ../$fn[0].sh.in: $!\n";
open(OUT, ">./job.sh") or die "cannot open ./job.sh: $!\n";
while(<IN>) {
    s/\@rg@/$rg/g;
    s/\@nnodes@/$nnodes/g;
    s/\@nprocs@/$nprocs/g;
    s/\@elapse@/$elapse{$nnodes}/g;
    s/\@use_mck@/$use_mck/g;
    s/\@mck_mem@/$mck_mem/g;
    s/\@progress@/$progress/g;
    s/\@genv@/$genv/g;
    s/\@mpiexec@/$mpiexec/g;
    s/\@mcexec@/$mcexec/g;
    s/\@mcexecopt@/$mcexecopt/g;
    if(/\@env@/) {
	# Replace the placeholder line by the MPI-specific environment file
	open(INCL, "../env_$mpi.sh") or die "cannot open ../env_$mpi.sh: $!\n";
	while(my $line = <INCL>) {
	    print OUT $line;
	}
	close(INCL);
	next;
    }
    print OUT $_;
}
close(IN);
close(OUT) or die "cannot write ./job.sh: $!\n";

$cmd = 'PJM_MCK_AVAILABLE=1 pjsub ./job.sh';
#print $cmd."\n";
exec($cmd) or die "exec failed: $!\n";
a/test/uti/mpi/mpi_progress_thread.sh.in b/test/uti/mpi/mpi_progress_thread.sh.in new file mode 100644 index 00000000..f149a1f8 --- /dev/null +++ b/test/uti/mpi/mpi_progress_thread.sh.in @@ -0,0 +1,16 @@ +#!/bin/sh + +#PJM -L rscgrp=@rg@ +#PJM -L node=@nnodes@ +#PJM --mpi proc=@nprocs@ +#PJM -L elapse=@elapse@ +#PJM -L proc-crproc=16384 +#PJM -g gg10 +#PJM -j +#PJM -s +@use_mck@ +@mck_mem@ + +@env@ + +@mpiexec@ @genv@ @progress@ -np @nprocs@ -machinefile ${PJM_O_NODEINF} @mcexec@ @mcexecopt@ ./001 1048576 1000 diff --git a/test/uti/mpi/util.c b/test/uti/mpi/util.c new file mode 100644 index 00000000..cdf51140 --- /dev/null +++ b/test/uti/mpi/util.c @@ -0,0 +1,186 @@ +#define _GNU_SOURCE /* See feature_test_macros(7) */ +#include +#include +#include +#include +#include /* For SYS_xxx definitions */ +#include +#include +#include +#include +#include +#include +#include "util.h" + +/* Messaging */ +enum test_loglevel test_loglevel = TEST_LOGLEVEL_DEBUG; + +/* Calculation */ +static inline void asmloop(unsigned long n) { + int j; + + for (j = 0; j < n; j++) { + asm volatile( + "movq $0, %%rcx\n\t" + "1:\t" + "addq $1, %%rcx\n\t" + "cmpq $99, %%rcx\n\t" + "jle 1b\n\t" + : + : + : "rcx", "cc"); + } +} + +#define N_INIT 10000000 +double nspw; /* nsec per work */ + +void ndelay_init(int verbose) { + struct timeval start, end; + + //clock_gettime(TIMER_KIND, &start); + gettimeofday(&start, NULL); + +#pragma omp parallel + { + asmloop(N_INIT); + } + + //clock_gettime(TIMER_KIND, &end); + gettimeofday(&end, NULL); + + nspw = DIFFUSEC(end, start) * 1000 / (double)N_INIT; + if (verbose) { + pr_debug("nspw=%f\n", nspw); + } +} + +#if 1 +void ndelay(long delay_nsec) { + if (delay_nsec < 0) { + printf("delay_nsec < 0\n"); + return; + } +#pragma omp parallel + { + asmloop(delay_nsec / nspw); + } +} +#else /* For machines with large core-to-core performance variation (e.g. 
OFP) */ +void ndelay(long delay_nsec) { + struct timespec start, end; + + if (delay_nsec < 0) { return; } + clock_gettime(TIMER_KIND, &start); + + while (1) { + clock_gettime(TIMER_KIND, &end); + if (DIFFNSEC(end, start) >= delay_nsec) { + break; + } + asmloop(2); /* ~150 ns per iteration on FOP */ + } +} +#endif + + +double cycpw; /* cyc per work */ + +void cdlay_init() { + unsigned long start, end; + + start = rdtsc_light(); +#define N_INIT 10000000 + asmloop(N_INIT); + end = rdtsc_light(); + cycpw = (end - start) / (double)N_INIT; +} + +#if 0 +void cdelay(long delay_cyc) { + if (delay_cyc < 0) { + return; + } + asmloop(delay_cyc / cycpw); +} +#else /* For machines with large core-to-core performance variation (e.g. OFP) */ +void cdelay(long delay_cyc) { + unsigned long start, end; + + if (delay_cyc < 0) { return; } + start = rdtsc_light(); + + while (1) { + end = rdtsc_light(); + if (end - start >= delay_cyc) { + break; + } + asmloop(2); + } +} +#endif + + +int print_cpu_last_executed_on(const char *name) { + char fn[256]; + char* result; + pid_t tid = syscall(SYS_gettid); + int fd; + int offset; + int mpi_errno = 0; + int rc; + + sprintf(fn, "/proc/%d/task/%d/stat", getpid(), (int)tid); + //printf("fn=%s\n", fn); + fd = open(fn, O_RDONLY); + if(fd == -1) { + printf("open() failed\n"); + goto fn_fail; + } + + result = malloc(65536); + if(result == NULL) { + printf("malloc() failed"); + goto fn_fail; + } + + int amount = 0; + offset = 0; + while(1) { + amount = read(fd, result + offset, 65536); + // printf("amount=%d\n", amount); + if(amount == -1) { + printf("read() failed"); + goto fn_fail; + } + if(amount == 0) { + goto eof; + } + offset += amount; + } + eof:; + //printf("result:%s\n", result); + + char* next_delim = result; + char* field; + int i; + for(i = 0; i < 39; i++) { + field = strsep(&next_delim, " "); + } + + int cpu = sched_getcpu(); + if(cpu == -1) { + printf("getpu() failed\n"); + goto fn_fail; + } + + rc = syscall(732); + + printf("%s: 
pmi_rank=%02d,os=%s,stat-cpu=%02d,sched_getcpu=%02d,tid=%d\n", name, atoi(getenv("PMI_RANK")), rc == -1 ? "lin" : "mck", atoi(field), cpu, tid); fflush(stdout); + fn_exit: + free(result); + return mpi_errno; + fn_fail: + mpi_errno = -1; + goto fn_exit; +} diff --git a/test/uti/mpi/util.h b/test/uti/mpi/util.h new file mode 100644 index 00000000..3482aae3 --- /dev/null +++ b/test/uti/mpi/util.h @@ -0,0 +1,73 @@ +#ifndef __UTIL_H_INCLUDED__ +#define __UTIL_H_INCLUDED__ + +#include + +/* Messaging */ + +enum test_loglevel { + TEST_LOGLEVEL_ERR = 0, + TEST_LOGLEVEL_WARN, + TEST_LOGLEVEL_DEBUG +}; + +extern enum test_loglevel test_loglevel; +static inline void test_set_loglevel(enum test_loglevel level) +{ + test_loglevel = level; +} + +#define pr_level(level, fmt, args...) do { \ + if (test_loglevel >= level) { \ + fprintf(stdout, fmt, ##args); \ + } \ +} while (0) + +#define pr_err(fmt, args...) pr_level(TEST_LOGLEVEL_ERR, fmt, ##args) +#define pr_warn(fmt, args...) pr_level(TEST_LOGLEVEL_WARN, fmt, ##args) +#define pr_debug(fmt, args...) pr_level(TEST_LOGLEVEL_DEBUG, fmt, ##args) + +#define _OKNG(verb, jump, cond, fmt, args...) do { \ + if (cond) { \ + if (verb) \ + printf("[ OK ] " fmt, ##args); \ + } else { \ + printf("[ NG ] " fmt, ##args); \ + if (jump) { \ + ret = -1; \ + goto out; \ + } \ + } \ +} while (0) + +#define OKNG(args...) _OKNG(1, 1, ##args) +#define NG(args...) _OKNG(0, 1, ##args) +#define OKNGNOJUMP(args...) 
_OKNG(1, 0, ##args) + +/* Time */ +inline uint64_t rdtsc_light(void) +{ + uint64_t x; + __asm__ __volatile__("rdtscp;" /* rdtscp don't jump over earlier instructions */ + "shl $32, %%rdx;" + "or %%rdx, %%rax" : + "=a"(x) : + : + "%rcx", "%rdx", "memory"); + return x; +} + +#define DIFFUSEC(end, start) ((end.tv_sec - start.tv_sec) * 1000000UL + (end.tv_usec - start.tv_usec)) +#define DIFFNSEC(end, start) ((end.tv_sec - start.tv_sec) * 1000000000UL + (end.tv_nsec - start.tv_nsec)) +#define TIMER_KIND CLOCK_MONOTONIC_RAW /* CLOCK_THREAD_CPUTIME_ID */ + +/* Calculation emulation */ +void ndelay_init(); +void ndelay(long delay_nsec); +void cdelay_init(); +void cdelay(long delay_cyc); + +/* CPU location */ +int print_cpu_last_executed_on(); + +#endif diff --git a/test/uti/posix_aio/001.c b/test/uti/posix_aio/001.c new file mode 100644 index 00000000..2e09b9cf --- /dev/null +++ b/test/uti/posix_aio/001.c @@ -0,0 +1,517 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "util.h" + +#define NREQS 1 /* # of parallel I/O requests per process */ +#define SZBUF (1ULL<<23) + +#define MYTIME_TOUSEC 1000000 +#define MYTIME_TONSEC 1000000000 + +#define NROW 11 +#define NCOL 4 + +#define NSAMPLES_DROP 0/*10*/ +#define NSAMPLES_IO 2/*20*/ +#define NSAMPLES_TOTAL 2/*20*/ +#define NSAMPLES_INNER 1 + +#define Q(x) #x +#define QUOTE(x) Q(x) + +char test_srcdir[PATH_MAX]; + +static inline double mytime() { + return /*rdtsc_light()*/MPI_Wtime(); +} + +struct aioreq { + int rank; + int status; + struct aiocb *aiocbp; +}; + +static void aio_sighandler(int sig, siginfo_t *si, void *ucontext) +{ + if (si->si_code == SI_ASYNCIO) { + //struct aioreq *aioreq = si->si_value.sival_ptr; + //pr_debug("I/O completion signal received\n"); + } +} + +int my_aio_init(int nreqs, struct aioreq *iolist, struct aiocb *aiocblist, char *aiobufs[NREQS]) { + int j; + + for (j = 0; j < nreqs; j++) { + 
iolist[j].rank = j; + iolist[j].aiocbp = &aiocblist[j]; + iolist[j].aiocbp->aio_buf = aiobufs[j]; + iolist[j].aiocbp->aio_nbytes = SZBUF; + iolist[j].aiocbp->aio_reqprio = 0; + iolist[j].aiocbp->aio_offset = 0; + iolist[j].aiocbp->aio_sigevent.sigev_notify = SIGEV_SIGNAL; + iolist[j].aiocbp->aio_sigevent.sigev_signo = SIGUSR1; + iolist[j].aiocbp->aio_sigevent.sigev_value.sival_ptr = &iolist[j]; + } + + return 0; +} + + +int my_aio_evict(int nreqs, char **fn) { + int ret; + int i; + char cmd[PATH_MAX]; + + for (i = 0; i < NREQS; i++) { + + sprintf(cmd, "%s -e %s > /dev/null", QUOTE(VMTOUCH), fn[i]); + ret = system(cmd); + + if (ret == -1) { + pr_err("%s: error: system\n", + __func__); + goto out; + } + + if (WEXITSTATUS(ret)) { + pr_err("%s: error: system returned %d\n", + __func__, WEXITSTATUS(ret)); + ret = WEXITSTATUS(ret); + goto out; + } + } + ret = 0; + out: + return ret; +} +int my_aio_open(int nreqs, struct aioreq *iolist, char **fn) { + int ret; + int j; + + for (j = 0; j < NREQS; j++) { + iolist[j].aiocbp->aio_fildes = open(fn[j], O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH); + if (iolist[j].aiocbp->aio_fildes == -1) { + pr_err("%s: error: open %s: %s\n", + __func__, fn[j], strerror(errno)); + ret = 1; + goto out; + } + } + + ret = 0; + out: + return ret; +} + +int my_aio_check(int nreqs, char **fn, char **mem_data) { + int ret; + int i; + FILE *fp[NREQS] = { 0 }; + char *file_data[NREQS]; + + /* Check contents */ + for (i = 0; i < nreqs; i++) { + + if (!(file_data[i] = malloc(SZBUF))) { + pr_err("error: allocating data\n"); + ret = -ENOMEM; + goto out; + } + + if (!(fp[i] = fopen(fn[i], "r+"))) { + pr_err("error: fopen %s: %s\n", + fn[i], strerror(errno)); + ret = -errno; + goto out; + } + + if (fread(file_data[i], sizeof(char), SZBUF, fp[i]) != SZBUF) { + pr_err("error: fread: %s\n", + strerror(errno)); + ret = -1; + goto out; + } + + fclose(fp[i]); + + if (memcmp((const char *)file_data[i], mem_data[i], SZBUF)) { + 
pr_err("%s: file_data[%d] and mem_data[%d] doesn't match\n", + __func__, i, i); + ret = -1; + goto out; + } + + free(file_data[i]); + } + ret = 0; + out: + return ret; +} + +void my_aio_close(int nreqs, struct aioreq *iolist) { + int j; + + for (j = 0; j < NREQS; j++) { + close(iolist[j].aiocbp->aio_fildes); + iolist[j].aiocbp->aio_fildes = -1; + } +} + +int my_aio(int nreqs, struct aioreq *iolist, char **fn, long nsec_calc) { + int ret; + int i, j; + + /* Start async IO */ + for (j = 0; j < NSAMPLES_INNER; j++) { + int completion_count = 0; + + //pr_debug("debug: opening file\n"); + if ((ret = my_aio_open(nreqs, iolist, fn)) == -1) { + pr_err("%s: error: aio_read: %s\n", + __func__, strerror(errno)); + ret = -errno; + goto out; + } + + //pr_debug("debug: issuing write command\n"); + for (j = 0; j < nreqs; j++) { + + /* Reset completion notice */ + iolist[j].status = EINPROGRESS; + + if ((ret = aio_write(iolist[j].aiocbp)) == -1) { + pr_err("%s: error: aio_read: %s\n", + __func__, strerror(errno)); + ret = -errno; + goto out; + } + } + + /* Emulate calcuation phase */ + ndelay(nsec_calc); + + /* Wait for completion of async IO */ + //pr_debug("debug: waiting for completion\n"); + while (completion_count != nreqs) { + for (j = 0; j < nreqs; j++) { + if (iolist[j].status != EINPROGRESS) { + continue; + } + + iolist[j].status = aio_error(iolist[j].aiocbp); + + switch (iolist[j].status) { + case 0: /* Succeeded */ + goto completed; + case EINPROGRESS: + break; + case ECANCELED: + pr_err("%s: error: aio is cancelled\n", + __func__); + goto completed; + default: + pr_err("%s: error: unexpected status: %d\n", + __func__, iolist[j].status); + goto completed; + completed: + completion_count++; + break; + } + } + } + + /* Check write amount */ + for (j = 0; j < nreqs; j++) { + ssize_t size; + + if ((size = aio_return(iolist[j].aiocbp)) != SZBUF) { + pr_err("%s: Expected to have written %ld B but reported to have written %ld B\n", + __func__, SZBUF, size); + ret = -1; + goto 
out; + } + } + + my_aio_close(nreqs, iolist); + } + ret = 0; + out: + return ret; +} + +int measure(double *result, int nsamples, int nsamples_drop, int nreqs, struct aioreq *iolist, char **fn, char **aiobufs, long nsec_calc) { + int ret; + int i; + double t_l, t_g, t_sum = 0; + double start, end; + + for (i = 0; i < nsamples + nsamples_drop; i++) { + +#if 0 + pr_debug("debug: evicting file cache\n"); + if ((ret = my_aio_evict(nreqs, fn))) { + pr_err("%s: error: my_aio_evict returned %d\n", + __func__, ret); + } +#endif + MPI_Barrier(MPI_COMM_WORLD); + + start = mytime(); + if ((ret = my_aio(nreqs, iolist, fn, nsec_calc))) { + pr_err("%s: error: my_aio_read returned %d\n", + __func__, ret); + } + end = mytime(); + + MPI_Barrier(MPI_COMM_WORLD); + + /* Check contents */ + if ((ret = my_aio_check(nreqs, fn, aiobufs))) { + pr_err("%s: error: my_aio_check returned %d\n", + __func__, ret); + } + + if (i < nsamples_drop) { + continue; + } + + /* Take max */ + t_l = end - start; + MPI_Allreduce(&t_l, &t_g, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); + t_sum += t_g; + } + + *result = t_sum / nsamples; + ret = 0; + out: + return ret; +} + +int main(int argc, char **argv) +{ + int ret; + int i, j, progress, l; + int rank, nproc; + int disable_syscall_intercept = 0, ppn = -1; + struct aioreq *iolist; + struct aiocb *aiocblist; + struct sigaction sa; + double t_io_ave, t_total_ave; + double t_table[NROW][NCOL] = { 0 }; + int opt; + char *aiobufs[NREQS] = { 0 }; + char **fn; + + opterr = 0; /* Don't print out error when not recognizing option character */ + + while ((opt = getopt(argc, argv, ":I:p:")) != -1) { + switch (opt) { + case 'I': + disable_syscall_intercept = atoi(optarg); + break; + case 'p': + ppn = atoi(optarg); + break; + case '?': + pr_err("error: invalid option: -%c\n", + optopt); + ret = 1; + goto out; + case ':': + pr_err("error: option -%c requires an argument\n", + optopt); + ret = 1; + goto out; + } + } + + if (ppn == -1) { + pr_err("error: specify -p \n"); + 
ret = -EINVAL; + goto out; + } + + /* Initialize MPI */ + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nproc); + + /* Show parameters */ + if (rank == 0) { +#pragma omp parallel + { + if (omp_get_thread_num() == 0) { + printf("nproc: %d, ppn: %d, #threads: %d\n", nproc, ppn, omp_get_num_threads()); + } + } + } + + /* Set verbosity */ + //test_set_loglevel(TEST_LOGLEVEL_WARN); + + /* Initialize delay function */ + ndelay_init(); + + /* Prepare file names */ + +#define TEST_SRCDIR "/work/gg10/e29005" + sprintf(test_srcdir, "%s", /*TEST_SRCDIR*/dirname(argv[0])); + + if (!(fn = malloc(sizeof(char *) * NREQS))) { + pr_err("error: allocating fn\n"); + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < NREQS; i++) { + if (!(fn[i] = malloc(PATH_MAX))) { + pr_err("error: allocating fn\n"); + ret = -ENOMEM; + goto out; + } + + sprintf(fn[i], "%s/rank%d-number%d", test_srcdir, rank, i); + if (rank == 0) pr_debug("debug: rank: %d, fn[%d]: %s\n", + rank, i, fn[i]); + } + + /* Allocate aio commands */ + if (!(iolist = calloc(NREQS, sizeof(struct aioreq)))) { + pr_err("%s: error: allocating iolist\n", + __func__); + ret = 1; + goto out; + } + + if (!(aiocblist = calloc(NREQS, sizeof(struct aiocb)))) { + pr_err("%s: error: allocating aiocblist\n", + __func__); + ret = 1; + goto out; + } + + /* Prepare contents to be written */ + for (i = 0; i < NREQS; i++) { + aiobufs[i] = malloc(SZBUF); + if (!aiobufs[i]) { + pr_err("%s: error: allocating aiobufs\n", + __func__); + ret = 1; + goto out; + } + + for (j = 0; j < SZBUF; j++) { + *(aiobufs[i] + j) = i + j + rank; + } + } + + /* Set signal handlers */ + sa.sa_flags = SA_RESTART | SA_SIGINFO; + sa.sa_sigaction = aio_sighandler; + if (sigaction(SIGUSR1, &sa, NULL) == -1) { + pr_err("%s: error: sigaction: %s\n", + __func__, strerror(errno)); + ret = 1; + goto out; + } + + /* Set aio parameters except fd and status */ + if ((ret = my_aio_init(NREQS, iolist, aiocblist, aiobufs))) { + 
pr_err("%s: error: my_aio_init returned %d\n", + __func__, ret); + goto out; + } + + /* Measure IO only time */ + //pr_debug("debug: measuring IO only time\n"); + if ((ret = measure(&t_io_ave, NSAMPLES_IO, NSAMPLES_DROP, NREQS, iolist, fn, aiobufs, 0))) { + pr_err("error: measure returned %d\n", ret); + goto out; + } + + if (rank == 0) { + printf("t_io_ave: %.0f usec, %.0f MB/s per node\n", + t_io_ave * MYTIME_TOUSEC, + SZBUF * ppn / t_io_ave / 1000000); + } + + /* Measure time with no progress, progress and no uti, progress and uti */ + for (progress = 0; progress <= (disable_syscall_intercept ? 0 : 0); progress += 1) { + + /* Spawn helper thread onto compute CPUs with ignoring uti_attr */ + if (progress == 1) { + setenv("DISABLE_UTI", "1", 1); + } + /* Spawn helper thread onto dedicated CPUs with respecting uti_attr */ + else if (progress == 2) { + unsetenv("DISABLE_UTI"); + } + + /* Measure with various calculation time */ + for (l = 0; l <= 10; l += 2) { + long nsec_calc = (t_io_ave * MYTIME_TONSEC * l) / 10; + + if ((ret = measure(&t_total_ave, NSAMPLES_TOTAL, NSAMPLES_DROP, NREQS, iolist, fn, aiobufs, nsec_calc))) { + pr_err("error: measure returned %d\n", ret); + goto out; + } + + if (rank == 0) { + if (l == 0) { + pr_debug("progress=%d\n", progress); + if (progress == 0) { + pr_debug("calc\ttotal\n"); + } else { + pr_debug("total\n"); + } + } + + t_table[l][0] = nsec_calc * (MYTIME_TOUSEC / (double)MYTIME_TONSEC); + if (progress == 0) { + pr_debug("%.0f\t%.0f\n", nsec_calc * (MYTIME_TOUSEC / (double)MYTIME_TONSEC), t_total_ave * MYTIME_TOUSEC); + t_table[l][progress + 1] = t_total_ave * MYTIME_TOUSEC; + } else { + pr_debug("%.0f\n", t_total_ave * MYTIME_TOUSEC); + t_table[l][progress + 1] = t_total_ave * MYTIME_TOUSEC; + } + } + } + } + + if (rank == 0) { + printf("calc,no prog,prog and no uti, prog and uti\n"); + for (l = 0; l <= 10; l++) { + for (i = 0; i < NCOL; i++) { + if (i > 0) { + printf(","); + } + printf("%.0f", t_table[l][i]); + } + printf("\n"); 
+ } + } + + MPI_Barrier(MPI_COMM_WORLD); + + MPI_Finalize(); + + ret = 0; +out: + for (i = 0; i < NREQS; i++) { + free(aiobufs[i]); + } + return ret; +} diff --git a/test/uti/posix_aio/001.sh b/test/uti/posix_aio/001.sh new file mode 100755 index 00000000..6d3289d8 --- /dev/null +++ b/test/uti/posix_aio/001.sh @@ -0,0 +1,270 @@ +#!/usr/bin/bash + +#!/usr/bin/bash -x + +MYHOME=/home/e29005 +test_dir=`pwd -P` +mck_dir=${MYHOME}/project/os/install +uti_dir_lin=${MYHOME}/project/uti/install_linux +uti_dir_mck=${MYHOME}/project/uti/install_mckernel + +exe=`basename $0 | sed 's/\.sh//'` + +stop=0 +reboot=0 +go=0 + +interactive=0 +pjsub=0 +gdb=0 +disable_syscall_intercept=0 +mck=0 +nnodes=2 +LASTNODE=8196 +use_hfi=0 +omp_num_threads=1 +ppn=4 + +while getopts srgc:ml:N:P:o:hGI:ipL: OPT +do + case ${OPT} in + s) stop=1 + ;; + r) reboot=1 + ;; + g) go=1 + ;; + m) mck=1 + ;; + N) nnodes=$OPTARG + ;; + P) ppn=$OPTARG + ;; + o) omp_num_threads=$OPTARG + ;; + h) use_hfi=1 + ;; + G) gdb=1 + ;; + I) disable_syscall_intercept=$OPTARG + ;; + i) interactive=1 + ;; + p) pjsub=1 + ;; + L) LASTNODE=$OPTARG + ;; + *) echo "invalid option -${OPT}" >&2 + exit 1 + esac +done + +nprocs=$((ppn * nnodes)) +nodes=`echo $(seq -s ",c" $(($LASTNODE + 1 - $nnodes)) $LASTNODE) | sed 's/^/c/'` + +# vertical cut, excluding phys loaded with Linux tasks +uti_cpu_set_lin=1,69,137,205,18-19,86-87,154-155,222-223 +exclude_list=0-1,68-69,136-137,204-205,18-19,86-87,154-155,222-223 +#64-67,132-135,200-203,268-271 + +uti_cpu_set_mck=1,69,137,205,18-19,86-87,154-155,222-223 + +# horizontal cut, excluding phys loaded with Linux tasks for mckernel +#uti_cpu_set_lin=204-271 +#uti_cpu_set_mck=1-67 + +if [ $mck -eq 0 ]; then + uti_cpu_set_str="export UTI_CPU_SET=$uti_cpu_set_lin" + i_mpi_pin_processor_exclude_list="export I_MPI_PIN_PROCESSOR_EXCLUDE_LIST=$exclude_list" +else + uti_cpu_set_str="export UTI_CPU_SET=$uti_cpu_set_mck" + i_mpi_pin_processor_exclude_list= +fi + +if [ ${mck} -eq 1 ]; then + i_mpi_pin=off + 
i_mpi_pin_domain= + i_mpi_pin_order= +# if [ $omp_num_threads -eq 1 ]; then +# # Avoid binding main thread and uti thread to one CPU + kmp_affinity="export KMP_AFFINITY=disabled" +# else +# # Bind rank to OMP_NUM_THREAD-sized CPU-domain +# kmp_affinity="export KMP_AFFINITY=granularity=thread,scatter" +# fi +else + i_mpi_pin=on + domain=$omp_num_threads # Use 32 when you want to match mck's -n division + i_mpi_pin_domain="export I_MPI_PIN_DOMAIN=$domain" + i_mpi_pin_order="export I_MPI_PIN_ORDER=compact" + kmp_affinity="export KMP_AFFINITY=granularity=thread,scatter" +fi + +echo nprocs=$nprocs nnodes=$nnodes ppn=$ppn nodes=$nodes omp_num_threads=$omp_num_threads + +if [ ${mck} -eq 1 ]; then + makeopt="UTI_DIR=$uti_dir_mck" + use_mck="#PJM -x MCK=$mck_dir" + mck_mem="#PJM -x MCK_MEM=32G@0,8G@1" + mcexec="${mck_dir}/bin/mcexec" + nmcexecthr=$((omp_num_threads + 4)) + mcexecopt="-n $ppn --uti-use-last-cpu" # -t $nmcexecthr + + if [ ${use_hfi} -eq 1 ]; then + mcexecopt="--enable-hfi1 $mcexecopt" + fi + + if [ $disable_syscall_intercept -eq 0 ]; then + mcexecopt="--enable-uti $mcexecopt" + fi + +else + offline=`PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes lscpu \| grep Off 2>&1 | dshbak -c | grep Off` + if [ "$offline" != "" ]; then + echo "Error: Some CPUs are offline: $offline" + exit + fi + + makeopt="UTI_DIR=$uti_dir_lin" + use_mck= + mck_mem= + mcexec= + mcexecopt= +fi + +if [ $gdb -eq 1 ]; then + enable_x="-enable-x" + gdbcmd="xterm -display localhost:11 -hold -e gdb -ex run --args" +fi + +if [ $interactive -eq 1 ]; then + i_mpi_hydra_bootstrap_exec= + i_mpi_hydra_bootstrap= + hosts= + opt_dir=/opt/intel + ssh= +else +# PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes bash -c \'if \[ \"\`cat /etc/mtab \| while read line\; do cut -d\" \" -f 2\; done \| grep /work\`\" == \"\" \]\; then sudo mount /work\; fi\' + i_mpi_hydra_bootstrap_exec="export I_MPI_HYDRA_BOOTSTRAP_EXEC=/usr/bin/ssh" + i_mpi_hydra_bootstrap="export I_MPI_HYDRA_BOOTSTRAP=ssh" + hosts="-hosts 
$nodes" + opt_dir=/home/opt/local/cores/intel + ssh="ssh -A c$LASTNODE" +fi + +# If using ssh +# Latest versions are: 1.163, 2.199, 3.222 +if [ $pjsub -eq 0 ] && [ $interactive -eq 0 ]; then + compilervars=". ${opt_dir}/compilers_and_libraries_2018.2.199/linux/bin/compilervars.sh intel64" +else + compilervars= +fi + +if [ ${stop} -eq 1 ]; then + if [ ${mck} -eq 1 ]; then + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \ + /usr/sbin/pidof mcexec \| xargs -r sudo kill -9 + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \ + /usr/sbin/pidof $exe \| xargs -r sudo kill -9 + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \ + sudo ${mck_dir}/sbin/mcstop+release.sh + else + : + fi +fi + +if [ ${reboot} -eq 1 ]; then + if [ ${mck} -eq 1 ]; then + if hostname | grep ofp &>/dev/null; then + + # -h: Hide idle thread to prevent KNL CPU from mux-ing resource and halving throughput + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \ + sudo ${mck_dir}/sbin/mcreboot.sh -h -O -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1 + +# perl -e 'for ($i=0;$i<68;$i++){if($i>0){print "+";}printf("%d,%d,%d:%d", $i+68,$i+136,$i+204,$i);}' + +# PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \ +# sudo ${mck_dir}/sbin/mcreboot.sh -O -c 68-271 -r 
68,136,204:0+69,137,205:1+70,138,206:2+71,139,207:3+72,140,208:4+73,141,209:5+74,142,210:6+75,143,211:7+76,144,212:8+77,145,213:9+78,146,214:10+79,147,215:11+80,148,216:12+81,149,217:13+82,150,218:14+83,151,219:15+84,152,220:16+85,153,221:17+86,154,222:18+87,155,223:19+88,156,224:20+89,157,225:21+90,158,226:22+91,159,227:23+92,160,228:24+93,161,229:25+94,162,230:26+95,163,231:27+96,164,232:28+97,165,233:29+98,166,234:30+99,167,235:31+100,168,236:32+101,169,237:33+102,170,238:34+103,171,239:35+104,172,240:36+105,173,241:37+106,174,242:38+107,175,243:39+108,176,244:40+109,177,245:41+110,178,246:42+111,179,247:43+112,180,248:44+113,181,249:45+114,182,250:46+115,183,251:47+116,184,252:48+117,185,253:49+118,186,254:50+119,187,255:51+120,188,256:52+121,189,257:53+122,190,258:54+123,191,259:55+124,192,260:56+125,193,261:57+126,194,262:58+127,195,263:59+128,196,264:60+129,197,265:61+130,198,266:62+131,199,267:63+132,200,268:64+133,201,269:65+134,202,270:66+135,203,271:67 -m 32G@0,12G@1 + else + echo "unkwon host type" + exit 1 + fi + else + : + fi +fi + +( +cat < ./job.sh +chmod u+x ./job.sh + +if [ ${go} -eq 1 ]; then + if [ $pjsub -eq 1 ]; then + pjsub ./job.sh + else + if [ $interactive -eq 0 ]; then + . 
${opt_dir}/compilers_and_libraries_2018.2.199/linux/bin/compilervars.sh intel64 + fi + #rm ./$exe + make $makeopt ./$exe + + $ssh ${test_dir}/job.sh + fi +fi diff --git a/test/uti/posix_aio/002.c b/test/uti/posix_aio/002.c new file mode 100644 index 00000000..f36ee18a --- /dev/null +++ b/test/uti/posix_aio/002.c @@ -0,0 +1,658 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "util.h" + +#define SZBUF (1ULL << 23)/*23*/ + +#define MYTIME_TOUSEC 1000000 +#define MYTIME_TONSEC 1000000000 + +#define NROW 16 +#define NCOL 4 + +#define NSAMPLES_PROFILE 3 +#define NSAMPLES_DROP 1/*10*/ +#define NSAMPLES_IO 5/*20*/ +#define NSAMPLES_TOTAL 5/*20*/ +#define NSAMPLES_INNER 1 + +#define WAIT_TYPE_BUSY_LOOP 0 +#define WAIT_TYPE_SEM 1 +#define WAIT_TYPE WAIT_TYPE_SEM + +static sem_t aio_sem; +volatile int completion_count; + +static inline double mytime() { + return /*rdtsc_light()*/MPI_Wtime(); +} + +struct aioreq { + int rank, aio_num_threads; + int status; + struct aiocb *aiocbp; +}; + +static void aio_handler(sigval_t sigval) +{ + struct aioreq *aioreq = sigval.sival_ptr; + int ret; + + //pr_debug("%s: debug: rank=%d\n", __func__, aioreq->rank); + ret = __sync_add_and_fetch(&completion_count, 1); + if (ret == aioreq->aio_num_threads) { + if (sem_post(&aio_sem)) { + pr_err("%s: error: sem_post: %s\n", + __func__, strerror(errno)); + } + } + + //pr_debug("%s: debug: completion_count: %d\n", __func__, ret); +} + +static void aio_sighandler(int sig, siginfo_t *si, void *ucontext) +{ + pr_debug("%s: debug: enter\n", __func__); +#if WAIT_TYPE == WAIT_TYPE_SEM + struct aioreq *aioreq = si->si_value.sival_ptr; + + if (si->si_code != SI_ASYNCIO) { + pr_err("%s: error: unexpected si_code: %d\n", + __func__, si->si_code); + } + + aioreq->status = aio_error(aioreq->aiocbp); + if (aioreq->status != 0) { + pr_err("%s: error: unexpected 
status: %d\n", + __func__, aioreq->status); + } + + if (__sync_add_and_fetch(&completion_count, 1) == aioreq->aio_num_threads) { + if (sem_post(&aio_sem)) { + pr_err("%s: error: sem_post: %s\n", + __func__, strerror(errno)); + } + } + + //pr_debug("%s: debug: completion_count: %d\n", __func__, completion_count); +#endif /* WAIT_TYPE */ +} + +int my_aio_init(int nreqs, struct aioreq *iolist, struct aiocb *aiocblist, char **aiobufs) { + int ret; + int i; + + for (i = 0; i < nreqs; i++) { + iolist[i].rank = i; + iolist[i].aio_num_threads = nreqs; + iolist[i].aiocbp = &aiocblist[i]; + iolist[i].aiocbp->aio_fildes = -1; + iolist[i].aiocbp->aio_buf = aiobufs[i]; + iolist[i].aiocbp->aio_nbytes = SZBUF; + iolist[i].aiocbp->aio_reqprio = 0; + iolist[i].aiocbp->aio_offset = 0; +#if 0 + iolist[i].aiocbp->aio_sigevent.sigev_notify = SIGEV_SIGNAL; + iolist[i].aiocbp->aio_sigevent.sigev_signo = SIGUSR1; + iolist[i].aiocbp->aio_sigevent.sigev_value.sival_ptr = &iolist[i]; +#else + iolist[i].aiocbp->aio_sigevent.sigev_notify = SIGEV_THREAD; + iolist[i].aiocbp->aio_sigevent.sigev_notify_function = aio_handler; + iolist[i].aiocbp->aio_sigevent.sigev_notify_attributes = NULL; + iolist[i].aiocbp->aio_sigevent.sigev_value.sival_ptr = &iolist[i]; +#endif + } + + ret = 0; + return ret; +} + +int my_aio_open(int aio_num_threads, struct aioreq *iolist, char **fn) { + int ret; + int i; + + for (i = 0; i < aio_num_threads; i++) { + iolist[i].aiocbp->aio_fildes = open(fn[i], O_RDWR | O_CREAT | O_TRUNC | O_DIRECT, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH); + if (iolist[i].aiocbp->aio_fildes == -1) { + pr_err("%s: error: open %s: %s\n", + __func__, fn[i], strerror(errno)); + ret = 1; + goto out; + } + } + ret = 0; + out: + return ret; +} + +int my_aio_check(struct aioreq *iolist, int aio_num_threads, char **fn) { + int ret; + int i; + FILE **fp = { 0 }; + char *data; + + if (!(fp = malloc(sizeof(FILE *) * aio_num_threads))) { + pr_err("error: allocating fp\n"); + ret = -ENOMEM; + goto 
out; + } + + /* Check contents */ + for (i = 0; i < aio_num_threads; i++) { + if (!(data = malloc(SZBUF))) { + pr_err("error: allocating data\n"); + ret = -ENOMEM; + goto out; + } + + if (!(fp[i] = fopen(fn[i], "r+"))) { + pr_err("%s: error: fopen %s: %s\n", + __func__, fn[i], strerror(errno)); + ret = -errno; + goto out; + } + + if (fread(data, sizeof(char), SZBUF, fp[i]) != SZBUF) { + pr_err("%s: error: fread\n", + __func__); + ret = -1; + goto out; + } + + if (memcmp((const void*)iolist[i].aiocbp->aio_buf, data, SZBUF)) { + pr_err("%s: Data written to file %s differs from data in memory\n", + __func__, fn[i]); + ret = -1; + goto out; + } + } + ret = 0; + out: + for (i = 0; i < aio_num_threads; i++) { + fclose(fp[i]); + } + + return ret; +} + +void my_aio_close(int aio_num_threads, struct aioreq *iolist) { + int ret; + int i; + + for (i = 0; i < aio_num_threads; i++) { + if (iolist[i].aiocbp->aio_fildes != -1) { + close(iolist[i].aiocbp->aio_fildes); + iolist[i].aiocbp->aio_fildes = -1; + } + } +} + +int my_aio(int aio_num_threads, struct aioreq *iolist, char **fn, long nsec_calc, int no_aio) { + int ret; + int i, j; + + //pr_debug("%s: debug: enter\n", __func__); + + + /* Start async IO */ + for (i = 0; i < NSAMPLES_INNER; i++) { + if (no_aio) goto skip1; + + if ((ret = my_aio_open(aio_num_threads, iolist, fn)) == -1) { + pr_err("%s: error: my_aio_open: %s\n", + __func__, strerror(errno)); + ret = -errno; + goto out; + } + //pr_debug("%s: debug: after my_aio_open\n", __func__); + + + /* Reset completion */ + completion_count = 0; + __sync_synchronize(); + + for (j = 0; j < aio_num_threads; j++) { + iolist[j].status = EINPROGRESS; + + if ((ret = aio_write(iolist[j].aiocbp)) == -1) { + pr_err("%s: error: aio_write: %s\n", + __func__, strerror(errno)); + ret = -errno; + goto out; + } + + //pr_debug("%s: debug: after %d-th aio_write\n", __func__, j); + } + skip1: + /* Emulate calcuation phase */ + ndelay(nsec_calc); + if (no_aio) goto skip2; + +#if 0 + int k; + for 
(k = 0; k < 20; k++) { + char cmd[256]; + sprintf(cmd, "ls /proc/%d/task | wc -l", getpid()); + system(cmd); + usleep(200000); + } +#endif + + /* Wait for completion of async IO */ +#if WAIT_TYPE == WAIT_TYPE_SEM + + retry: + ret = sem_wait(&aio_sem); + if (ret == -1) { + if (errno == EINTR) { + pr_warn("%s: warning: sem_wait interrupted\n", + __func__); + goto retry; + } else { + pr_err("%s: error: sem_wait: %s\n", + __func__, strerror(errno)); + } + } + //pr_debug("%s: debug: completion_count: %d\n", __func__, completion_count); + +#elif WAIT_TYPE == WAIT_TYPE_BUSY_LOOP + + while (completion_count != aio_num_threads) { + for (j = 0; j < aio_num_threads; j++) { + if (iolist[j].status != EINPROGRESS) { + continue; + } + + iolist[j].status = aio_error(iolist[j].aiocbp); + + switch (iolist[j].status) { + case 0: /* Completed */ + goto completed; + case EINPROGRESS: + break; + case ECANCELED: + pr_err("%s: error: aio is cancelled\n", + __func__); + goto completed; + default: + pr_err("%s: error: aio_error: %s\n", + __func__, strerror(iolist[j].status)); + goto completed; + completed: + completion_count++; + break; + } + } + } +#endif /* WAIT_TYPE */ + /* Check amount read */ + for (j = 0; j < aio_num_threads; j++) { + ssize_t size; + + if ((size = aio_return(iolist[j].aiocbp)) != SZBUF) { + pr_err("%s: Expected to read %ld B but #%d has read %ld B\n", + __func__, SZBUF, j, size); + continue; + } + } + + my_aio_close(aio_num_threads, iolist); + skip2:; + } + ret = 0; + out: + my_aio_close(aio_num_threads, iolist); + return ret; +} + +int measure(double *result, int nsamples, int nsamples_drop, int aio_num_threads, struct aioreq *iolist, char **fn, long nsec_calc, int rank, int profile, int no_aio) { + int ret; + int i; + double t_l, t_g, t_sum = 0; + double start, end; + + for (i = 0; i < nsamples + nsamples_drop; i++) { + + MPI_Barrier(MPI_COMM_WORLD); + + /* Set parameter based on current IPC and frequency */ + ndelay_init(0); + + start = mytime(); + + struct rusage 
ru_start, ru_end; + struct timeval tv_start, tv_end; + + if (profile) { + if ((ret = getrusage(RUSAGE_SELF, &ru_start))) { + pr_err("%s: error: getrusage failed (%d)\n", __func__, ret); + } + + if ((ret = gettimeofday(&tv_start, NULL))) { + pr_err("%s: error: gettimeofday failed (%d)\n", __func__, ret); + } + } + + if ((ret = my_aio(aio_num_threads, iolist, fn, nsec_calc, no_aio))) { + pr_err("%s: error: my_aio returned %d\n", + __func__, ret); + } + + if (profile) { + if ((ret = getrusage(RUSAGE_SELF, &ru_end))) { + pr_err("%s: error: getrusage failed (%d)\n", __func__, ret); + } + + if ((ret = gettimeofday(&tv_end, NULL))) { + pr_err("%s: error: gettimeofday failed (%d)\n", __func__, ret); + } + + if (rank == 0) pr_debug("%s: wall: %ld, user: %ld, sys: %ld\n", __func__, + DIFFUSEC(tv_end, tv_start), + DIFFUSEC(ru_end.ru_utime, ru_start.ru_utime), + DIFFUSEC(ru_end.ru_stime, ru_start.ru_stime)); + } + + end = mytime(); + + MPI_Barrier(MPI_COMM_WORLD); + + /* Check contents */ + if ((ret = my_aio_check(iolist, aio_num_threads, fn))) { + pr_err("%s: error: my_aio_check returned %d\n", + __func__, ret); + } + + if (i < nsamples_drop) { + continue; + } + + /* Take max */ + t_l = end - start; + MPI_Allreduce(&t_l, &t_g, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); + t_sum += t_g; + } + + *result = t_sum / nsamples; + ret = 0; + + return ret; +} + +int main(int argc, char **argv) +{ + int ret; + int i, j, progress, l; + int rank, nproc; + int ppn = -1; + int aio_num_threads = -1; + int disable_syscall_intercept = 0; + struct aioreq *iolist; + struct aiocb *aiocblist; + struct sigaction sa; + double t_io_ave, t_total_ave; + double t_table[NROW][NCOL] = { 0 }; + int opt; + char **aiobufs; + char **fn; + char src_dir[PATH_MAX]; + char *argv0; + + opterr = 0; /* Don't print out error when not recognizing option character */ + + while ((opt = getopt(argc, argv, ":I:p:t:")) != -1) { + switch (opt) { + case 'I': + disable_syscall_intercept = atoi(optarg); + break; + case 'p': + 
ppn = atoi(optarg); + break; + case 't': + aio_num_threads = atoi(optarg); + break; + case '?': + pr_err("error: invalid option: -%c\n", + optopt); + ret = 1; + goto out; + case ':': + pr_err("error: option -%c requires an argument\n", + optopt); + ret = 1; + goto out; + } + } + + if (ppn == -1) { + pr_err("error: specify ppn with -p \n"); + ret = 1; + goto out; + } + + if (aio_num_threads == -1) { + pr_err("error: specify aio_num_threads with -p \n"); + ret = 1; + goto out; + } + + /* Initialize MPI */ + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nproc); + +#if 0 + int k; + for (k = 0; k < 20; k++) { + char cmd[256]; + sprintf(cmd, "ls /proc/%d/task | wc -l", getpid()); + system(cmd); + usleep(200000); + } +#endif + + /* Show parameters */ + if (rank == 0) { +#pragma omp parallel + { + if (omp_get_thread_num() == 0) { + printf("nproc=%d,#threads=%d\n", nproc, omp_get_num_threads()); + } + } + } + + /* Set verbosity */ + //test_set_loglevel(TEST_LOGLEVEL_WARN); + + /* Set parameter based on current IPC and frequency */ + ndelay_init(1); + + /* Initialize files */ + if (!(fn = malloc(sizeof(char *) * aio_num_threads))) { + pr_err("error: allocating fn\n"); + ret = -ENOMEM; + goto out; + } + + argv0 = strdup(argv[0]); + sprintf(src_dir, "%s", dirname(argv0)); + for (i = 0; i < aio_num_threads; i++) { + if (!(fn[i] = malloc(SZBUF))) { + pr_err("error: allocating data\n"); + ret = -ENOMEM; + goto out; + } + + sprintf(fn[i], "%s/rank%d-number%d", src_dir, rank, i); + if (rank < 2 && i < 2) { + pr_debug("debug: rank: %d, fn[%d]: %s\n", + rank, i, fn[i]); + } + } + + /* Allocate aio arrays */ + if (!(iolist = calloc(aio_num_threads, sizeof(struct aioreq)))) { + pr_err("%s: error: allocating iolist\n", + __func__); + ret = 1; + goto out; + } + + if (!(aiocblist = calloc(aio_num_threads, sizeof(struct aiocb)))) { + pr_err("%s: error: allocating aiocblist\n", + __func__); + ret = 1; + goto out; + } + + /* Prepare data to 
be written */ + if (!(aiobufs = malloc(sizeof(char *) * aio_num_threads))) { + pr_err("error: allocating aiobufs\n"); + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < aio_num_threads; i++) { + aiobufs[i] = malloc(SZBUF); + if (!aiobufs[i]) { + pr_err("%s: error: allocating aiobufs\n", + __func__); + ret = 1; + goto out; + } + + for (j = 0; j < SZBUF; j++) { + *(aiobufs[i] + j) = i + j + rank; + } + } + + /* Initialize aio parameters except fd and status */ + if ((ret = my_aio_init(aio_num_threads, iolist, aiocblist, aiobufs))) { + pr_err("%s: error: my_aio_init returned %d\n", + __func__, ret); + goto out; + } + +#if 0 + /* Set signal handlers */ + sa.sa_flags = SA_RESTART | SA_SIGINFO; + sa.sa_sigaction = aio_sighandler; + if (sigaction(SIGUSR1, &sa, NULL) == -1) { + pr_err("%s: error: sigaction: %s\n", + __func__, strerror(errno)); + ret = 1; + goto out; + } +#endif + + /* Initialize semaphore */ + if ((ret = sem_init(&aio_sem, 0, 0))) { + pr_err("%s: error: sem_init: %s\n", __func__, strerror(errno)); + ret = -errno; + goto out; + } + + /* Take profile */ + if ((ret = measure(&t_io_ave, NSAMPLES_PROFILE, 0, aio_num_threads, iolist, fn, 0, rank, 1, 0))) { + pr_err("error: measure returned %d\n", ret); + goto out; + } + + /* Measure IO only time */ + if ((ret = measure(&t_io_ave, NSAMPLES_IO, NSAMPLES_DROP, aio_num_threads, iolist, fn, 0, rank, 0, 0))) { + pr_err("error: measure returned %d\n", ret); + goto out; + } + + if (rank == 0) { + printf("t_io_ave: %.0f usec, %.0f MB/s per node\n", + t_io_ave * MYTIME_TOUSEC, + SZBUF * ppn * aio_num_threads / t_io_ave / 1000000); + } + + /* Measure time with no progress, progress and no uti, progress and uti */ + for (progress = 0; progress <= (disable_syscall_intercept ? 
0 : -1); progress += 1) { + + if (progress == 1) { + /* Ignore uti_attr, spawn a thread onto compute CPUs */ + setenv("DISABLE_UTI", "1", 1); + } else if (progress == 2) { + unsetenv("DISABLE_UTI"); + } + + /* Increasing calculation time up to 100% of IO time */ + for (l = 0; l <= NROW - 1; l += 1) { + long nsec_calc = (t_io_ave * MYTIME_TONSEC * l) / 10; + + if ((ret = measure(&t_total_ave, NSAMPLES_TOTAL, NSAMPLES_DROP, aio_num_threads, iolist, fn, nsec_calc, rank, 0, 0))) { + pr_err("error: measure returned %d\n", ret); + goto out; + } + + if (rank == 0) { + if (l == 0) { + pr_debug("progress=%d\n", progress); + if (progress == 0) { + pr_debug("calc\ttotal\n"); + } else { + pr_debug("total\n"); + } + } + + t_table[l][0] = nsec_calc * (MYTIME_TOUSEC / (double)MYTIME_TONSEC); + if (progress == 0) { + pr_debug("%.0f\t%.0f\n", nsec_calc * (MYTIME_TOUSEC / (double)MYTIME_TONSEC), t_total_ave * MYTIME_TOUSEC); + t_table[l][progress + 1] = t_total_ave * MYTIME_TOUSEC; + } else { + pr_debug("%.0f\n", t_total_ave * MYTIME_TOUSEC); + t_table[l][progress + 1] = t_total_ave * MYTIME_TOUSEC; + } + } + } + } + + if (rank == 0) { + printf("calc,no prog,prog and no uti, prog and uti\n"); + for (l = 0; l <= NROW - 1; l++) { + for (i = 0; i < NCOL; i++) { + if (i > 0) { + printf(","); + } + printf("%.0f", t_table[l][i]); + } + printf("\n"); + } + } + + MPI_Barrier(MPI_COMM_WORLD); + //pr_debug("after barrier\n"); + + MPI_Finalize(); + //pr_debug("after finalize\n"); + + ret = 0; +out: + if ((ret = sem_destroy(&aio_sem))) { + pr_err("%s: error: sem_destroy: %s\n", __func__, strerror(errno)); + goto out; + } + + free(argv0); + return ret; +} diff --git a/test/uti/posix_aio/002.sh b/test/uti/posix_aio/002.sh new file mode 100755 index 00000000..6e09a7ab --- /dev/null +++ b/test/uti/posix_aio/002.sh @@ -0,0 +1,308 @@ +#!/usr/bin/bash + +test_dir=`pwd -P` +mck_dir=${HOME}/project/os/install +uti_dir_lin=${HOME}/project/uti/install_linux 
+uti_dir_mck=${HOME}/project/uti/install_mckernel + +exe=`basename $0 | sed 's/\.sh//'` + +stop=0 +reboot=0 +go=0 + +interactive=0 +pjsub=0 +gdb=0 +disable_syscall_intercept=0 +mck=0 +nnodes=2 +host_type=wallaby +LASTNODE=15 +use_hfi=0 +omp_num_threads=4 +ppn=4 +aio_num_threads=1 + +while getopts srgc:ml:N:P:o:hGI:ipL: OPT +do + case ${OPT} in + s) stop=1 + ;; + r) reboot=1 + ;; + g) go=1 + ;; + m) mck=1 + ;; + N) nnodes=$OPTARG + ;; + P) ppn=$OPTARG + ;; + o) omp_num_threads=$OPTARG + ;; + h) use_hfi=1 + ;; + G) gdb=1 + ;; + I) disable_syscall_intercept=$OPTARG + ;; + i) interactive=1 + ;; + p) pjsub=1 + ;; + L) LASTNODE=$OPTARG + ;; + *) echo "invalid option -${OPT}" >&2 + exit 1 + esac +done + +case $host_type in + wallaby) hnprefix=wallaby + ;; + ofp) hnprefix=c + ;; + *) echo "invalid host_type $host_type" + exit 1 +esac + +nprocs=$((ppn * nnodes)) +nodes="$hnprefix`echo $(seq -s ",$hnprefix" $(($LASTNODE + 1 - $nnodes)) $LASTNODE)`" + +case $host_type in + wallaby) + uti_cpu_set_lin=0,16,8,24 + exclude_list=0,16,8,24 + uti_cpu_set_mck=0,16,8,24 + ;; + ofp) + # vertical cut, excluding phys loaded with Linux tasks + uti_cpu_set_lin=1,69,137,205,18-19,86-87,154-155,222-223 + exclude_list=0-1,68-69,136-137,204-205,18-19,86-87,154-155,222-223 + #64-67,132-135,200-203,268-271 + + uti_cpu_set_mck=1,69,137,205,18-19,86-87,154-155,222-223 + + # horizontal cut, excluding phys loaded with Linux tasks for mckernel + #uti_cpu_set_lin=204-271 + #uti_cpu_set_mck=1-67 + ;; + *) echo "invalid host_type $host_type" + exit 1 +esac + +if [ $mck -eq 0 ]; then + uti_cpu_set_str="export UTI_CPU_SET=$uti_cpu_set_lin" + i_mpi_pin_processor_exclude_list="export I_MPI_PIN_PROCESSOR_EXCLUDE_LIST=$exclude_list" +else + uti_cpu_set_str="export UTI_CPU_SET=$uti_cpu_set_mck" + i_mpi_pin_processor_exclude_list= +fi + +if [ ${mck} -eq 1 ]; then + i_mpi_pin=off + i_mpi_pin_domain= + i_mpi_pin_order= +# if [ $omp_num_threads -eq 1 ]; then +# # Avoid binding main thread and uti thread to one CPU 
+ kmp_affinity="export KMP_AFFINITY=disabled" +# else +# # Bind rank to OMP_NUM_THREAD-sized CPU-domain +# kmp_affinity="export KMP_AFFINITY=granularity=thread,scatter" +# fi +else + i_mpi_pin=on + domain=$omp_num_threads # Use 32 when you want to match mck's -n division + i_mpi_pin_domain="export I_MPI_PIN_DOMAIN=$domain" + i_mpi_pin_order="export I_MPI_PIN_ORDER=compact" + kmp_affinity="export KMP_AFFINITY=granularity=thread,scatter" +fi + +echo nprocs=$nprocs nnodes=$nnodes ppn=$ppn nodes=$nodes domain=$domain + +if [ ${mck} -eq 1 ]; then + makeopt="UTI_DIR=$uti_dir_mck" + use_mck="#PJM -x MCK=$mck_dir" + mck_mem="#PJM -x MCK_MEM=32G@0,8G@1" + mcexec="${mck_dir}/bin/mcexec" + nmcexecthr=$((omp_num_threads + 1 + aio_num_threads * 2 + 2)) + mcexecopt="-n $ppn -t $nmcexecthr" # --uti-use-last-cpu + + if [ ${use_hfi} -eq 1 ]; then + mcexecopt="--enable-hfi1 $mcexecopt" + fi + + if [ $disable_syscall_intercept -eq 0 ]; then + mcexecopt="--enable-uti $mcexecopt" + fi + +else + offline=`PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes lscpu 2>&1 | dshbak -c | grep Off-line` + if [ "$offline" != "" ]; then + echo "Error: Some CPUs are offline: $offline" + exit + fi + + makeopt="UTI_DIR=$uti_dir_lin" + use_mck= + mck_mem= + mcexec= + mcexecopt= +fi + +if [ $gdb -eq 1 ]; then + enable_x="-enable-x" + gdbcmd="xterm -display localhost:11 -hold -e gdb -ex run --args" +fi + +if [ $interactive -eq 1 ]; then + i_mpi_hydra_bootstrap_exec= + i_mpi_hydra_bootstrap= + hosts= + ssh= +else +# PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes bash -c \'if \[ \"\`cat /etc/mtab \| while read line\; do cut -d\" \" -f 2\; done \| grep /work\`\" == \"\" \]\; then sudo mount /work\; fi\' + i_mpi_hydra_bootstrap_exec="export I_MPI_HYDRA_BOOTSTRAP_EXEC=/usr/bin/ssh" + i_mpi_hydra_bootstrap="export I_MPI_HYDRA_BOOTSTRAP=ssh" + hosts="-hosts $nodes" + ssh="ssh -A $(echo $nodes | cut -d',' -f1)" +fi + +case $host_type in + wallaby) + i_mpi_fabrics="export I_MPI_FABRICS=shm:dapl" + 
i_mpi_tmi_provider= + + opt_dir=/opt/intel + impiver=2018.3.222 # 1.163, 2.199, 3.222 + ;; + ofp) + i_mpi_fabrics="export I_MPI_FABRICS=shm:tmi" + i_mpi_tmi_provider="export I_MPI_TMI_PROVIDER=psm2" + + if [ $interactive -eq 1 ]; then + opt_dir=/opt/intel + else + opt_dir=/home/opt/local/cores/intel + fi + impiver=2018.1.163 # 1.163, 2.199, 3.222 + ;; + *) echo "invalid host_type $host_type" + exit 1 +esac + +# If using ssh +if [ $pjsub -eq 0 ] && [ $interactive -eq 0 ]; then + compilervars=". ${opt_dir}/compilers_and_libraries_${impiver}/linux/bin/compilervars.sh intel64" +else + compilervars= +fi + +if [ ${stop} -eq 1 ]; then + if [ ${mck} -eq 1 ]; then + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \ + /usr/sbin/pidof mcexec \| xargs -r sudo kill -9 + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \ + /usr/sbin/pidof $exe \| xargs -r sudo kill -9 + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \ + sudo ${mck_dir}/sbin/mcstop+release.sh + else + : + fi +fi + +if [ ${reboot} -eq 1 ]; then + if [ ${mck} -eq 1 ]; then + case $host_type in + wallaby) hnprefix=wallaby + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes sudo ${mck_dir}/sbin/mcreboot.sh -h -O -c 1-7,17-23,9-15,25-31 -r 1-7:0+17-23:16+9-15:8+25-31:24 -m 10G@0,10G@1 + #PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes sudo ${mck_dir}/sbin/mcreboot.sh -h -O -c 1-4 -r 1-4:0 -m 10G@0,10G@1 + ;; + ofp) + # -h: Prevent unnessary CPU resource division for KNL + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \ + sudo ${mck_dir}/sbin/mcreboot.sh -h -O -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 
2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1 + ;; + *) echo "invalid host_type $host_type" + exit 1 + esac + else + : + fi +fi + +( +cat < ./job.sh +chmod u+x ./job.sh + +if [ ${go} -eq 1 ]; then + if [ $pjsub -eq 1 ]; then + pjsub ./job.sh + else + if [ $interactive -eq 0 ]; then + eval $compilervars + fi + make $makeopt ./$exe + PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \ + /usr/sbin/pidof $exe \| xargs -r sudo kill -9 + $ssh ${test_dir}/job.sh + fi +fi diff --git a/test/uti/posix_aio/Makefile b/test/uti/posix_aio/Makefile new file mode 100755 index 00000000..4f027e77 --- /dev/null +++ b/test/uti/posix_aio/Makefile @@ -0,0 +1,51 @@ +.SUFFIXES: # Clear suffixes +.ONESHELL: # Pack all the lines and pass it to shell + +VMTOUCH=$(HOME)/project/src/vmtouch/install/bin/vmtouch + +# Specify it via *.sh +UTI_DIR=${HOME}/project/uti/install_linux + +CC=mpiicc +LD=$(CC) + +CFLAGS = -g -O0 -Wall -DVMTOUCH=$(VMTOUCH) +LDFLAGS = -lpthread -L$(UTI_DIR)/lib -Wl,-rpath -Wl,$(UTI_DIR)/lib -luti -lrt +SRCS = $(shell ls 0*.c) +OBJS = $(SRCS:.c=.o) util.o +EXES = $(SRCS:.c=) + +define create_files = + for i in {1..2}; do + dd if=/dev/zero of=./data/$i bs=1M count=1 + done +endef + +all: $(EXES) + +file:: + $(value create_files) + +util.o:: util.c util.h + $(CC) $(CFLAGS) -qopenmp -c $< + +001: 001.o util.o + $(LD) -o $@ $^ $(LDFLAGS) -qopenmp + +001.o:: 001.c + $(CC) $(CFLAGS) -qopenmp -c $< + +002: 002.o util.o + $(LD) -o $@ $^ $(LDFLAGS) -qopenmp + +002.o:: 002.c + $(CC) $(CFLAGS) -qopenmp -c $< 
+ +%: %.o + $(LD) -o $@ $^ $(LDFLAGS) + +%.o::%.c + $(CC) $(CFLAGS) -c $< + +clean: + rm -f core $(EXES) $(OBJS) $(DSRCS) diff --git a/test/uti/posix_aio/README b/test/uti/posix_aio/README new file mode 100644 index 00000000..097b1b40 --- /dev/null +++ b/test/uti/posix_aio/README @@ -0,0 +1,15 @@ +============================================= +Benchmarks of asynchronous I/O with busy CPUs +============================================= + +The purpose is to show the benefit of spawning the asynchronous threads onto dedicated CPUs. + +--- +001 +--- +Write + +--- +002 +--- +Write, IO completion is notified by spawning thread diff --git a/test/uti/posix_aio/util.c b/test/uti/posix_aio/util.c new file mode 100644 index 00000000..673639ab --- /dev/null +++ b/test/uti/posix_aio/util.c @@ -0,0 +1,133 @@ +#define _GNU_SOURCE /* See feature_test_macros(7) */ +#include +#include +#include +#include +#include /* For SYS_xxx definitions */ +#include +#include +#include +#include +#include +#include +#include +#include "util.h" + +/* Messaging */ +enum test_loglevel test_loglevel = TEST_LOGLEVEL_DEBUG; + +/* Calculation: run the fixed-size inner assembly loop n times */ +static inline void asmloop(unsigned long n) { + unsigned long j; /* same type as n: an int counter overflows once n exceeds INT_MAX */ + + for (j = 0; j < n; j++) { + asm volatile( + "movq $0, %%rcx\n\t" + "1:\t" + "addq $1, %%rcx\n\t" + "cmpq $99, %%rcx\n\t" + "jle 1b\n\t" + : + : + : "rcx", "cc"); + } +} + +#define N_INIT 10000000 +double nspw; /* nsec per work */ + +void ndelay_init(int verbose) { + struct timeval start, end; + int rank, nproc; + double min, sum, max; + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nproc); + + //clock_gettime(TIMER_KIND, &start); + gettimeofday(&start, NULL); + +#pragma omp parallel + { + asmloop(N_INIT); + } + + //clock_gettime(TIMER_KIND, &end); + gettimeofday(&end, NULL); + + nspw = DIFFUSEC(end, start) * 1000 / (double)N_INIT; + + if (verbose) { + MPI_Reduce(&nspw, &min, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD); + MPI_Reduce(&nspw, &sum, 1, MPI_DOUBLE, MPI_SUM, 0, 
MPI_COMM_WORLD); + MPI_Reduce(&nspw, &max, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + if (rank == 0) { + pr_debug("nspw: min=%.0f, ave=%.0f, max=%.0f\n", min, sum / nproc, max); + } + } +} + +#if 1 +void ndelay(long delay_nsec) { + if (delay_nsec < 0) { + printf("delay_nsec < 0\n"); + return; + } +#pragma omp parallel + { + asmloop(delay_nsec / nspw); + } +} +#else /* For machines with large core-to-core performance variation (e.g. OFP) */ +void ndelay(long delay_nsec) { + struct timespec start, end; + + if (delay_nsec < 0) { return; } + clock_gettime(TIMER_KIND, &start); + + while (1) { + clock_gettime(TIMER_KIND, &end); + if (DIFFNSEC(end, start) >= delay_nsec) { + break; + } + asmloop(2); /* ~150 ns per iteration on FOP */ + } +} +#endif + + +double cycpw; /* cyc per work */ + +void cdlay_init() { + unsigned long start, end; + + start = rdtsc_light(); +#define N_INIT 10000000 + asmloop(N_INIT); + end = rdtsc_light(); + cycpw = (end - start) / (double)N_INIT; +} + +#if 0 +void cdelay(long delay_cyc) { + if (delay_cyc < 0) { + return; + } + asmloop(delay_cyc / cycpw); +} +#else /* For machines with large core-to-core performance variation (e.g. OFP) */ +void cdelay(long delay_cyc) { + unsigned long start, end; + + if (delay_cyc < 0) { return; } + start = rdtsc_light(); + + while (1) { + end = rdtsc_light(); + if (end - start >= delay_cyc) { + break; + } + asmloop(2); + } +} +#endif diff --git a/test/uti/posix_aio/util.h b/test/uti/posix_aio/util.h new file mode 100644 index 00000000..48b53fcd --- /dev/null +++ b/test/uti/posix_aio/util.h @@ -0,0 +1,70 @@ +#ifndef __UTIL_H_INCLUDED__ +#define __UTIL_H_INCLUDED__ + +#include + +/* Messaging */ + +enum test_loglevel { + TEST_LOGLEVEL_ERR = 0, + TEST_LOGLEVEL_WARN, + TEST_LOGLEVEL_DEBUG +}; + +extern enum test_loglevel test_loglevel; +static inline void test_set_loglevel(enum test_loglevel level) +{ + test_loglevel = level; +} + +#define pr_level(level, fmt, args...) 
do { \ + if (test_loglevel >= level) { \ + fprintf(stdout, fmt, ##args); \ + } \ +} while (0) + +#define pr_err(fmt, args...) pr_level(TEST_LOGLEVEL_ERR, fmt, ##args) +#define pr_warn(fmt, args...) pr_level(TEST_LOGLEVEL_WARN, fmt, ##args) +#define pr_debug(fmt, args...) pr_level(TEST_LOGLEVEL_DEBUG, fmt, ##args) + +#define _OKNG(verb, jump, cond, fmt, args...) do { \ + if (cond) { \ + if (verb) \ + printf("[ OK ] " fmt, ##args); \ + } else { \ + printf("[ NG ] " fmt, ##args); \ + if (jump) { \ + ret = -1; \ + goto out; \ + } \ + } \ +} while (0) + +#define OKNG(args...) _OKNG(1, 1, ##args) +#define NG(args...) _OKNG(0, 1, ##args) +#define OKNGNOJUMP(args...) _OKNG(1, 0, ##args) + +/* Time */ +inline uint64_t rdtsc_light(void) +{ + uint64_t x; + __asm__ __volatile__("rdtscp;" /* rdtscp don't jump over earlier instructions */ + "shl $32, %%rdx;" + "or %%rdx, %%rax" : + "=a"(x) : + : + "%rcx", "%rdx", "memory"); + return x; +} + +#define DIFFUSEC(end, start) ((end.tv_sec - start.tv_sec) * 1000000UL + (end.tv_usec - start.tv_usec)) +#define DIFFNSEC(end, start) ((end.tv_sec - start.tv_sec) * 1000000000UL + (end.tv_nsec - start.tv_nsec)) +#define TIMER_KIND CLOCK_MONOTONIC_RAW /* CLOCK_THREAD_CPUTIME_ID */ + +/* Calculation emulation */ +void ndelay_init(); +void ndelay(long delay_nsec); +void cdelay_init(); +void cdelay(long delay_cyc); + +#endif diff --git a/test/uti/preloadlib.c b/test/uti/preloadlib.c new file mode 100644 index 00000000..7b8ba350 --- /dev/null +++ b/test/uti/preloadlib.c @@ -0,0 +1,40 @@ +#include +#include +#include +#define __USE_GNU +#include + +static int +hook(long syscall_number, + long arg0, long arg1, + long arg2, long arg3, + long arg4, long arg5, + long *result) +{ + if (syscall_number == SYS_getdents) { + /* + * Prevent the application from + * using the getdents syscall. From + * the point of view of the calling + * process, it is as if the kernel + * would return the ENOTSUP error + * code from the syscall. 
+ */ + *result = -ENOTSUP; + return 0; + } else { + /* + * Ignore any other syscalls + * i.e.: pass them on to the kernel + * as would normally happen. + */ + return 1; + } +} + +static __attribute__((constructor)) void +init(void) +{ + // Set up the callback function + intercept_hook_point = hook; +} diff --git a/test/uti/psm2/Makefile b/test/uti/psm2/Makefile new file mode 100755 index 00000000..4fcb442c --- /dev/null +++ b/test/uti/psm2/Makefile @@ -0,0 +1,27 @@ +.SUFFIXES: # Clear suffixes + +CC=gcc + +LD=$(CC) + +CFLAGS = -g -O2 +LDFLAGS = -lpthread -lpsm2 +SRCS = $(shell ls *.c) +OBJS = $(SRCS:.c=.o) +EXES = $(SRCS:.c=) +TMPFILES = $(shell ls psm2-demo-*) + +all: $(EXES) file + +file::$(TMPFILES) + rm -f $(TMPFILES) + +%: %.o + $(LD) -o $@ $^ $(LDFLAGS) + +%.o::%.c + $(CC) $(CFLAGS) -c $< + +clean: + rm -f core $(EXES) $(OBJS) $(DSRCS) + diff --git a/test/uti/psm2/psm2-demo.c b/test/uti/psm2/psm2-demo.c new file mode 100644 index 00000000..955d76bd --- /dev/null +++ b/test/uti/psm2/psm2-demo.c @@ -0,0 +1,212 @@ +/* + * PSM2 example program. + * Start two instances of this program from the same working directory. + * These processes can execute on the same host, or on two hosts connected + * with OPA. + * Compile with: gcc psm2-demo.c -o psm2-demo -lpsm2 + * Run as: ./psm2-demo -s # this is the server process + * and: ./psm2-demo # this is the client process + * Copyright(c) 2015 Intel Corporation. + * */ +#include +#include /* required for core PSM2 functions */ +#include /* required for PSM2 MQ functions (send, recv, etc) */ +#include +#include +#include +#include +#include + +#define BUFFER_LENGTH 8000000 +#define CONNECT_ARRAY_SIZE 8 +void die(char *msg, int rc) { + fprintf(stderr, "%s: %d\n", msg, rc); + exit(1); +} + +/* Helper functions to find the server's PSM2 endpoint identifier (epid). 
*/ +psm2_epid_t find_server() { + FILE *fp = NULL; + psm2_epid_t server_epid = 0; + printf("PSM2 client waiting for epid mapping file to appear...\n"); + while (!fp) { + sleep(1); + fp = fopen("psm2-demo-server-epid", "r"); + } + if (fscanf(fp, "%lx", &server_epid) != 1) { die("couldn't parse server epid from mapping file", errno); } /* the file may exist before the server has written the epid */ + fclose(fp); + printf("PSM2 client found server epid = 0x%lx\n", server_epid); + return server_epid; +} + +void write_epid_to_file(psm2_epid_t myepid) { + FILE *fp; + fp = fopen("psm2-demo-server-epid", "w"); + if (!fp) { + fprintf(stderr, + "Exiting, couldn't write server's epid mapping file: "); + die(strerror(errno), errno); + } + fprintf(fp, "0x%lx", myepid); + fclose(fp); + printf("PSM2 server wrote epid = 0x%lx to file.\n", myepid); + return; +} + +int main(int argc, char **argv) { + struct psm2_ep_open_opts o; + psm2_uuid_t uuid; + psm2_ep_t myep; + psm2_epid_t myepid; + psm2_epid_t server_epid; + psm2_epid_t epid_array[CONNECT_ARRAY_SIZE]; + int epid_array_mask[CONNECT_ARRAY_SIZE]; + psm2_error_t epid_connect_errors[CONNECT_ARRAY_SIZE]; + psm2_epaddr_t epaddr_array[CONNECT_ARRAY_SIZE]; + int rc; + int ver_major = PSM2_VERNO_MAJOR; + int ver_minor = PSM2_VERNO_MINOR; + static char msgbuf[BUFFER_LENGTH]; /* static: BUFFER_LENGTH bytes is too large to place safely on the stack */ + psm2_mq_t q; + psm2_mq_req_t req_mq; + int is_server = 0; + if (argc > 2) { + die("To run in server mode, invoke as ./psm2-demo -s\n" \ + "or run in client mode, invoke as ./psm2-demo\n" \ + "Wrong number of args", argc); + } + is_server = argc - 1; /* Assume any command line argument is -s */ + memset(uuid, 0, sizeof(psm2_uuid_t)); /* Use a UUID of zero */ +/* Try to initialize PSM2 with the requested library version. + * * In this example, given the use of the PSM2_VERNO_MAJOR and MINOR + * * as defined in the PSM2 headers, ensure that we are linking with + * * the same version of PSM2 as we compiled against. 
*/ + + if ((rc = psm2_init(&ver_major, &ver_minor)) != PSM2_OK) { + die("couldn't init", rc); + } + printf("PSM2 init done.\n"); + /* Setup the endpoint options struct */ + if ((rc = psm2_ep_open_opts_get_defaults(&o)) != PSM2_OK) { + die("couldn't set default opts", rc); + } + printf("PSM2 opts_get_defaults done.\n"); + /* Attempt to open a PSM2 endpoint. This allocates hardware resources. */ + if ((rc = psm2_ep_open(uuid, &o, &myep, &myepid)) != PSM2_OK) { + die("couldn't psm2_ep_open()", rc); + } + printf("PSM2 endpoint open done.\n"); + if (is_server) { + write_epid_to_file(myepid); + } else { + server_epid = find_server(); + } + if (is_server) { + /* Server does nothing here. A connection does not have to be + * * established to receive messages. */ + printf("PSM2 server up.\n"); + } else { + /* Setup connection request info */ + /* PSM2 can connect to a single epid per request, + * * or an arbitrary number of epids in a single connect call. + * * For this example, use part of an array of + * * connection requests. */ + memset(epid_array_mask, 0, sizeof(int) * CONNECT_ARRAY_SIZE); + epid_array[0] = server_epid; + epid_array_mask[0] = 1; + /* Begin the connection process. + * * note that if a requested epid is not responding, + * * the connect call will still return OK. + * * The errors array will contain the state of individual + * * connection requests. 
*/ + if ((rc = psm2_ep_connect(myep, + CONNECT_ARRAY_SIZE, + epid_array, + epid_array_mask, + epid_connect_errors, + epaddr_array, + 0 /* no timeout */ + )) != PSM2_OK) { + die("couldn't ep_connect", rc); + } + printf("PSM2 connect request processed.\n"); + /* Now check if our connection to the server is ready */ + if (epid_connect_errors[0] != PSM2_OK) { + die("couldn't connect to server", + epid_connect_errors[0]); + } + printf("PSM2 client-server connection established.\n"); + } + /* Setup our PSM2 message queue */ + if ((rc = psm2_mq_init(myep, PSM2_MQ_ORDERMASK_NONE, NULL, 0, &q)) + != PSM2_OK) { + die("couldn't initialize PSM2 MQ", rc); + } + printf("PSM2 MQ init done.\n"); + if (is_server) { + psm2_mq_tag_t t = {0xABCD}; + psm2_mq_tag_t tm = {-1}; + /* Post the receive request */ + if ((rc = psm2_mq_irecv2(q, PSM2_MQ_ANY_ADDR, + &t, /* message tag */ + &tm, /* message tag mask */ + 0, /* no flags */ + msgbuf, BUFFER_LENGTH, + NULL, /* no context to add */ + &req_mq /* track irecv status */ + )) != PSM2_OK) { + die("couldn't post psm2_mq_irecv()", rc); + } + printf("PSM2 MQ irecv() posted\n"); + /* Wait until the message arrives */ + if ((rc = psm2_mq_wait(&req_mq, NULL)) != PSM2_OK) { + die("couldn't wait for the irecv", rc); + } + printf("PSM2 MQ wait() done.\n"); + printf("Message from client:\n"); + printf("%s", msgbuf); + unlink("psm2-demo-server-epid"); + } else { + /* Say hello */ + snprintf(msgbuf, BUFFER_LENGTH, + "Hello world from epid=0x%lx, pid=%d.\n", + myepid, getpid()); + psm2_mq_tag_t t = {0xABCD}; + if ((rc = psm2_mq_send2(q, + epaddr_array[0], /* destination epaddr */ + PSM2_MQ_FLAG_SENDSYNC, /* no flags */ + &t, /* tag */ + msgbuf, BUFFER_LENGTH + )) != PSM2_OK) { + die("couldn't post psm2_mq_isend", rc); + } + printf("PSM2 MQ send() done.\n"); + } +/* Close down the MQ */ + if ((rc = psm2_mq_finalize(q)) != PSM2_OK) { + die("couldn't psm2_mq_finalize()", rc); + } + printf("PSM2 MQ finalized.\n"); +/* Close our ep, releasing all hardware 
resources. + * * Try to close all connections properly */ + if ((rc = psm2_ep_close(myep, PSM2_EP_CLOSE_GRACEFUL, + 0 /* no timeout */)) != PSM2_OK) { + die("couldn't psm2_ep_close()", rc); + } + printf("PSM2 ep closed.\n"); + /* Release all local PSM2 resources */ + if ((rc = psm2_finalize()) != PSM2_OK) { + die("couldn't psm2_finalize()", rc); + } + printf("PSM2 shut down, exiting.\n"); + return 0; +} + + + + + + + + + diff --git a/test/uti/util.c b/test/uti/util.c new file mode 100644 index 00000000..7e1965d5 --- /dev/null +++ b/test/uti/util.c @@ -0,0 +1,130 @@ +#define _GNU_SOURCE /* See feature_test_macros(7) */ +#include +#include /* For SYS_xxx definitions */ +#include +#include +#include +#include +#include +#include +#include +#include "util.h" + +static inline void fixed_size_work() { + asm volatile( + "movq $0, %%rcx\n\t" + "1:\t" + "addq $1, %%rcx\n\t" + "cmpq $99, %%rcx\n\t" + "jle 1b\n\t" + : + : + : "rcx", "cc"); +} + +static inline void bulk_fsw(unsigned long n) { + int j; + for (j = 0; j < (n); j++) { + fixed_size_work(); + } +} + +double nspw; /* nsec per work */ +unsigned long nsec; + +void fwq_init() { + struct timespec start, end; + int i; + clock_gettime(TIMER_KIND, &start); +#define N_INIT 10000000 + bulk_fsw(N_INIT); + clock_gettime(TIMER_KIND, &end); + nsec = DIFFNSEC(end, start); + nspw = nsec / (double)N_INIT; +} + +#if 1 +void fwq(long delay_nsec) { + if (delay_nsec < 0) { + return; + } + bulk_fsw(delay_nsec / nspw); +} +#else /* For machines with large core-to-core performance variation (e.g. 
OFP) */ +void fwq(long delay_nsec) { + struct timespec start, end; + + if (delay_nsec < 0) { return; } + clock_gettime(TIMER_KIND, &start); + + while (1) { + clock_gettime(TIMER_KIND, &end); + if (DIFFNSEC(end, start) >= delay_nsec) { + break; + } + bulk_fsw(2); /* ~150 ns per iteration on OFP */ + } +} +#endif + +/* Print the 39th space-separated field of /proc/<pid>/task/<tid>/stat and sched_getcpu() for the calling thread; returns 0 on success, -1 on failure. */ +int print_cpu_last_executed_on(const char *name) { + char fn[256]; + char* result = NULL; /* keeps free() at fn_exit safe when open() fails before the allocation */ + pid_t tid = syscall(SYS_gettid); + int fd; + int offset; + int mpi_errno = 0; + + snprintf(fn, sizeof(fn), "/proc/%d/task/%d/stat", getpid(), (int)tid); + fd = open(fn, O_RDONLY); + if(fd == -1) { + printf("open() failed\n"); + goto fn_fail; + } + + result = malloc(65536); + if(result == NULL) { + printf("malloc() failed"); + goto fn_fail; + } + + int amount = 0; + offset = 0; + while(1) { + amount = read(fd, result + offset, 65535 - offset); /* stay inside the buffer and keep one byte for the terminator */ + if(amount == -1) { + printf("read() failed"); + goto fn_fail; + } + if(amount == 0) { + goto eof; + } + offset += amount; + } + eof: + result[offset] = '\0'; /* strsep() below needs a NUL-terminated string */ + + char* next_delim = result; + char* field = NULL; + int i; + for(i = 0; i < 39; i++) { + field = strsep(&next_delim, " "); + } + + int cpu = sched_getcpu(); + if(cpu == -1) { + printf("getcpu() failed\n"); + goto fn_fail; + } + + printf("[INFO] %s (tid: %d) is running on %02d,%02d\n", name, (int)tid, field ? atoi(field) : -1, cpu); /* field is NULL when the stat line has fewer than 39 fields */ + fn_exit: + if (fd != -1) { close(fd); } + free(result); + return mpi_errno; + fn_fail: + mpi_errno = -1; + goto fn_exit; +} + diff --git a/test/uti/util.h b/test/uti/util.h new file mode 100644 index 00000000..396f5183 --- /dev/null +++ b/test/uti/util.h @@ -0,0 +1,70 @@ +#ifndef __UTIL_H_INCLUDED__ +#define __UTIL_H_INCLUDED__ + +#include + +#define DEBUG + +#ifdef DEBUG +#define dprintf(...) do { \ + char msg[1024]; \ + snprintf(msg, sizeof(msg), __VA_ARGS__); \ + fprintf(stderr, "%s,%s", __func__, msg); \ +} while (0) +#else +#define dprintf(...) do { } while (0) +#endif + +#define eprintf(...) 
do { \ + char msg[1024]; \ + sprintf(msg, __VA_ARGS__); \ + fprintf(stderr, "%s,%s", __func__, msg); \ +} while (0) + +#define CHKANDJUMP(cond, err, ...) do { \ + if (cond) { \ + eprintf(__VA_ARGS__); \ + ret = err; \ + goto fn_fail; \ + } \ +} while (0) + +#define _OKNG(verb, jump, cond, fmt, args...) do { \ + if (cond) { \ + if (verb) \ + printf("[ OK ] " fmt, ##args); \ + } else { \ + printf("[ NG ] " fmt, ##args); \ + if (jump) \ + goto fn_fail; \ + } \ +} while (0) + +#define OKNG(args...) _OKNG(1, 1, ##args) +#define NG(args...) _OKNG(0, 1, ##args) +#define OKNGNOJUMP(args...) _OKNG(1, 0, ##args) + +#define DIFFNSEC(end, start) ((end.tv_sec - start.tv_sec) * 1000000000UL + (end.tv_nsec - start.tv_nsec)) +#define TIMER_KIND CLOCK_MONOTONIC_RAW /* CLOCK_THREAD_CPUTIME_ID */ + +static inline uint64_t rdtsc_light(void ) +{ + uint64_t x; + __asm__ __volatile__("rdtscp;" /* rdtscp works as instruction execution barrier */ + "shl $32, %%rdx;" + "or %%rdx, %%rax" : + "=a"(x) : + : + "%rcx", "%rdx", "memory"); + return x; +} + +extern double nspw; /* nsec per work */ +extern unsigned long nsec; + +void fwq_init(); +void fwq(long delay_nsec); +int print_cpu_last_executed_on(const char *name); + +#endif +